Add bbduk complexity (entropy-based) filtering

2024-11-25 04:09:55 +00:00 · 2022-04-02 17:02:05 +02:00 · 2022-04-02 17:02:05 +02:00 · b055df5ea0
commit b055df5ea0
parent 1dfbcacf68
13 changed files with 222 additions and 45 deletions
--- a/CITATIONS.md
+++ b/CITATIONS.md
@ -26,6 +26,8 @@

 - [Porechop](https://github.com/rrwick/Porechop)

+- [BBTools](http://sourceforge.net/projects/bbmap/)
+
 - [Kraken2](https://doi.org/10.1186/s13059-019-1891-0)

  > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. “Improved Metagenomic Analysis with Kraken 2.” Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0.
--- a/conf/modules.config
+++ b/conf/modules.config
@ -132,7 +132,6 @@ process {
        ]
    }

-
    withName: PORECHOP {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
@ -142,11 +141,17 @@ process {
        ]
    }

-    withName: CAT_FASTQ {
+    withName: BBMAP_BBDUK {
+        ext.args =  [
+                "entropy=${params.shortread_complexityfilter_bbduk_entropy}",
+                "entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}",
+                params.shortread_complexityfilter_bbduk_mask ?  "entropymask=t" : "entropymask=f"
+            ].join(' ').trim()
+        ext.prefix = { "${meta.id}-${meta.run_accession}" }
        publishDir = [
-            path: { "${params.outdir}/prepared_sequences" },
-            mode: 'copy',
-            pattern: '*.fastq.gz'
+            path: { "${params.outdir}/bbduk/" },
+            mode: params.publish_dir_mode,
+            pattern: '*.{fastq.gz,log}'
        ]
    }

--- a/modules.json
+++ b/modules.json
@ -6,6 +6,9 @@
            "adapterremoval": {
                "git_sha": "f0800157544a82ae222931764483331a81812012"
            },
+            "bbmap/bbduk": {
+                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
+            },
            "cat/fastq": {
                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
            },
--- a/modules/nf-core/modules/bbmap/bbduk/main.nf
+++ b/modules/nf-core/modules/bbmap/bbduk/main.nf
@ -0,0 +1,43 @@
+process BBMAP_BBDUK {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda (params.enable_conda ? "bioconda::bbmap=38.90" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bbmap:38.90--he522d1c_1' :
+        'quay.io/biocontainers/bbmap:38.90--he522d1c_1' }"
+
+    input:
+    tuple val(meta), path(reads)
+    path contaminants
+
+    output:
+    tuple val(meta), path('*.fastq.gz'), emit: reads
+    tuple val(meta), path('*.log')     , emit: log
+    path "versions.yml"                , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def raw      = meta.single_end ? "in=${reads[0]}" : "in1=${reads[0]} in2=${reads[1]}"
+    def trimmed  = meta.single_end ? "out=${prefix}.fastq.gz" : "out1=${prefix}_1.fastq.gz out2=${prefix}_2.fastq.gz"
+    def contaminants_fa = contaminants ? "ref=$contaminants" : ''
+    """
+    maxmem=\$(echo \"$task.memory\"| sed 's/ GB/g/g')
+    bbduk.sh \\
+        -Xmx\$maxmem \\
+        $raw \\
+        $trimmed \\
+        threads=$task.cpus \\
+        $args \\
+        $contaminants_fa \\
+        &> ${prefix}.bbduk.log
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bbmap: \$(bbversion.sh)
+    END_VERSIONS
+    """
+}
--- a/modules/nf-core/modules/bbmap/bbduk/meta.yml
+++ b/modules/nf-core/modules/bbmap/bbduk/meta.yml
@ -0,0 +1,52 @@
+name: bbmap_bbduk
+description: Adapter and quality trimming of sequencing reads
+keywords:
+  - trimming
+  - adapter trimming
+  - quality trimming
+tools:
+  - bbmap:
+      description: BBMap is a short read aligner, as well as various other bioinformatic tools.
+      homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/
+      documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/
+      tool_dev_url: None
+      doi: ""
+      licence: ["UC-LBL license (see package)"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+        respectively.
+  - contaminants:
+      type: file
+      description: |
+        Reference files containing adapter and/or contaminant sequences for sequence kmer matching
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: The trimmed/modified fastq reads
+      pattern: "*fastq.gz"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - log:
+      type: file
+      description: Bbduk log file
+      pattern: "*bbduk.log"
+
+authors:
+  - "@MGordon09"
--- a/nextflow.config
+++ b/nextflow.config
@ -65,6 +65,13 @@ params {
    shortread_clipmerge_minlength           = 15
    longread_clip                           = false

+    // Complexity filtering
+    shortread_complexityfilter                  = false
+    shortread_complexityfilter_tool             = 'bbduk'
+    shortread_complexityfilter_bbduk_entropy    = 0.3
+    shortread_complexityfilter_bbduk_windowsize = 50
+    shortread_complexityfilter_bbduk_mask       = false
+
    // MALT
    run_malt                   = false
    malt_mode                  = 'BlastN'
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -266,8 +266,7 @@
            "type": "boolean"
        },
        "shortread_clipmerge_excludeunmerged": {
-            "type": "boolean",
-            "default": false
+            "type": "boolean"
        },
        "longread_clip": {
            "type": "boolean"
@ -304,6 +303,24 @@
        "shortread_clipmerge_minlength": {
            "type": "integer",
            "default": 15
+        },
+        "shortread_complexityfilter_tool": {
+            "type": "string",
+            "default": "bbduk"
+        },
+        "shortread_complexityfilter_bbduk_entropy": {
+            "type": "number",
+            "default": 0.3
+        },
+        "shortread_complexityfilter_bbduk_windowsize": {
+            "type": "integer",
+            "default": 50
+        },
+        "shortread_complexityfilter_bbduk_mask": {
+            "type": "boolean"
+        },
+        "shortread_complexityfilter": {
+            "type": "boolean"
        }
    }
 }
--- a/subworkflows/local/longread_preprocessing.nf
+++ b/subworkflows/local/longread_preprocessing.nf
@ -1,6 +1,6 @@
-/*
-Process long raw reads with porechop
-*/
+//
+// Process long raw reads with porechop
+//

 include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main'
 include { PORECHOP                   } from '../../modules/nf-core/modules/porechop/main'
@ -25,7 +25,7 @@ workflow LONGREAD_PREPROCESSING {

    FASTQC_PROCESSED ( PORECHOP.out.reads )
    ch_versions = ch_versions.mix(PORECHOP.out.versions.first())
-    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip )


    emit:
--- a/subworkflows/local/shortread_adapterremoval.nf
+++ b/subworkflows/local/shortread_adapterremoval.nf
@ -1,6 +1,6 @@
-/*
-Process short raw reads with AdapterRemoval
-*/
+//
+// Process short raw reads with AdapterRemoval
+//

 include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE       } from '../../modules/nf-core/modules/adapterremoval/main'
 include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED       } from '../../modules/nf-core/modules/adapterremoval/main'
--- a/subworkflows/local/shortread_complexityfiltering.nf
+++ b/subworkflows/local/shortread_complexityfiltering.nf
@ -0,0 +1,28 @@
+//
+// Perform complexity (entropy-based) filtering of short reads
+//
+
+include { BBMAP_BBDUK } from '../../modules/nf-core/modules/bbmap/bbduk/main'
+
+workflow SHORTREAD_COMPLEXITYFILTERING {
+    take:
+    reads // [ [ meta ], [ reads ] ]
+
+    main:
+    ch_versions       = Channel.empty()
+    ch_multiqc_files  = Channel.empty()
+
+    if ( params.shortread_complexityfilter_tool == 'bbduk' ) {
+        ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads
+        ch_versions        =  ch_versions.mix( BBMAP_BBDUK.out.versions.first() )
+        ch_multiqc_files   =  ch_multiqc_files.mix( BBMAP_BBDUK.out.log )
+    } else {
+        ch_filtered_reads = reads
+    }
+
+    emit:
+    reads    = ch_filtered_reads    // channel: [ val(meta), [ reads ] ]
+    versions = ch_versions          // channel: [ versions.yml ]
+    mqc      = ch_multiqc_files
+}
+
--- a/subworkflows/local/shortread_fastp.nf
+++ b/subworkflows/local/shortread_fastp.nf
@ -1,6 +1,6 @@
-/*
-Process short raw reads with FastP
-*/
+//
+// Process short raw reads with FastP
+//

 include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fastp/main'
 include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
@ -44,8 +44,8 @@ workflow SHORTREAD_FASTP {

    ch_processed_reads = ch_fastp_reads_prepped

-    ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
-    ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json )

    emit:
    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@ -1,5 +1,5 @@
 //
-// Check input samplesheet and get read channels
+// Perform read trimming and merging
 //


@ -9,7 +9,7 @@ include { FASTQC as FASTQC_PROCESSED       } from '../../modules/nf-core/modules

 workflow SHORTREAD_PREPROCESSING {
    take:
-    reads // file: /path/to/samplesheet.csv
+    reads //  [ [ meta ], [ reads ] ]

    main:
    ch_versions       = Channel.empty()
@ -29,7 +29,7 @@ workflow SHORTREAD_PREPROCESSING {

    FASTQC_PROCESSED ( ch_processed_reads )
    ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
-    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip )

    emit:
    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -43,6 +43,7 @@ include { INPUT_CHECK             } from '../subworkflows/local/input_check'
 include { DB_CHECK                      } from '../subworkflows/local/db_check'
 include { SHORTREAD_PREPROCESSING       } from '../subworkflows/local/shortread_preprocessing'
 include { LONGREAD_PREPROCESSING        } from '../subworkflows/local/longread_preprocessing'
+include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering'

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -61,7 +62,6 @@ include { CAT_FASTQ                   } from '../modules/nf-core/modules/cat/fas
 include { MALT_RUN                    } from '../modules/nf-core/modules/malt/run/main'
 include { KRAKEN2_KRAKEN2             } from '../modules/nf-core/modules/kraken2/kraken2/main'

-
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    RUN MAIN WORKFLOW
@ -98,10 +98,6 @@ workflow TAXPROFILER {

    ch_versions = ch_versions.mix(FASTQC.out.versions.first())

-    CUSTOM_DUMPSOFTWAREVERSIONS (
-        ch_versions.unique().collectFile(name: 'collated_versions.yml')
-    )
-
    /*
        SUBWORKFLOW: PERFORM PREPROCESSING
    */
@ -114,17 +110,26 @@ workflow TAXPROFILER {
    if ( params.longread_clip ) {
        ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads
                                        .map { it -> [ it[0], [it[1]] ] }
-    ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first())
    } else {
        ch_longreads_preprocessed = INPUT_CHECK.out.nanopore
    }

+    /*
+        SUBWORKFLOW: COMPLEXITY FILTERING
+    */
+
+    if ( params.shortread_complexityfilter ) {
+        ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads
+    } else {
+        ch_shortreads_filtered = ch_shortreads_preprocessed
+    }
+
    /*
        COMBINE READS WITH POSSIBLE DATABASES
    */

    // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
-    ch_input_for_profiling = ch_shortreads_preprocessed
+    ch_input_for_profiling = ch_shortreads_filtered
            .mix( ch_longreads_preprocessed )
            .combine(DB_CHECK.out.dbs)
            .branch {
@ -177,6 +182,12 @@ workflow TAXPROFILER {
    /*
        MODULE: MultiQC
    */
+
+    CUSTOM_DUMPSOFTWAREVERSIONS (
+        ch_versions.unique().collectFile(name: 'collated_versions.yml')
+    )
+
+
    workflow_summary    = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params)
    ch_workflow_summary = Channel.value(workflow_summary)

@ -188,21 +199,30 @@ workflow TAXPROFILER {
    ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))

    if (params.shortread_clipmerge) {
-        ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_PREPROCESSING.out.mqc)
-    }
-    if (params.longread_clip) {
-        ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc)
-    }
-    if (params.run_kraken2) {
-        ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]))
-        ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first())
-    }
-    if (params.run_malt) {
-        ch_multiqc_files = ch_multiqc_files.mix(MALT_RUN.out.log.collect{it[1]}.ifEmpty([]))
-        ch_versions = ch_versions.mix(MALT_RUN.out.versions.first())
+        ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_shortclipmerge")
+        ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions )
+    }
+
+    if (params.longread_clip) {
+        ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_longclipmerge")
+        ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions )
+    }
+
+    if (params.shortread_complexityfilter){
+        ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_compelxity")
+        ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions )
+    }
+
+    if (params.run_kraken2) {
+        ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_kraken")
+        ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() )
+    }
+
+    if (params.run_malt) {
+        ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_malt")
+        ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() )
    }

-    // TODO MALT results overwriting per database?
    // TODO Versions for Kraken/MALT not reported?
    MULTIQC (
        ch_multiqc_files.collect()