Merge pull request #33 from nf-core/improve-shortread-input

Improve shortread input preprocessing
2024-11-25 02:59:55 +00:00 · 2022-03-30 11:48:32 +02:00 · 2022-03-30 11:48:32 +02:00 · 323883bd3e
commit 323883bd3e
parent fafb7e0f6f 76e9624e3f
10 changed files with 173 additions and 131 deletions
--- a/conf/modules.config
+++ b/conf/modules.config
@ -52,12 +52,27 @@ process {
        ]
    }

+    withName: FASTQC_PROCESSED { // settings for the FASTQC alias that is run on preprocessed reads
+        ext.args = '--quiet'
+        ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } // closure over the sample meta map; adds a "_processed" suffix
+        publishDir = [
+            path: { "${params.outdir}/fastqc/processed" },
+            mode: 'copy',
+            pattern: '*.html' // only files matching *.html are published
+        ]
+    }
+
    withName: FASTP {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
-        // TODO also include option to NOT merge
-        ext.args   = [
-            { ${meta.single_end} } == 0 ? "-m" : '',
-            params.shortread_excludeunmerged ? '' : "--include_unmerged"
-        ].join(' ').trim()
+        // Closure: `meta` only exists per task (cf. ext.prefix); negating a bare `{ ... }` block was always false
+        ext.args   = { [
+            // collapsing options - option to retain singletons
+            params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged",
+            // trimming options: skip trimming, user-supplied adapters, else auto-detect adapters for paired-end data
+            params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "",
+            params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
+            !meta.single_end && params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : !meta.single_end ? "--detect_adapter_for_pe" : "",
+            "--length_required ${params.shortread_clipmerge_minlength}" // filtering options: minimum read length
+        ].join(' ').trim() }
        publishDir = [
            path: { "${params.outdir}/fastp" },
@ -75,16 +90,6 @@ process {
        ]
    }

-    withName: FASTQC_POST {
-        ext.args = '--quiet'
-        ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
-        publishDir = [
-            path: { "${params.outdir}/fastqc/processed" },
-            mode: 'copy',
-            pattern: '*.html'
-        ]
-    }
-
    withName: CAT_FASTQ {
        publishDir = [
            path: { "${params.outdir}/prepared_sequences" },
@ -100,7 +105,7 @@ process {
            pattern: '*.{rma6,tab,text,sam,log}'
        ]
        ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
    }

    withName: KRAKEN2_KRAKEN2 {
@ -110,7 +115,7 @@ process {
            pattern: '*.{fastq.gz,txt}'
        ]
        ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
    }

    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
--- a/conf/test.config
+++ b/conf/test.config
@ -23,7 +23,6 @@ params {
    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
    // TODO nf-core: Give any required params for the test so that command line flags are not needed
    input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
-    outdir              = "./results"
    databases           = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
    run_kraken2         = true
    run_malt            = true
--- a/modules.json
+++ b/modules.json
@ -24,11 +24,11 @@
            "multiqc": {
                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
            },
-            "untar": {
-                "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918"
-            },
            "porechop": {
                "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046"
+            },
+            "untar": {
+                "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918"
            }
        }
    }
--- a/nextflow.config
+++ b/nextflow.config
@ -55,9 +55,15 @@ params {
    databases = null

    // FASTQ preprocessing
-    shortread_clipmerge           = false
-    shortread_excludeunmerged     = true
-    longread_clip                 = false
+    shortread_clipmerge                     = false
+    shortread_clipmerge_tool                = 'fastp'
+    shortread_clipmerge_skipadaptertrim     = false
+    shortread_clipmerge_mergepairs          = false
+    shortread_clipmerge_excludeunmerged     = true
+    shortread_clipmerge_adapter1            = null
+    shortread_clipmerge_adapter2            = null
+    shortread_clipmerge_minlength           = 15
+    longread_clip                           = false

    // MALT
    run_malt                   = false
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -265,7 +265,7 @@
        "shortread_clipmerge": {
            "type": "boolean"
        },
-        "shortread_excludeunmerged": {
+        "shortread_clipmerge_excludeunmerged": {
            "type": "boolean",
            "default": true
        },
@ -281,6 +281,28 @@
        },
        "run_kraken2": {
            "type": "boolean"
+        },
+        "shortread_clipmerge_tool": {
+            "type": "string",
+            "default": "fastp"
+        },
+        "shortread_clipmerge_skipadaptertrim": {
+            "type": "boolean"
+        },
+        "shortread_clipmerge_mergepairs": {
+            "type": "boolean"
+        },
+        "shortread_clipmerge_adapter1": {
+            "type": "string",
+            "default": null
+        },
+        "shortread_clipmerge_adapter2": {
+            "type": "string",
+            "default": null
+        },
+        "shortread_clipmerge_minlength": {
+            "type": "integer",
+            "default": 15
        }
    }
 }
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@ -12,6 +12,9 @@ workflow DB_CHECK {
    main:

    // TODO: make database sheet check
+    // Checks:
+    // 1) no duplicates,
+    // 2) args do not have quotes, e.g. just `,,` and NOT `,"",`
    parsed_samplesheet = DATABASE_CHECK ( dbsheet )
        .csv
        .splitCsv ( header:true, sep:',' )
@ -21,7 +24,7 @@ workflow DB_CHECK {

    ch_dbs_for_untar = parsed_samplesheet
        .branch {
-            untar: it[1].toString().endsWith(".tar.gz")
+            untar: it[1].toString().endsWith(".tar.gz") && it[0]['tool'] != 'centrifuge'
            skip: true
        }

--- a/subworkflows/local/longread_preprocessing.nf
+++ b/subworkflows/local/longread_preprocessing.nf
@ -1,6 +1,9 @@
+/*
+Process long raw reads with porechop
+*/

-include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'
-include { PORECHOP              } from '../../modules/nf-core/modules/porechop/main'
+include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main'
+include { PORECHOP                   } from '../../modules/nf-core/modules/porechop/main'

 workflow LONGREAD_PREPROCESSING {
    take:
@ -21,9 +24,9 @@ workflow LONGREAD_PREPROCESSING {
                                        [ meta_new, reads ]
                                    }

-    FASTQC_POST ( PORECHOP.out.reads )
+    FASTQC_PROCESSED ( PORECHOP.out.reads )
    ch_versions = ch_versions.mix(PORECHOP.out.versions.first())
-    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )


    emit:
--- a/subworkflows/local/shortread_fastp.nf
+++ b/subworkflows/local/shortread_fastp.nf
@ -0,0 +1,61 @@
+/*
+Process short raw reads with FastP
+*/
+
+include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fastp/main'
+include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
+
+workflow SHORTREAD_FASTP {
+    take:
+    reads // channel: [ val(meta), [ reads ] ]; items whose meta.single_end is neither true nor false match no branch below
+
+    main:
+    ch_versions = Channel.empty()
+    ch_multiqc_files      = Channel.empty()
+    // Route each item to the single-end or paired-end fastp alias according to meta.single_end
+    ch_input_for_fastp = reads
+                            .dump(tag: "pre-fastp_branch")
+                            .branch{
+                                single: it[0]['single_end'] == true
+                                paired: it[0]['single_end'] == false
+                            }
+
+    ch_input_for_fastp.single.dump(tag: "input_fastp_single")
+    ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
+    // Single-end input: the merging switch (last argument) is always off
+    FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
+    // Last parameter here turns on merging of PE data
+    FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs )
+
+    if ( params.shortread_clipmerge_mergepairs ) {
+        // Merged pairs are re-flagged as single-end; use a boolean so the type matches what the branch above tests
+        ch_fastp_reads_prepped_pe = FASTP_PAIRED.out.reads_merged
+                                        .map {
+                                            meta, reads ->
+                                                def meta_new = meta.clone()
+                                                meta_new['single_end'] = true
+                                                [ meta_new, reads ]
+                                        }
+
+        ch_fastp_reads_prepped = ch_fastp_reads_prepped_pe.mix( FASTP_SINGLE.out.reads )
+    } else {
+        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads
+                                    .mix( FASTP_SINGLE.out.reads )
+    }
+    // Only the first versions item emitted by each alias is kept
+    ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
+    ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
+
+    ch_processed_reads = ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped")
+    // Gather the JSON outputs of both aliases (second tuple element) into the MultiQC channel
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
+
+    ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final")
+
+    emit:
+    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
+    versions = ch_versions          // channel: [ versions.yml ]
+    mqc      = ch_multiqc_files     // channel: fastp JSON outputs
+}
+
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@ -3,67 +3,28 @@
 //


-include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fastp/main'
-include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
-include { FASTQC as FASTQC_POST       } from '../../modules/nf-core/modules/fastqc/main'
+include { SHORTREAD_FASTP             } from './shortread_fastp'
+include { FASTQC as FASTQC_PROCESSED       } from '../../modules/nf-core/modules/fastqc/main'

 workflow SHORTREAD_PREPROCESSING {
    take:
    reads // file: /path/to/samplesheet.csv

    main:
-    ch_versions = Channel.empty()
-    ch_multiqc_files      = Channel.empty()
-
-    //
-    // STEP: Read clipping and merging
-    //
-    // TODO give option to clip only and retain pairs
-    // TODO give option to retain singletons (probably fastp option likely)
-    // TODO move to subworkflow
-
-
-    if ( params.shortread_clipmerge ) {
-
-        ch_input_for_fastp = reads
-                                .dump(tag: "pre-fastp_branch")
-                                .branch{
-                                    single: it[0]['single_end'] == true
-                                    paired: it[0]['single_end'] == false
-                                }
-
-        ch_input_for_fastp.single.dump(tag: "input_fastp_single")
-        ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
-
-        FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
-        FASTP_PAIRED ( ch_input_for_fastp.paired, false, true )
-
-        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
-                                    .mix( FASTP_SINGLE.out.reads )
-                                    .map {
-                                        meta, reads ->
-                                        def meta_new = meta.clone()
-                                        meta_new['single_end'] = 1
-                                        [ meta_new, reads ]
-                                    }
-
-        FASTQC_POST ( ch_fastp_reads_prepped )
-
-        ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
-        ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
-
-        ch_processed_reads = ch_fastp_reads_prepped
-
-        ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} )
-        ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
-        ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
-
-        ch_multiqc_files.dump(tag: "preprocessing_mqc_final")
+    ch_versions       = Channel.empty()
+    ch_multiqc_files  = Channel.empty()

+    if ( params.shortread_clipmerge_tool == "fastp" ) {
+        ch_processed_reads = SHORTREAD_FASTP ( reads ).reads
+        ch_versions        =  ch_versions.mix( SHORTREAD_FASTP.out.versions )
+        ch_multiqc_files   =  ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc )
    } else {
        ch_processed_reads = reads
    }

+    FASTQC_PROCESSED ( ch_processed_reads )
+    ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )

    emit:
    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -17,6 +17,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 // Check mandatory parameters
 if (params.input    ) { ch_input     = file(params.input)     } else { exit 1, 'Input samplesheet not specified!' }
 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
+if (!params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." // warn only when pairs stay unmerged, which is the case the message describes

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -36,11 +37,11 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
-include { INPUT_CHECK         } from '../subworkflows/local/input_check'
+include { INPUT_CHECK             } from '../subworkflows/local/input_check'

-include { DB_CHECK            } from '../subworkflows/local/db_check'
+include { DB_CHECK                } from '../subworkflows/local/db_check'
 include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing'
-include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing'
+include { LONGREAD_PREPROCESSING  } from '../subworkflows/local/longread_preprocessing'

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -73,9 +74,9 @@ workflow TAXPROFILER {

    ch_versions = Channel.empty()

-    //
-    // SUBWORKFLOW: Read in samplesheet, validate and stage input files
-    //
+    /*
+        SUBWORKFLOW: Read in samplesheet, validate and stage input files
+    */
    INPUT_CHECK (
        ch_input
    )
@ -85,22 +86,24 @@ workflow TAXPROFILER {
        ch_databases
    )

-    //
-    // MODULE: Run FastQC
-    //
+    /*
+        MODULE: Run FastQC
+    */
    ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq")
+
    FASTQC (
        ch_input_for_fastqc
    )
+
    ch_versions = ch_versions.mix(FASTQC.out.versions.first())

    CUSTOM_DUMPSOFTWAREVERSIONS (
        ch_versions.unique().collectFile(name: 'collated_versions.yml')
    )

-    //
-    // PERFORM PREPROCESSING
-    //
+    /*
+        SUBWORKFLOW: PERFORM PREPROCESSING
+    */
    if ( params.shortread_clipmerge ) {
        ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads
    } else {
@ -115,53 +118,32 @@ workflow TAXPROFILER {
        ch_longreads_preprocessed = INPUT_CHECK.out.nanopore
    }

-    //
-    // PERFORM SHORT READ RUN MERGING
-    // TODO: Check not necessary for long reads too?
-    //
-    ch_processed_for_combine = ch_shortreads_preprocessed
-        .dump(tag: "prep_for_combine_grouping")
-        .map {
-            meta, reads ->
-            def meta_new = meta.clone()
-            meta_new['run_accession'] = 'combined'
-            [ meta_new, reads ]
-        }
-        .groupTuple ( by: 0 )
-        .branch{
-            combine: it[1].size() >= 2
-            skip: it[1].size() < 2
-        }
-
-    CAT_FASTQ ( ch_processed_for_combine.combine )
-
-    ch_reads_for_profiling = ch_processed_for_combine.skip
-                                .dump(tag: "skip_combine")
-                                .mix( CAT_FASTQ.out.reads )
-                                .dump(tag: "files_for_profiling")
-
-    //
-    // COMBINE READS WITH POSSIBLE DATABASES
-    //
+    /*
+        COMBINE READS WITH POSSIBLE DATABASES
+    */

    // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
-    ch_input_for_profiling = ch_reads_for_profiling
+    ch_input_for_profiling = ch_shortreads_preprocessed
            .mix( ch_longreads_preprocessed )
            .combine(DB_CHECK.out.dbs)
-            .dump(tag: "reads_plus_db")
+            .dump(tag: "reads_plus_db_clean")
            .branch {
                malt:    it[2]['tool'] == 'malt'
                kraken2: it[2]['tool'] == 'kraken2'
                unknown: true
            }

-    //
-    // PREPARE PROFILER INPUT CHANNELS
-    //
+    /*
+        PREPARE PROFILER INPUT CHANNELS
+    */

    // We groupTuple to have all samples in one channel for MALT as database
    // loading takes a long time, so we only want to run it once per database
+    // TODO document somewhere we only accept illumina short reads for MALT?
    ch_input_for_malt =  ch_input_for_profiling.malt
+                            .dump(tag: "input_to_malt_prefilter")
+                            .filter { it[0]['instrument_platform'] == 'ILLUMINA' }
+                            .dump(tag: "input_to_malt_postfilter")
                            .map {
                                it ->
                                    def temp_meta =  [ id: it[2]['db_name']]  + it[2]
@ -169,7 +151,7 @@ workflow TAXPROFILER {
                                    [ temp_meta, it[1], db ]
                            }
                            .groupTuple(by: [0,2])
-                            .dump(tag: "input for malt")
+                            .dump(tag: "input_to_malt")
                            .multiMap {
                                it ->
                                    reads: [ it[0], it[1].flatten() ]
@ -178,16 +160,16 @@ workflow TAXPROFILER {

    // We can run Kraken2 one-by-one sample-wise
    ch_input_for_kraken2 =  ch_input_for_profiling.kraken2
-                            .dump(tag: "input for kraken")
+                            .dump(tag: "input_to_kraken")
                            .multiMap {
                                it ->
                                    reads: [ it[0] + it[2], it[1] ]
                                    db: it[3]
                            }

-    //
-    // RUN PROFILING
-    //
+    /*
+        MODULE: RUN PROFILING
+    */
    if ( params.run_malt ) {
        MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
    }
@ -196,9 +178,9 @@ workflow TAXPROFILER {
        KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db  )
    }

-    //
-    // MODULE: MultiQC
-    //
+    /*
+        MODULE: MultiQC
+    */
    workflow_summary    = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params)
    ch_workflow_summary = Channel.value(workflow_summary)