Merge pull request #33 from nf-core/improve-shortread-input

Improve shortread input preprocessing
2024-11-22 09:19:54 +00:00 · 2022-03-30 11:48:32 +02:00 · 2022-03-30 11:48:32 +02:00 · 323883bd3e
commit 323883bd3e
parent fafb7e0f6f 76e9624e3f
10 changed files with 173 additions and 131 deletions
--- a/conf/modules.config
+++ b/conf/modules.config
@ -52,12 +52,27 @@ process {
        ]
    }
    withName: FASTQC_PROCESSED {
        ext.args = '--quiet'
        ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
        publishDir = [
            path: { "${params.outdir}/fastqc/processed" },
            mode: 'copy',
            pattern: '*.html'
        ]
    }
    withName: FASTP {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        // TODO also include option to NOT merge
        // Command-line options for fastp, assembled per task.
        // The list is wrapped in a closure (like ext.prefix above) so that `meta`
        // is resolved lazily for each sample. The previous `!{ ${meta.single_end} }`
        // form negated a closure literal, which is always truthy, so the R2 adapter
        // and paired-end adapter auto-detection flags could never be emitted.
        ext.args   = { [
-            { ${meta.single_end} } == 0 ? "-m" : '',
+            // collapsing options - option to retain singletons
-            params.shortread_excludeunmerged ? '' : "--include_unmerged"
+            params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged",
            // trimming options
            params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "",
            params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
            // paired-end only: use the user-supplied R2 adapter if given, otherwise let fastp auto-detect adapters
            !meta.single_end && params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : !meta.single_end ? "--detect_adapter_for_pe" : "",
            // filtering options
            "--length_required ${params.shortread_clipmerge_minlength}"
        ].join(' ').trim() }
        publishDir = [
            path: { "${params.outdir}/fastp" },
@ -75,16 +90,6 @@ process {
        ]
    }
    withName: FASTQC_POST {
        ext.args = '--quiet'
        ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
        publishDir = [
            path: { "${params.outdir}/fastqc/processed" },
            mode: 'copy',
            pattern: '*.html'
        ]
    }
    withName: CAT_FASTQ {
        publishDir = [
            path: { "${params.outdir}/prepared_sequences" },
@ -100,7 +105,7 @@ process {
            pattern: '*.{rma6,tab,text,sam,log}'
        ]
        ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
    }
    withName: KRAKEN2_KRAKEN2 {
@ -110,7 +115,7 @@ process {
            pattern: '*.{fastq.gz,txt}'
        ]
        ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
    }
    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
--- a/conf/test.config
+++ b/conf/test.config
@ -23,7 +23,6 @@ params {
    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
    // TODO nf-core: Give any required params for the test so that command line flags are not needed
    input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
    outdir              = "./results"
    databases           = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
    run_kraken2         = true
    run_malt            = true
--- a/modules.json
+++ b/modules.json
@ -24,11 +24,11 @@
            "multiqc": {
                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
            },
            "untar": {
                "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918"
            },
            "porechop": {
                "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046"
            },
            "untar": {
                "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918"
            }
        }
    }
--- a/nextflow.config
+++ b/nextflow.config
@ -56,7 +56,13 @@ params {
    // FASTQ preprocessing
    shortread_clipmerge                     = false
-    shortread_excludeunmerged     = true
+    shortread_clipmerge_tool                = 'fastp'
    shortread_clipmerge_skipadaptertrim     = false
    shortread_clipmerge_mergepairs          = false
    shortread_clipmerge_excludeunmerged     = true
    shortread_clipmerge_adapter1            = null
    shortread_clipmerge_adapter2            = null
    shortread_clipmerge_minlength           = 15
    longread_clip                           = false
    // MALT
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -265,7 +265,7 @@
        "shortread_clipmerge": {
            "type": "boolean"
        },
-        "shortread_excludeunmerged": {
+        "shortread_clipmerge_excludeunmerged": {
            "type": "boolean",
            "default": true
        },
@ -281,6 +281,28 @@
        },
        "run_kraken2": {
            "type": "boolean"
        },
        "shortread_clipmerge_tool": {
            "type": "string",
            "default": "fastp"
        },
        "shortread_clipmerge_skipadaptertrim": {
            "type": "boolean"
        },
        "shortread_clipmerge_mergepairs": {
            "type": "boolean"
        },
        "shortread_clipmerge_adapter1": {
            "type": "string",
            "default": null
        },
        "shortread_clipmerge_adapter2": {
            "type": "string",
            "default": null
        },
        "shortread_clipmerge_minlength": {
            "type": "integer",
            "default": 15
        }
    }
 }
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@ -12,6 +12,9 @@ workflow DB_CHECK {
    main:
    // TODO: make database sheet check
    // Checks:
    // 1) no duplicates,
    // 2) args do not have quotes, e.g. just `,,` and NOT `,"",`
    parsed_samplesheet = DATABASE_CHECK ( dbsheet )
        .csv
        .splitCsv ( header:true, sep:',' )
@ -21,7 +24,7 @@ workflow DB_CHECK {
    ch_dbs_for_untar = parsed_samplesheet
        .branch {
-            untar: it[1].toString().endsWith(".tar.gz")
+            untar: it[1].toString().endsWith(".tar.gz") && it[0]['tool'] != 'centrifuge'
            skip: true
        }
--- a/subworkflows/local/longread_preprocessing.nf
+++ b/subworkflows/local/longread_preprocessing.nf
@ -1,5 +1,8 @@
 /*
 Process long raw reads with porechop
 */
-include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'
+include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main'
 include { PORECHOP                   } from '../../modules/nf-core/modules/porechop/main'
 workflow LONGREAD_PREPROCESSING {
@ -21,9 +24,9 @@ workflow LONGREAD_PREPROCESSING {
                                        [ meta_new, reads ]
                                    }
-    FASTQC_POST ( PORECHOP.out.reads )
+    FASTQC_PROCESSED ( PORECHOP.out.reads )
    ch_versions = ch_versions.mix(PORECHOP.out.versions.first())
-    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
    emit:
--- a/subworkflows/local/shortread_fastp.nf
+++ b/subworkflows/local/shortread_fastp.nf
@ -0,0 +1,61 @@
 /*
 Process short raw reads with FastP
 */
 include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fastp/main'
 include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
 // Subworkflow: runs fastp separately on single-end and paired-end short reads
 // and emits the processed reads, tool versions and fastp JSON reports.
 workflow SHORTREAD_FASTP {
    take:
    reads // [[meta], [reads]]
    main:
    // Accumulators for software versions and MultiQC input files
    ch_versions = Channel.empty()
    ch_multiqc_files      = Channel.empty()
    // Split the input by the boolean `single_end` flag carried in the meta map (element 0)
    ch_input_for_fastp = reads
                            .dump(tag: "pre-fastp_branch")
                            .branch{
                                single: it[0]['single_end'] == true
                                paired: it[0]['single_end'] == false
                            }
    // Debug taps only; the returned channels are not used further
    ch_input_for_fastp.single.dump(tag: "input_fastp_single")
    ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
    // NOTE(review): the meaning of the second positional argument (`false`) is defined
    // by the FASTP module, which is not visible here - confirm against the module
    FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
    // Last parameter here turns on merging of PE data
    FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs )
    if ( params.shortread_clipmerge_mergepairs ) {
        // Merged pairs are re-labelled as single-end on a copy of the meta map
        // NOTE(review): `single_end` is set to the integer 1 here, whereas the branch
        // above compares against boolean true/false - confirm downstream consumers
        // handle the integer value as intended
        ch_fastp_reads_prepped_pe = FASTP_PAIRED.out.reads_merged
                                        .map {
                                            meta, reads ->
                                                def meta_new = meta.clone()
                                                meta_new['single_end'] = 1
                                                [ meta_new, reads ]
                                        }
        ch_fastp_reads_prepped = ch_fastp_reads_prepped_pe.mix( FASTP_SINGLE.out.reads )
    } else {
        // No merging requested: pass the paired-end output through alongside the single-end output
        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads
                                    .mix( FASTP_SINGLE.out.reads )
    }
    // Only the first versions item from each fastp alias is mixed in
    ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
    ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
    ch_processed_reads = ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped")
    // Collect the second element of each fastp JSON output tuple for MultiQC
    ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
    ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
    // Debug tap only; the result is not assigned
    ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final")
    emit:
    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
    versions = ch_versions          // channel: [ versions.yml ]
    mqc      = ch_multiqc_files
 }
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@ -3,9 +3,8 @@
 //
-include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fastp/main'
+include { SHORTREAD_FASTP             } from './shortread_fastp'
-include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
+include { FASTQC as FASTQC_PROCESSED       } from '../../modules/nf-core/modules/fastqc/main'
 include { FASTQC as FASTQC_POST       } from '../../modules/nf-core/modules/fastqc/main'
 workflow SHORTREAD_PREPROCESSING {
    take:
@ -15,55 +14,17 @@ workflow SHORTREAD_PREPROCESSING {
    ch_versions       = Channel.empty()
    ch_multiqc_files  = Channel.empty()
-    //
+    if ( params.shortread_clipmerge_tool == "fastp" ) {
-    // STEP: Read clipping and merging
+        ch_processed_reads = SHORTREAD_FASTP ( reads ).reads
-    //
+        ch_versions        =  ch_versions.mix( SHORTREAD_FASTP.out.versions )
-    // TODO give option to clip only and retain pairs
+        ch_multiqc_files   =  ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc )
    // TODO give option to retain singletons (probably a fastp option)
    // TODO move to subworkflow
    if ( params.shortread_clipmerge ) {
        ch_input_for_fastp = reads
                                .dump(tag: "pre-fastp_branch")
                                .branch{
                                    single: it[0]['single_end'] == true
                                    paired: it[0]['single_end'] == false
                                }
        ch_input_for_fastp.single.dump(tag: "input_fastp_single")
        ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
        FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
        FASTP_PAIRED ( ch_input_for_fastp.paired, false, true )
        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
                                    .mix( FASTP_SINGLE.out.reads )
                                    .map {
                                        meta, reads ->
                                        def meta_new = meta.clone()
                                        meta_new['single_end'] = 1
                                        [ meta_new, reads ]
                                    }
        FASTQC_POST ( ch_fastp_reads_prepped )
        ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
        ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
        ch_processed_reads = ch_fastp_reads_prepped
        ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} )
        ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
        ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
        ch_multiqc_files.dump(tag: "preprocessing_mqc_final")
    } else {
        ch_processed_reads = reads
    }
    FASTQC_PROCESSED ( ch_processed_reads )
    ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
    emit:
    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -17,6 +17,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 // Check mandatory parameters
 if (params.input    ) { ch_input     = file(params.input)     } else { exit 1, 'Input samplesheet not specified!' }
 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
 // Warn when MALT is run without pair merging: per the message below, unmerged
 // pairs are profiled as separate files. The previous condition was inverted
 // (it fired when merging was enabled) and the message misspelled "accept".
 if (!params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files."
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -73,9 +74,9 @@ workflow TAXPROFILER {
    ch_versions = Channel.empty()
-    //
+    /*
-    // SUBWORKFLOW: Read in samplesheet, validate and stage input files
+        SUBWORKFLOW: Read in samplesheet, validate and stage input files
-    //
+    */
    INPUT_CHECK (
        ch_input
    )
@ -85,22 +86,24 @@ workflow TAXPROFILER {
        ch_databases
    )
-    //
+    /*
-    // MODULE: Run FastQC
+        MODULE: Run FastQC
-    //
+    */
    ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq")
    FASTQC (
        ch_input_for_fastqc
    )
    ch_versions = ch_versions.mix(FASTQC.out.versions.first())
    CUSTOM_DUMPSOFTWAREVERSIONS (
        ch_versions.unique().collectFile(name: 'collated_versions.yml')
    )
-    //
+    /*
-    // PERFORM PREPROCESSING
+        SUBWORKFLOW: PERFORM PREPROCESSING
-    //
+    */
    if ( params.shortread_clipmerge ) {
        ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads
    } else {
@ -115,53 +118,32 @@ workflow TAXPROFILER {
        ch_longreads_preprocessed = INPUT_CHECK.out.nanopore
    }
-    //
+    /*
-    // PERFORM SHORT READ RUN MERGING
+        COMBINE READS WITH POSSIBLE DATABASES
-    // TODO: Check not necessary for long reads too?
+    */
    //
    ch_processed_for_combine = ch_shortreads_preprocessed
        .dump(tag: "prep_for_combine_grouping")
        .map {
            meta, reads ->
            def meta_new = meta.clone()
            meta_new['run_accession'] = 'combined'
            [ meta_new, reads ]
        }
        .groupTuple ( by: 0 )
        .branch{
            combine: it[1].size() >= 2
            skip: it[1].size() < 2
        }
    CAT_FASTQ ( ch_processed_for_combine.combine )
    ch_reads_for_profiling = ch_processed_for_combine.skip
                                .dump(tag: "skip_combine")
                                .mix( CAT_FASTQ.out.reads )
                                .dump(tag: "files_for_profiling")
    //
    // COMBINE READS WITH POSSIBLE DATABASES
    //
    // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
-    ch_input_for_profiling = ch_reads_for_profiling
+    ch_input_for_profiling = ch_shortreads_preprocessed
            .mix( ch_longreads_preprocessed )
            .combine(DB_CHECK.out.dbs)
-            .dump(tag: "reads_plus_db")
+            .dump(tag: "reads_plus_db_clean")
            .branch {
                malt:    it[2]['tool'] == 'malt'
                kraken2: it[2]['tool'] == 'kraken2'
                unknown: true
            }
-    //
+    /*
-    // PREPARE PROFILER INPUT CHANNELS
+        PREPARE PROFILER INPUT CHANNELS
-    //
+    */
    // We groupTuple to have all samples in one channel for MALT as database
    // loading takes a long time, so we only want to run it once per database
    // TODO document somewhere we only accept illumina short reads for MALT?
    ch_input_for_malt =  ch_input_for_profiling.malt
                            .dump(tag: "input_to_malt_prefilter")
                            .filter { it[0]['instrument_platform'] == 'ILLUMINA' }
                            .dump(tag: "input_to_malt_postfilter")
                            .map {
                                it ->
                                    def temp_meta =  [ id: it[2]['db_name']]  + it[2]
@ -169,7 +151,7 @@ workflow TAXPROFILER {
                                    [ temp_meta, it[1], db ]
                            }
                            .groupTuple(by: [0,2])
-                            .dump(tag: "input for malt")
+                            .dump(tag: "input_to_malt")
                            .multiMap {
                                it ->
                                    reads: [ it[0], it[1].flatten() ]
@ -178,16 +160,16 @@ workflow TAXPROFILER {
    // We can run Kraken2 one-by-one sample-wise
    ch_input_for_kraken2 =  ch_input_for_profiling.kraken2
-                            .dump(tag: "input for kraken")
+                            .dump(tag: "input_to_kraken")
                            .multiMap {
                                it ->
                                    reads: [ it[0] + it[2], it[1] ]
                                    db: it[3]
                            }
-    //
+    /*
-    // RUN PROFILING
+        MODULE: RUN PROFILING
-    //
+    */
    if ( params.run_malt ) {
        MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
    }
@ -196,9 +178,9 @@ workflow TAXPROFILER {
        KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db  )
    }
-    //
+    /*
-    // MODULE: MultiQC
+        MODULE: MultiQC
-    //
+    */
    workflow_summary    = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params)
    ch_workflow_summary = Channel.value(workflow_summary)