Add improved read preprocessing

2024-12-22 15:18:16 +00:00 · 2022-03-25 14:58:06 +01:00 · 2022-03-25 14:58:06 +01:00 · 4e93abc7c0
commit 4e93abc7c0
parent fafb7e0f6f
6 changed files with 107 additions and 50 deletions
--- a/conf/modules.config
+++ b/conf/modules.config
@ -52,12 +52,27 @@ process {
        ]
    }

+    withName: FASTQC_PROCESSED { // settings for the FASTQC alias included in shortread_preprocessing.nf
+        ext.args = '--quiet'
+        ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } // closure: built from each sample's meta map
+        publishDir = [
+            path: { "${params.outdir}/fastqc/processed" },
+            mode: 'copy',
+            pattern: '*.html' // only files matching *.html are published
+        ]
+    }
+
    withName: FASTP {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
-        // TODO also include option to NOT merge
-        ext.args   = [
-            { ${meta.single_end} } == 0 ? "-m" : '',
-            params.shortread_excludeunmerged ? '' : "--include_unmerged"
-        ].join(' ').trim()
+        ext.args   = { [
+            // collapsing options
+            params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged",
+            // trimming options (ext.args is a closure so that `meta` can be read when the arguments are built)
+            params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "",
+            params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
+            meta.single_end ? "" : ( params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe" ),
+            // filtering options
+            "--length_required ${params.shortread_clipmerge_minlength}"
+        ].join(' ').trim() }
        publishDir = [
            path: { "${params.outdir}/fastp" },
--- a/nextflow.config
+++ b/nextflow.config
@ -55,9 +55,15 @@ params {
    databases = null

    // FASTQ preprocessing
-    shortread_clipmerge           = false
-    shortread_excludeunmerged     = true
-    longread_clip                 = false
+    shortread_clipmerge                     = false
+    shortread_clipmerge_tool                = 'fastp'
+    shortread_clipmerge_skiptrim            = false
+    shortread_clipmerge_mergepairs          = false
+    shortread_clipmerge_excludeunmerged     = true
+    shortread_clipmerge_adapter1            = null
+    shortread_clipmerge_adapter2            = null
+    shortread_clipmerge_minlength           = 15
+    longread_clip                           = false

    // MALT
    run_malt                   = false
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -265,7 +265,7 @@
        "shortread_clipmerge": {
            "type": "boolean"
        },
-        "shortread_excludeunmerged": {
+        "shortread_clipmerge_excludeunmerged": {
            "type": "boolean",
            "default": true
        },
--- a/subworkflows/local/shortread_fastp.nf
+++ b/subworkflows/local/shortread_fastp.nf
@ -0,0 +1,65 @@
+//
+// Clip and optionally merge short reads with fastp
+//
+
+
+include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fastp/main'
+include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
+
+workflow SHORTREAD_FASTP { // runs fastp separately on single- and paired-end reads and emits the processed reads
+    take:
+    reads // channel: [ val(meta), [ reads ] ]
+
+    main:
+    ch_versions      = Channel.empty()
+    ch_multiqc_files = Channel.empty()
+
+    //
+    // STEP: Read clipping and merging
+    //
+    // TODO give option to retain singletons (probably fastp option likely)
+    // Split the input on meta.single_end so each library type goes to its own fastp alias
+
+    ch_input_for_fastp = reads
+                            .dump(tag: "pre-fastp_branch")
+                            .branch{
+                                single: it[0]['single_end'] == true
+                                paired: it[0]['single_end'] == false
+                            }
+
+    ch_input_for_fastp.single.dump(tag: "input_fastp_single")
+    ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
+
+    FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
+    FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs )
+
+    if ( params.shortread_clipmerge_mergepairs ) {
+        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
+                                    .mix( FASTP_SINGLE.out.reads )
+                                    .map {
+                                        meta, reads ->
+                                        def meta_new = meta.clone()
+                                        meta_new['single_end'] = true // keep the flag boolean, like the values the branch above compares against
+                                        [ meta_new, reads ]
+                                        }
+    } else {
+        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads
+                                    .mix( FASTP_SINGLE.out.reads )
+    }
+
+    ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
+    ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
+
+    ch_processed_reads = ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped")
+
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
+
+    ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final")
+
+    emit:
+    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
+    versions = ch_versions          // channel: [ versions.yml ]
+    mqc      = ch_multiqc_files     // channel: fastp JSON reports collected for MultiQC
+}
+
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@ -3,17 +3,16 @@
 //


-include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fastp/main'
-include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
-include { FASTQC as FASTQC_POST       } from '../../modules/nf-core/modules/fastqc/main'
+include { SHORTREAD_FASTP             } from './shortread_fastp'
+include { FASTQC as FASTQC_PROCESSED       } from '../../modules/nf-core/modules/fastqc/main'

 workflow SHORTREAD_PREPROCESSING {
    take:
    reads // file: /path/to/samplesheet.csv

    main:
-    ch_versions = Channel.empty()
-    ch_multiqc_files      = Channel.empty()
+    ch_versions       = Channel.empty()
+    ch_multiqc_files  = Channel.empty()

    //
    // STEP: Read clipping and merging
@ -22,50 +21,20 @@ workflow SHORTREAD_PREPROCESSING {
    // TODO give option to retain singletons (probably fastp option likely)
    // TODO move to subworkflow

-
-    if ( params.shortread_clipmerge ) {
-
-        ch_input_for_fastp = reads
-                                .dump(tag: "pre-fastp_branch")
-                                .branch{
-                                    single: it[0]['single_end'] == true
-                                    paired: it[0]['single_end'] == false
-                                }
-
-        ch_input_for_fastp.single.dump(tag: "input_fastp_single")
-        ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
-
-        FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
-        FASTP_PAIRED ( ch_input_for_fastp.paired, false, true )
-
-        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
-                                    .mix( FASTP_SINGLE.out.reads )
-                                    .map {
-                                        meta, reads ->
-                                        def meta_new = meta.clone()
-                                        meta_new['single_end'] = 1
-                                        [ meta_new, reads ]
-                                    }
-
-        FASTQC_POST ( ch_fastp_reads_prepped )
-
-        ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
-        ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
-
-        ch_processed_reads = ch_fastp_reads_prepped
-
-        ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} )
-        ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
-        ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
-
-        ch_multiqc_files.dump(tag: "preprocessing_mqc_final")
-
+    if ( params.shortread_clipmerge_tool == "fastp" ) {
+        ch_processed_reads = SHORTREAD_FASTP ( reads ).reads
+        ch_versions        =  ch_versions.mix( SHORTREAD_FASTP.out.versions )
+        ch_multiqc_files   =  ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc )
    } else {
        ch_processed_reads = reads
    }

+    //FASTQC_PROCESSED ( ch_processed_reads )
+    //ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
+    //ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )

    emit:
+            // TODO: problem, this is being exported as a multi-channel output? This is why FASTQC is broken
    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
    versions = ch_versions          // channel: [ versions.yml ]
    mqc      = ch_multiqc_files
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -17,6 +17,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 // Check mandatory parameters
 if (params.input    ) { ch_input     = file(params.input)     } else { exit 1, 'Input samplesheet not specified!' }
 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
+if (!params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." // warn only when pairs are left unmerged

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -135,6 +136,7 @@ workflow TAXPROFILER {

    CAT_FASTQ ( ch_processed_for_combine.combine )

+    // TODO May need to flatten reads?
    ch_reads_for_profiling = ch_processed_for_combine.skip
                                .dump(tag: "skip_combine")
                                .mix( CAT_FASTQ.out.reads )