
Add improved read preprocessing

James Fellows Yates 2022-03-25 14:58:06 +01:00
parent fafb7e0f6f
commit 4e93abc7c0
6 changed files with 107 additions and 50 deletions


@@ -52,12 +52,27 @@ process {
         ]
     }
 
+    withName: FASTQC_PROCESSED {
+        ext.args   = '--quiet'
+        ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
+        publishDir = [
+            path: { "${params.outdir}/fastqc/processed" },
+            mode: 'copy',
+            pattern: '*.html'
+        ]
+    }
+
     withName: FASTP {
         ext.prefix = { "${meta.id}_${meta.run_accession}" }
-        // TODO also include option to NOT merge
         ext.args = [
-            { ${meta.single_end} } == 0 ? "-m" : '',
-            params.shortread_excludeunmerged ? '' : "--include_unmerged"
+            // collapsing options
+            params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged",
+            // trimming options
+            params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "",
+            params.shortread_adapter1 ? "--adapter_sequence ${params.shortread_adapter1}" : "",
+            !{ ${meta.single_end} } && params.shortread_adapter2 ? "--adapter_sequence_r2 ${params.shortread_adapter2}" : !{ ${meta.single_end} } ? "--detect_adapter_for_pe" : "",
+            // filtering options
+            "--length_required ${params.shortread_clipmerge_minlength}"
         ].join(' ').trim()
         publishDir = [
             path: { "${params.outdir}/fastp" },


@@ -55,9 +55,15 @@ params {
     databases                           = null
 
     // FASTQ preprocessing
     shortread_clipmerge                 = false
-    shortread_excludeunmerged           = true
-    longread_clip                       = false
+    shortread_clipmerge_tool            = 'fastp'
+    shortread_clipmerge_skiptrim        = false
+    shortread_clipmerge_mergepairs      = false
+    shortread_clipmerge_excludeunmerged = true
+    shortread_clipmerge_adapter1        = null
+    shortread_clipmerge_adapter2        = null
+    shortread_clipmerge_minlength       = 15
+    longread_clip                       = false
 
     // MALT
     run_malt                            = false
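
As a hedged usage sketch (not taken from the commit), the new defaults could be overridden from a user config supplied with Nextflow's -c option; the file name custom.config and every value below are illustrative.

// custom.config -- hypothetical override of the new preprocessing defaults
params {
    shortread_clipmerge            = true
    shortread_clipmerge_mergepairs = true
    shortread_clipmerge_adapter1   = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCA'  // example adapter sequence
    shortread_clipmerge_minlength  = 30
}

Any parameter not overridden keeps the default added above.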


@@ -265,7 +265,7 @@
         "shortread_clipmerge": {
             "type": "boolean"
         },
-        "shortread_excludeunmerged": {
+        "shortread_clipmerge_excludeunmerged": {
             "type": "boolean",
             "default": true
         },


@@ -0,0 +1,65 @@
+//
+// Check input samplesheet and get read channels
+//
+
+include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main'
+include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main'
+
+workflow SHORTREAD_FASTP {
+    take:
+    reads // file: /path/to/samplesheet.csv
+
+    main:
+    ch_versions      = Channel.empty()
+    ch_multiqc_files = Channel.empty()
+
+    //
+    // STEP: Read clipping and merging
+    //
+    // TODO give option to retain singletons (probably fastp option likely)
+    // TODO move to subworkflow
+
+    ch_input_for_fastp = reads
+        .dump(tag: "pre-fastp_branch")
+        .branch{
+            single: it[0]['single_end'] == true
+            paired: it[0]['single_end'] == false
+        }
+
+    ch_input_for_fastp.single.dump(tag: "input_fastp_single")
+    ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
+
+    FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
+    FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs )
+
+    if ( params.shortread_clipmerge_mergepairs ) {
+        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
+            .mix( FASTP_SINGLE.out.reads )
+            .map {
+                meta, reads ->
+                    def meta_new = meta.clone()
+                    meta_new['single_end'] = 1
+                    [ meta_new, reads ]
+            }
+    } else {
+        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads
+            .mix( FASTP_SINGLE.out.reads )
+    }
+
+    ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
+    ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
+
+    ch_processed_reads = ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped")
+
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
+
+    ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final")
+
+    emit:
+    reads    = ch_processed_reads // channel: [ val(meta), [ reads ] ]
+    versions = ch_versions        // channel: [ versions.yml ]
+    mqc      = ch_multiqc_files
+}
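
A hedged sketch (not in the commit) of how a caller could wire the new subworkflow; it mirrors what the reworked SHORTREAD_PREPROCESSING below does, and the TEST_SHORTREAD_FASTP wrapper name is hypothetical.

// Hypothetical caller of the new subworkflow, for illustration only.
include { SHORTREAD_FASTP } from './shortread_fastp'

workflow TEST_SHORTREAD_FASTP {
    take:
    reads // channel: [ val(meta), [ reads ] ]

    main:
    SHORTREAD_FASTP ( reads )

    emit:
    reads    = SHORTREAD_FASTP.out.reads    // fastp-processed reads
    versions = SHORTREAD_FASTP.out.versions // tool versions for the pipeline report
    mqc      = SHORTREAD_FASTP.out.mqc      // fastp JSON reports for MultiQC
}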


@@ -3,17 +3,16 @@
 //
 
-include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main'
-include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main'
-include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'
+include { SHORTREAD_FASTP            } from './shortread_fastp'
+include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main'
 
 workflow SHORTREAD_PREPROCESSING {
     take:
     reads // file: /path/to/samplesheet.csv
 
     main:
     ch_versions      = Channel.empty()
     ch_multiqc_files = Channel.empty()
 
     //
     // STEP: Read clipping and merging
@@ -22,50 +21,20 @@ workflow SHORTREAD_PREPROCESSING {
     // TODO give option to retain singletons (probably fastp option likely)
     // TODO move to subworkflow
 
-    if ( params.shortread_clipmerge ) {
-        ch_input_for_fastp = reads
-            .dump(tag: "pre-fastp_branch")
-            .branch{
-                single: it[0]['single_end'] == true
-                paired: it[0]['single_end'] == false
-            }
-        ch_input_for_fastp.single.dump(tag: "input_fastp_single")
-        ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
-        FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
-        FASTP_PAIRED ( ch_input_for_fastp.paired, false, true )
-        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
-            .mix( FASTP_SINGLE.out.reads )
-            .map {
-                meta, reads ->
-                    def meta_new = meta.clone()
-                    meta_new['single_end'] = 1
-                    [ meta_new, reads ]
-            }
-        FASTQC_POST ( ch_fastp_reads_prepped )
-        ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
-        ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
-        ch_processed_reads = ch_fastp_reads_prepped
-        ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} )
-        ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
-        ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
-        ch_multiqc_files.dump(tag: "preprocessing_mqc_final")
+    if ( params.shortread_clipmerge_tool == "fastp" ) {
+        ch_processed_reads = SHORTREAD_FASTP ( reads ).reads
+        ch_versions        = ch_versions.mix( SHORTREAD_FASTP.out.versions )
+        ch_multiqc_files   = ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc )
     } else {
         ch_processed_reads = reads
     }
 
+    //FASTQC_PROCESSED ( ch_processed_reads )
+    //ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
+    //ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
 
     emit:
+    // TODO: problem, this is being exported as a multi-channel output? This is why FASTQC is broken
     reads    = ch_processed_reads // channel: [ val(meta), [ reads ] ]
     versions = ch_versions        // channel: [ versions.yml ]
     mqc      = ch_multiqc_files


@@ -17,6 +17,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 // Check mandatory parameters
 if (params.input    ) { ch_input     = file(params.input)     } else { exit 1, 'Input samplesheet not specified!' }
 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
+if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not except uncollapsed paired-reads. Pairs will be profiled as separate files."
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -135,6 +136,7 @@ workflow TAXPROFILER {
     CAT_FASTQ ( ch_processed_for_combine.combine )
 
+    // TODO May need to flatten reads?
     ch_reads_for_profiling = ch_processed_for_combine.skip
         .dump(tag: "skip_combine")
         .mix( CAT_FASTQ.out.reads )