From 4e93abc7c06b35eae584af4338c8aeaa60b8fcf8 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 14:58:06 +0100 Subject: [PATCH 01/21] Add read improved read preprocessing --- conf/modules.config | 21 +++++- nextflow.config | 12 +++- nextflow_schema.json | 2 +- subworkflows/local/shortread_fastp.nf | 65 +++++++++++++++++++ subworkflows/local/shortread_preprocessing.nf | 55 ++++------------ workflows/taxprofiler.nf | 2 + 6 files changed, 107 insertions(+), 50 deletions(-) create mode 100644 subworkflows/local/shortread_fastp.nf diff --git a/conf/modules.config b/conf/modules.config index 29a5135..36bc626 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -52,12 +52,27 @@ process { ] } + withName: FASTQC_PROCESSED { + ext.args = '--quiet' + ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } + publishDir = [ + path: { "${params.outdir}/fastqc/processed" }, + mode: 'copy', + pattern: '*.html' + ] + } + withName: FASTP { ext.prefix = { "${meta.id}_${meta.run_accession}" } - // TODO also include option to NOT merge ext.args = [ - { ${meta.single_end} } == 0 ? "-m" : '', - params.shortread_excludeunmerged ? '' : "--include_unmerged" + // collapsing options + params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged", + // trimming options + params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "", + params.shortread_adapter1 ? "--adapter_sequence ${params.shortread_adapter1}" : "", + !{ ${meta.single_end} } && params.shortread_adapter2 ? "--adapter_sequence_r2 ${params.shortread_adapter2}" : !{ ${meta.single_end} } ? "--detect_adapter_for_pe" : "" + // filtering options + "--length_required ${params.shortread_clipmerge_minlength}" ].join(' ').trim() publishDir = [ path: { "${params.outdir}/fastp" }, diff --git a/nextflow.config b/nextflow.config index cc77a99..a312d0c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -55,9 +55,15 @@ params { databases = null // FASTQ preprocessing - shortread_clipmerge = false - shortread_excludeunmerged = true - longread_clip = false + shortread_clipmerge = false + shortread_clipmerge_tool = 'fastp' + shortread_clipmerge_skiptrim = false + shortread_clipmerge_mergepairs = false + shortread_clipmerge_excludeunmerged = true + shortread_clipmerge_adapter1 = null + shortread_clipmerge_adapter2 = null + shortread_clipmerge_minlength = 15 + longread_clip = false // MALT run_malt = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 9527da4..0fa217f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -265,7 +265,7 @@ "shortread_clipmerge": { "type": "boolean" }, - "shortread_excludeunmerged": { + "shortread_clipmerge_excludeunmerged": { "type": "boolean", "default": true }, diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf new file mode 100644 index 0000000..87aba25 --- /dev/null +++ b/subworkflows/local/shortread_fastp.nf @@ -0,0 +1,65 @@ +// +// Check input samplesheet and get read channels +// + + +include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' +include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' + +workflow SHORTREAD_FASTP { + take: + reads // file: /path/to/samplesheet.csv + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + // + // STEP: Read clipping and merging + // + // TODO give option to retain singletons (probably fastp option likely) + // TODO move to subworkflow + + ch_input_for_fastp = reads + .dump(tag: "pre-fastp_branch") + .branch{ + single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + ch_input_for_fastp.single.dump(tag: "input_fastp_single") + ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") + + FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs ) + + if ( params.shortread_clipmerge_mergepairs ) { + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged + .mix( FASTP_SINGLE.out.reads ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + } else { + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads + .mix( FASTP_SINGLE.out.reads ) + } + + ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) + ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) + + ch_processed_reads = ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped") + + ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) + + ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final") + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index d996a76..c31289d 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -3,17 +3,16 @@ // -include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' -include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' -include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main' +include { SHORTREAD_FASTP } from './shortread_fastp' +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main' workflow SHORTREAD_PREPROCESSING { take: reads // file: /path/to/samplesheet.csv main: - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() // // STEP: Read clipping and merging @@ -22,50 +21,20 @@ workflow SHORTREAD_PREPROCESSING { // TODO give option to retain singletons (probably fastp option likely) // TODO move to subworkflow - - if ( params.shortread_clipmerge ) { - - ch_input_for_fastp = reads - .dump(tag: "pre-fastp_branch") - .branch{ - single: it[0]['single_end'] == true - paired: it[0]['single_end'] == false - } - - ch_input_for_fastp.single.dump(tag: "input_fastp_single") - ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") - - FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) - FASTP_PAIRED ( ch_input_for_fastp.paired, false, true ) - - ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged - .mix( FASTP_SINGLE.out.reads ) - .map { - meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = 1 - [ meta_new, reads ] - } - - FASTQC_POST ( ch_fastp_reads_prepped ) - - ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) - ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) - - ch_processed_reads = ch_fastp_reads_prepped - - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} ) - ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) - ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) - - ch_multiqc_files.dump(tag: "preprocessing_mqc_final") - + if ( params.shortread_clipmerge_tool == "fastp" ) { + ch_processed_reads = SHORTREAD_FASTP ( reads ).reads + ch_versions = ch_versions.mix( SHORTREAD_FASTP.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc ) } else { ch_processed_reads = reads } + //FASTQC_PROCESSED ( ch_processed_reads ) + //ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + //ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) emit: + // TODO: problem, this is being exported as a multi-channel output? This is why FASTQC is broken reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] versions = ch_versions // channel: [ versions.yml ] mqc = ch_multiqc_files diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 29058e6..0a907bf 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -17,6 +17,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } +if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not except uncollapsed paired-reads. Pairs will be profiled as separate files." /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -135,6 +136,7 @@ workflow TAXPROFILER { CAT_FASTQ ( ch_processed_for_combine.combine ) + // TODO May need to flatten reads? ch_reads_for_profiling = ch_processed_for_combine.skip .dump(tag: "skip_combine") .mix( CAT_FASTQ.out.reads ) From ff1f28f4f0b8e88898e0f568f83e2da90ff81bc6 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 15:01:25 +0100 Subject: [PATCH 02/21] Update schema --- conf/modules.config | 4 ++-- nextflow_schema.json | 38 +++++++++++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 36bc626..1052bed 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -69,8 +69,8 @@ process { params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged", // trimming options params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "", - params.shortread_adapter1 ? "--adapter_sequence ${params.shortread_adapter1}" : "", - !{ ${meta.single_end} } && params.shortread_adapter2 ? "--adapter_sequence_r2 ${params.shortread_adapter2}" : !{ ${meta.single_end} } ? "--detect_adapter_for_pe" : "" + params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", + !{ ${meta.single_end} } && params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : !{ ${meta.single_end} } ? "--detect_adapter_for_pe" : "", // filtering options "--length_required ${params.shortread_clipmerge_minlength}" ].join(' ').trim() diff --git a/nextflow_schema.json b/nextflow_schema.json index 0fa217f..42307e9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -281,6 +291,28 @@ }, "run_kraken2": { "type": "boolean" + }, + "shortread_clipmerge_tool": { + "type": "string", + "default": "fastp" + }, + "shortread_clipmerge_skiptrim": { + "type": "boolean" + }, + "shortread_clipmerge_mergepairs": { + "type": "boolean" + }, + "shortread_clipmerge_adapter1": { + "type": "string", + "default": null + }, + "shortread_clipmerge_adapter2": { + "type": "string", + "default": null + }, + "shortread_clipmerge_minlength": { + "type": "integer", + "default": 15 } } -} +} \ No newline at end of file From dfcf8f7b1ae183a233d8a1c94f39a1f27084a966 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 15:10:52 +0100 Subject: [PATCH 03/21] Prettier --- conf/test.config | 1 - nextflow_schema.json | 16 +++------------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/conf/test.config b/conf/test.config index 5924d7a..92a10e4 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,7 +23,6 @@ params { // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - outdir = "./results" databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' run_kraken2 = true run_malt = true diff --git a/nextflow_schema.json b/nextflow_schema.json index 42307e9..545f990 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -315,4 +305,4 @@ "default": 15 } } -} \ No newline at end of file +} From b5f5c755046b3ae1af9648e879dfa48519385891 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 15:21:52 +0100 Subject: [PATCH 04/21] Fix file header descriptions --- subworkflows/local/longread_preprocessing.nf | 9 ++++++--- subworkflows/local/shortread_fastp.nf | 10 +++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index da1049a..58968e8 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -1,6 +1,9 @@ +/* +Process long raw reads with porechop +*/ -include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main' -include { PORECHOP } from '../../modules/nf-core/modules/porechop/main' +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main' +include { PORECHOP } from '../../modules/nf-core/modules/porechop/main' workflow LONGREAD_PREPROCESSING { take: @@ -23,7 +26,7 @@ workflow LONGREAD_PREPROCESSING { FASTQC_POST ( PORECHOP.out.reads ) ch_versions = ch_versions.mix(PORECHOP.out.versions.first()) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) emit: diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index 87aba25..f457cf3 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -1,7 +1,6 @@ -// -// Check input samplesheet and get read channels -// - +/* +Process short raw reads with FastP +*/ include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' @@ -17,9 +16,6 @@ workflow SHORTREAD_FASTP { // // STEP: Read clipping and merging // - // TODO give option to retain singletons (probably fastp option likely) - // TODO move to subworkflow - ch_input_for_fastp = reads .dump(tag: "pre-fastp_branch") .branch{ From 7ddcb09a85ed8c50f5d661b398f0e0a5e113add8 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 15:22:41 +0100 Subject: [PATCH 05/21] Some more cleanup --- .../local/shortread_adapterremoval.nf | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 subworkflows/local/shortread_adapterremoval.nf diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf new file mode 100644 index 0000000..e15d2ef --- /dev/null +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -0,0 +1,64 @@ +/* +Process raw reads with AdapterRemoval +*/ + +include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' +include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' + +workflow SHORTREAD_FASTP { + take: + reads // file: /path/to/samplesheet.csv + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + // + // STEP: Read clipping and merging + // + // TODO give option to retain singletons (probably fastp option likely) + // TODO move to subworkflow + + ch_input_for_fastp = reads + .dump(tag: "pre-fastp_branch") + .branch{ + single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + ch_input_for_fastp.single.dump(tag: "input_fastp_single") + ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") + + FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs ) + + if ( params.shortread_clipmerge_mergepairs ) { + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged + .mix( FASTP_SINGLE.out.reads ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + } else { + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads + .mix( FASTP_SINGLE.out.reads ) + } + + ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) + ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) + + ch_processed_reads = ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped") + + ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) + + ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final") + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + From 0e6f7e2ca137fbb8306b733ab3abc3b6d8ad83f7 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 15:24:10 +0100 Subject: [PATCH 06/21] Revert "Some more cleanup" This reverts commit 7ddcb09a85ed8c50f5d661b398f0e0a5e113add8. --- .../local/shortread_adapterremoval.nf | 64 ------------------- 1 file changed, 64 deletions(-) delete mode 100644 subworkflows/local/shortread_adapterremoval.nf diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf deleted file mode 100644 index e15d2ef..0000000 --- a/subworkflows/local/shortread_adapterremoval.nf +++ /dev/null @@ -1,64 +0,0 @@ -/* -Process raw reads with AdapterRemoval -*/ - -include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' -include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' - -workflow SHORTREAD_FASTP { - take: - reads // file: /path/to/samplesheet.csv - - main: - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() - - // - // STEP: Read clipping and merging - // - // TODO give option to retain singletons (probably fastp option likely) - // TODO move to subworkflow - - ch_input_for_fastp = reads - .dump(tag: "pre-fastp_branch") - .branch{ - single: it[0]['single_end'] == true - paired: it[0]['single_end'] == false - } - - ch_input_for_fastp.single.dump(tag: "input_fastp_single") - ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") - - FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) - FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs ) - - if ( params.shortread_clipmerge_mergepairs ) { - ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged - .mix( FASTP_SINGLE.out.reads ) - .map { - meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = 1 - [ meta_new, reads ] - } - } else { - ch_fastp_reads_prepped = FASTP_PAIRED.out.reads - .mix( FASTP_SINGLE.out.reads ) - } - - ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) - ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) - - ch_processed_reads = ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped") - - ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) - ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) - - ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final") - - emit: - reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] - versions = ch_versions // channel: [ versions.yml ] - mqc = ch_multiqc_files -} - From b763bfa2c0a7dabe4ad8c4216cf852a3938a49b6 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 15:26:57 +0100 Subject: [PATCH 07/21] More cleanup --- subworkflows/local/shortread_fastp.nf | 3 --- 1 file changed, 3 deletions(-) diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index f457cf3..57ce2a6 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -13,9 +13,6 @@ workflow SHORTREAD_FASTP { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() - // - // STEP: Read clipping and merging - // ch_input_for_fastp = reads .dump(tag: "pre-fastp_branch") .branch{ From ede362dbf91df9f5118f0122f8bfc09d888cb141 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 15:28:30 +0100 Subject: [PATCH 08/21] Remove duplicate config and sync longread fastqc --- conf/modules.config | 10 ---------- subworkflows/local/longread_preprocessing.nf | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 1052bed..7d50174 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -90,16 +90,6 @@ process { ] } - withName: FASTQC_POST { - ext.args = '--quiet' - ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } - publishDir = [ - path: { "${params.outdir}/fastqc/processed" }, - mode: 'copy', - pattern: '*.html' - ] - } - withName: CAT_FASTQ { publishDir = [ path: { "${params.outdir}/prepared_sequences" }, diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index 58968e8..7c7c24b 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -24,7 +24,7 @@ workflow LONGREAD_PREPROCESSING { [ meta_new, reads ] } - FASTQC_POST ( PORECHOP.out.reads ) + FASTQC_PROCESSED ( PORECHOP.out.reads ) ch_versions = ch_versions.mix(PORECHOP.out.versions.first()) ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) From 000f129dab1b139cec6e9194670addf8d9e6173a Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 25 Mar 2022 15:32:56 +0100 Subject: [PATCH 09/21] Apply suggestions from code review --- subworkflows/local/shortread_preprocessing.nf | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index c31289d..c82536f 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -29,12 +29,11 @@ workflow SHORTREAD_PREPROCESSING { ch_processed_reads = reads } - //FASTQC_PROCESSED ( ch_processed_reads ) - //ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) - //ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) + FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) emit: - // TODO: problem, this is being exported as a multi-channel output? This is why FASTQC is broken reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] versions = ch_versions // channel: [ versions.yml ] mqc = ch_multiqc_files From 120382f51d184bed0375ab8c642f6bdab77a617f Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 26 Mar 2022 06:45:16 +0100 Subject: [PATCH 10/21] Fix input channels to kraken2 --- workflows/taxprofiler.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 0a907bf..c1d7f34 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -37,11 +37,11 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { DB_CHECK } from '../subworkflows/local/db_check' +include { DB_CHECK } from '../subworkflows/local/db_check' include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' -include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' +include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -171,7 +171,7 @@ workflow TAXPROFILER { [ temp_meta, it[1], db ] } .groupTuple(by: [0,2]) - .dump(tag: "input for malt") + .dump(tag: "input_for_malt") .multiMap { it -> reads: [ it[0], it[1].flatten() ] @@ -180,10 +180,10 @@ workflow TAXPROFILER { // We can run Kraken2 one-by-one sample-wise ch_input_for_kraken2 = ch_input_for_profiling.kraken2 - .dump(tag: "input for kraken") + .dump(tag: "input_for_kraken") .multiMap { it -> - reads: [ it[0] + it[2], it[1] ] + reads: [ it[0] + it[2], it[1].flatten() ] db: it[3] } From 622fafedc88489026d4c8cd5eb62319e736d1375 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Sat, 26 Mar 2022 20:22:35 +0000 Subject: [PATCH 11/21] Prempt centrifuge database untar crash --- subworkflows/local/db_check.nf | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf index 890e373..96f815a 100644 --- a/subworkflows/local/db_check.nf +++ b/subworkflows/local/db_check.nf @@ -12,6 +12,9 @@ workflow DB_CHECK { main: // TODO: make database sheet check + // Checks: + // 1) no duplicates, + // 2) dbs with no special arguments does not have quotes, e.g. just `,,` and NOT `,"",` parsed_samplesheet = DATABASE_CHECK ( dbsheet ) .csv .splitCsv ( header:true, sep:',' ) @@ -21,7 +24,7 @@ workflow DB_CHECK { ch_dbs_for_untar = parsed_samplesheet .branch { - untar: it[1].toString().endsWith(".tar.gz") + untar: it[1].toString().endsWith(".tar.gz") && it[0]['tool'] != 'centrifuge' skip: true } From e6e8ed7cc9904973ce6236c98eb74f8246c3ec09 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Sat, 26 Mar 2022 20:54:50 +0000 Subject: [PATCH 12/21] Add some debugging notes --- conf/modules.config | 4 +-- workflows/taxprofiler.nf | 60 +++++++++++++++++++++------------------- 2 files changed, 34 insertions(+), 30 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 7d50174..5823d3f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -105,7 +105,7 @@ process { pattern: '*.{rma6,tab,text,sam,log}' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.db_name}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } withName: KRAKEN2_KRAKEN2 { @@ -115,7 +115,7 @@ process { pattern: '*.{fastq.gz,txt}' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.db_name}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } withName: CUSTOM_DUMPSOFTWAREVERSIONS { diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index c1d7f34..ee921ea 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -74,9 +74,9 @@ workflow TAXPROFILER { ch_versions = Channel.empty() - // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files - // + /* + SUBWORKFLOW: Read in samplesheet, validate and stage input files + */ INPUT_CHECK ( ch_input ) @@ -86,9 +86,9 @@ workflow TAXPROFILER { ch_databases ) - // - // MODULE: Run FastQC - // + /* + MODULE: Run FastQC + */ ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq") FASTQC ( ch_input_for_fastqc @@ -99,9 +99,9 @@ workflow TAXPROFILER { ch_versions.unique().collectFile(name: 'collated_versions.yml') ) - // - // PERFORM PREPROCESSING - // + /* + SUBWORKFLOW: PERFORM PREPROCESSING + */ if ( params.shortread_clipmerge ) { ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads } else { @@ -116,16 +116,19 @@ workflow TAXPROFILER { ch_longreads_preprocessed = INPUT_CHECK.out.nanopore } - // - // PERFORM SHORT READ RUN MERGING + /* + MODULE: PERFORM SHORT READ RUN MERGING + */ + // TODO: Check not necessary for long reads too? - // + // TODO: source of clash - combined should only occur when + // files ARE to be combined. SE/unmerged (see not below) ch_processed_for_combine = ch_shortreads_preprocessed .dump(tag: "prep_for_combine_grouping") .map { meta, reads -> def meta_new = meta.clone() - meta_new['run_accession'] = 'combined' + //meta_new['run_accession'] = 'combined' [ meta_new, reads ] } .groupTuple ( by: 0 ) @@ -134,17 +137,18 @@ workflow TAXPROFILER { skip: it[1].size() < 2 } + // NOTE: this does not allow CATing of SE & PE runs of same sample + // when --shortread_clipmerge_mergepairs is false CAT_FASTQ ( ch_processed_for_combine.combine ) - // TODO May need to flatten reads? ch_reads_for_profiling = ch_processed_for_combine.skip .dump(tag: "skip_combine") .mix( CAT_FASTQ.out.reads ) .dump(tag: "files_for_profiling") - // - // COMBINE READS WITH POSSIBLE DATABASES - // + /* + COMBINE READS WITH POSSIBLE DATABASES + */ // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] ch_input_for_profiling = ch_reads_for_profiling @@ -157,9 +161,9 @@ workflow TAXPROFILER { unknown: true } - // - // PREPARE PROFILER INPUT CHANNELS - // + /* + PREPARE PROFILER INPUT CHANNELS + */ // We groupTuple to have all samples in one channel for MALT as database // loading takes a long time, so we only want to run it once per database @@ -171,7 +175,7 @@ workflow TAXPROFILER { [ temp_meta, it[1], db ] } .groupTuple(by: [0,2]) - .dump(tag: "input_for_malt") + .dump(tag: "input_to_malt") .multiMap { it -> reads: [ it[0], it[1].flatten() ] @@ -180,16 +184,16 @@ workflow TAXPROFILER { // We can run Kraken2 one-by-one sample-wise ch_input_for_kraken2 = ch_input_for_profiling.kraken2 - .dump(tag: "input_for_kraken") + .dump(tag: "input_to_kraken") .multiMap { it -> reads: [ it[0] + it[2], it[1].flatten() ] db: it[3] } - // - // RUN PROFILING - // + /* + MODULE: RUN PROFILING + */ if ( params.run_malt ) { MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) } @@ -198,9 +202,9 @@ workflow TAXPROFILER { KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) } - // - // MODULE: MultiQC - // + /* + MODULE: MultiQC + */ workflow_summary = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) From 8dc9e583add80f51111dc6c796a2ea1413e8731b Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 27 Mar 2022 09:30:23 +0200 Subject: [PATCH 13/21] Debugging run merging --- conf/modules.config | 6 ++--- .../nf-core/modules/kraken2/kraken2/main.nf | 3 ++- subworkflows/local/shortread_fastp.nf | 10 ++++--- subworkflows/local/shortread_preprocessing.nf | 7 ----- workflows/taxprofiler.nf | 27 ++++++++++++++----- 5 files changed, 32 insertions(+), 21 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 5823d3f..41f471f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -65,7 +65,7 @@ process { withName: FASTP { ext.prefix = { "${meta.id}_${meta.run_accession}" } ext.args = [ - // collapsing options + // collapsing options - option to retain singletons params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged", // trimming options params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "", @@ -105,7 +105,7 @@ process { pattern: '*.{rma6,tab,text,sam,log}' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + ext.prefix = { "${meta.id}-${meta.db_name}" } } withName: KRAKEN2_KRAKEN2 { @@ -115,7 +115,7 @@ process { pattern: '*.{fastq.gz,txt}' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + ext.prefix = { "${meta.id}-${meta.db_name}" } } withName: CUSTOM_DUMPSOFTWAREVERSIONS { diff --git a/modules/nf-core/modules/kraken2/kraken2/main.nf b/modules/nf-core/modules/kraken2/kraken2/main.nf index 3ec5df5..52351f5 100644 --- a/modules/nf-core/modules/kraken2/kraken2/main.nf +++ b/modules/nf-core/modules/kraken2/kraken2/main.nf @@ -32,11 +32,12 @@ process KRAKEN2_KRAKEN2 { --threads $task.cpus \\ --unclassified-out $unclassified \\ --classified-out $classified \\ + $args \\ --report ${prefix}.kraken2.report.txt \\ --gzip-compressed \\ $paired \\ - $args \\ $reads + pigz -p $task.cpus *.fastq diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index 57ce2a6..d4d706e 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -7,7 +7,7 @@ include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fast workflow SHORTREAD_FASTP { take: - reads // file: /path/to/samplesheet.csv + reads // [[meta], [reads]] main: ch_versions = Channel.empty() @@ -24,16 +24,18 @@ workflow SHORTREAD_FASTP { ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + // Last parameter here turns on merging of PE data FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs ) if ( params.shortread_clipmerge_mergepairs ) { + // TODO update to replace meta suffix ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged .mix( FASTP_SINGLE.out.reads ) .map { meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = 1 - [ meta_new, reads ] + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] } } else { ch_fastp_reads_prepped = FASTP_PAIRED.out.reads diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index c82536f..7fba0c0 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -14,13 +14,6 @@ workflow SHORTREAD_PREPROCESSING { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() - // - // STEP: Read clipping and merging - // - // TODO give option to clip only and retain pairs - // TODO give option to retain singletons (probably fastp option likely) - // TODO move to subworkflow - if ( params.shortread_clipmerge_tool == "fastp" ) { ch_processed_reads = SHORTREAD_FASTP ( reads ).reads ch_versions = ch_versions.mix( SHORTREAD_FASTP.out.versions ) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index ee921ea..87f7a30 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -120,25 +120,40 @@ workflow TAXPROFILER { MODULE: PERFORM SHORT READ RUN MERGING */ - // TODO: Check not necessary for long reads too? - // TODO: source of clash - combined should only occur when - // files ARE to be combined. SE/unmerged (see not below) + // Remove run accession to allow grouping by sample. Will only merge + // if pairment type is the same. + + // TODO Current Branch system currently problematic - when single file not in a list, splits at + // `/` so makes list >= 2, so tries to merge, but then breaks kraken downstream + // e.g. `home jfellows Documents git nf-core taxprofiler testing work 68 9a2c8362add37832a776058d280bb7 2612_se.merged.fastq.gz` + // So theoretically need to force this into a list, (but results the can't access meta.id error as incorrect input format) + // But second issue >= 2 is MAYBE sufficient because what if merging two paired-end files? Need to chcek if the input channel formatted correctly for this? Need to check... ch_processed_for_combine = ch_shortreads_preprocessed .dump(tag: "prep_for_combine_grouping") .map { meta, reads -> def meta_new = meta.clone() - //meta_new['run_accession'] = 'combined' + + // remove run accession to allow group by sample + meta_new.remove('run_accession') + + // update id to prevent file name clashes when unable to group + // unmerged PE and SE runs of same sample + def type = meta_new['single_end'] ? "_se" : "_pe" + meta_new['id'] = meta['id'] + type + [ meta_new, reads ] } .groupTuple ( by: 0 ) + .dump(tag: "files_for_cat_fastq_branch") .branch{ - combine: it[1].size() >= 2 - skip: it[1].size() < 2 + combine: it[1] && it[1].size() > 1 + skip: true } // NOTE: this does not allow CATing of SE & PE runs of same sample // when --shortread_clipmerge_mergepairs is false + ch_processed_for_combine.combine.dump(tag: "input_into_cat_fastq") CAT_FASTQ ( ch_processed_for_combine.combine ) ch_reads_for_profiling = ch_processed_for_combine.skip From ee53283696261fa2687fa55402e3cfa3da164509 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 28 Mar 2022 09:13:05 +0200 Subject: [PATCH 14/21] Debugging comment --- workflows/taxprofiler.nf | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 87f7a30..8a05720 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -118,7 +118,6 @@ workflow TAXPROFILER { /* MODULE: PERFORM SHORT READ RUN MERGING - */ // Remove run accession to allow grouping by sample. Will only merge // if pairment type is the same. @@ -128,6 +127,7 @@ workflow TAXPROFILER { // e.g. `home jfellows Documents git nf-core taxprofiler testing work 68 9a2c8362add37832a776058d280bb7 2612_se.merged.fastq.gz` // So theoretically need to force this into a list, (but results the can't access meta.id error as incorrect input format) // But second issue >= 2 is MAYBE sufficient because what if merging two paired-end files? Need to chcek if the input channel formatted correctly for this? Need to check... + ch_processed_for_combine = ch_shortreads_preprocessed .dump(tag: "prep_for_combine_grouping") .map { @@ -160,6 +160,9 @@ workflow TAXPROFILER { .dump(tag: "skip_combine") .mix( CAT_FASTQ.out.reads ) .dump(tag: "files_for_profiling") + */ + + ch_reads_for_profiling = ch_shortreads_preprocessed /* COMBINE READS WITH POSSIBLE DATABASES @@ -198,6 +201,7 @@ workflow TAXPROFILER { } // We can run Kraken2 one-by-one sample-wise + // TODO Only flatten when paired-end! Causing issue commented out above! ch_input_for_kraken2 = ch_input_for_profiling.kraken2 .dump(tag: "input_to_kraken") .multiMap { From 231253227c7e59c1c108cf4cf4953cb89c4f6f5f Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 28 Mar 2022 16:38:10 +0200 Subject: [PATCH 15/21] Remove run merging for now due to complexity --- conf/modules.config | 4 +- subworkflows/local/shortread_fastp.nf | 17 +++++---- workflows/taxprofiler.nf | 55 ++------------------------- 3 files changed, 14 insertions(+), 62 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 41f471f..a9bc3a1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -105,7 +105,7 @@ process { pattern: '*.{rma6,tab,text,sam,log}' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.db_name}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } withName: KRAKEN2_KRAKEN2 { @@ -115,7 +115,7 @@ process { pattern: '*.{fastq.gz,txt}' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.db_name}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } withName: CUSTOM_DUMPSOFTWAREVERSIONS { diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index d4d706e..c2e435c 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -28,15 +28,16 @@ workflow SHORTREAD_FASTP { FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs ) if ( params.shortread_clipmerge_mergepairs ) { - // TODO update to replace meta suffix - ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged - .mix( FASTP_SINGLE.out.reads ) - .map { - meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = 1 - [ meta_new, reads ] + ch_fastp_reads_prepped_pe = FASTP_PAIRED.out.reads_merged + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] } + + ch_fastp_reads_prepped = ch_fastp_reads_prepped_pe.mix( FASTP_SINGLE.out.reads ) + } else { ch_fastp_reads_prepped = FASTP_PAIRED.out.reads .mix( FASTP_SINGLE.out.reads ) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 8a05720..e73d94d 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -116,63 +116,15 @@ workflow TAXPROFILER { ch_longreads_preprocessed = INPUT_CHECK.out.nanopore } - /* - MODULE: PERFORM SHORT READ RUN MERGING - - // Remove run accession to allow grouping by sample. Will only merge - // if pairment type is the same. - - // TODO Current Branch system currently problematic - when single file not in a list, splits at - // `/` so makes list >= 2, so tries to merge, but then breaks kraken downstream - // e.g. `home jfellows Documents git nf-core taxprofiler testing work 68 9a2c8362add37832a776058d280bb7 2612_se.merged.fastq.gz` - // So theoretically need to force this into a list, (but results the can't access meta.id error as incorrect input format) - // But second issue >= 2 is MAYBE sufficient because what if merging two paired-end files? Need to chcek if the input channel formatted correctly for this? Need to check... - - ch_processed_for_combine = ch_shortreads_preprocessed - .dump(tag: "prep_for_combine_grouping") - .map { - meta, reads -> - def meta_new = meta.clone() - - // remove run accession to allow group by sample - meta_new.remove('run_accession') - - // update id to prevent file name clashes when unable to group - // unmerged PE and SE runs of same sample - def type = meta_new['single_end'] ? "_se" : "_pe" - meta_new['id'] = meta['id'] + type - - [ meta_new, reads ] - } - .groupTuple ( by: 0 ) - .dump(tag: "files_for_cat_fastq_branch") - .branch{ - combine: it[1] && it[1].size() > 1 - skip: true - } - - // NOTE: this does not allow CATing of SE & PE runs of same sample - // when --shortread_clipmerge_mergepairs is false - ch_processed_for_combine.combine.dump(tag: "input_into_cat_fastq") - CAT_FASTQ ( ch_processed_for_combine.combine ) - - ch_reads_for_profiling = ch_processed_for_combine.skip - .dump(tag: "skip_combine") - .mix( CAT_FASTQ.out.reads ) - .dump(tag: "files_for_profiling") - */ - - ch_reads_for_profiling = ch_shortreads_preprocessed - /* COMBINE READS WITH POSSIBLE DATABASES */ // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] - ch_input_for_profiling = ch_reads_for_profiling + ch_input_for_profiling = ch_shortreads_preprocessed .mix( ch_longreads_preprocessed ) .combine(DB_CHECK.out.dbs) - .dump(tag: "reads_plus_db") + .dump(tag: "reads_plus_db_clean") .branch { malt: it[2]['tool'] == 'malt' kraken2: it[2]['tool'] == 'kraken2' @@ -201,12 +153,11 @@ workflow TAXPROFILER { } // We can run Kraken2 one-by-one sample-wise - // TODO Only flatten when paired-end! Causing issue commented out above! ch_input_for_kraken2 = ch_input_for_profiling.kraken2 .dump(tag: "input_to_kraken") .multiMap { it -> - reads: [ it[0] + it[2], it[1].flatten() ] + reads: [ it[0] + it[2], it[1] ] db: it[3] } From 98dc8014a526209b24bdd55e321c10b2dc007a88 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 28 Mar 2022 16:46:45 +0200 Subject: [PATCH 16/21] Fix kraken2 module --- modules.json | 8 ++++---- modules/nf-core/modules/kraken2/kraken2/main.nf | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/modules.json b/modules.json index 9e6428a..2494fc1 100644 --- a/modules.json +++ b/modules.json @@ -24,12 +24,12 @@ "multiqc": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, - "untar": { - "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" - }, "porechop": { "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046" + }, + "untar": { + "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/kraken2/kraken2/main.nf b/modules/nf-core/modules/kraken2/kraken2/main.nf index 52351f5..3ec5df5 100644 --- a/modules/nf-core/modules/kraken2/kraken2/main.nf +++ b/modules/nf-core/modules/kraken2/kraken2/main.nf @@ -32,12 +32,11 @@ process KRAKEN2_KRAKEN2 { --threads $task.cpus \\ --unclassified-out $unclassified \\ --classified-out $classified \\ - $args \\ --report ${prefix}.kraken2.report.txt \\ --gzip-compressed \\ $paired \\ + $args \\ $reads - pigz -p $task.cpus *.fastq From eada201eb28f3572af5481a6d69e4f87407a352d Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 28 Mar 2022 16:47:37 +0200 Subject: [PATCH 17/21] Prettier --- modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 2494fc1..34ef9f1 100644 --- a/modules.json +++ b/modules.json @@ -32,4 +32,4 @@ } } } -} \ No newline at end of file +} From 94e5cfef4aa7c2dd69e72e7b1b12439110ed1a09 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 28 Mar 2022 18:20:10 +0200 Subject: [PATCH 18/21] Filter long reads for MALT, bump cpus for FastQC for minigut to pass --- conf/base.config | 2 +- conf/test.config | 15 ++++++++++++++- subworkflows/local/db_check.nf | 2 +- workflows/taxprofiler.nf | 6 ++++++ 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index 0c65574..2f0f97f 100644 --- a/conf/base.config +++ b/conf/base.config @@ -27,7 +27,7 @@ process { // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_low { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } + cpus = { check_max( 4 * task.attempt, 'cpus' ) } memory = { check_max( 12.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } } diff --git a/conf/test.config b/conf/test.config index 92a10e4..26972df 100644 --- a/conf/test.config +++ b/conf/test.config @@ -15,7 +15,7 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions - max_cpus = 2 + max_cpus = 8 max_memory = '6.GB' max_time = '6.h' @@ -29,3 +29,16 @@ params { shortread_clipmerge = true } + +process { + withName: FASTQC { + cpus = { check_max( 8 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 6.h * task.attempt, 'time' ) } + } + withName: FASTQC_PROCESSED { + cpus = { check_max( 8 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 6.h * task.attempt, 'time' ) } + } +} diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf index 96f815a..bda5cfe 100644 --- a/subworkflows/local/db_check.nf +++ b/subworkflows/local/db_check.nf @@ -14,7 +14,7 @@ workflow DB_CHECK { // TODO: make database sheet check // Checks: // 1) no duplicates, - // 2) dbs with no special arguments does not have quotes, e.g. just `,,` and NOT `,"",` + // 2) args do not have quotes, e.g. just `,,` and NOT `,"",` parsed_samplesheet = DATABASE_CHECK ( dbsheet ) .csv .splitCsv ( header:true, sep:',' ) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index e73d94d..af5c54d 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -90,9 +90,11 @@ workflow TAXPROFILER { MODULE: Run FastQC */ ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq") + FASTQC ( ch_input_for_fastqc ) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) CUSTOM_DUMPSOFTWAREVERSIONS ( @@ -137,7 +139,11 @@ workflow TAXPROFILER { // We groupTuple to have all samples in one channel for MALT as database // loading takes a long time, so we only want to run it once per database + // TODO document somewhere we only accept illumina short reads for MALT? ch_input_for_malt = ch_input_for_profiling.malt + .dump(tag: "input_to_malt_prefilter") + .filter { it[0]['instrument_platform'] == 'ILLUMINA' } + .dump(tag: "input_to_malt_postfilter") .map { it -> def temp_meta = [ id: it[2]['db_name']] + it[2] From 04fb6f412752811ac8fc621c8730d418986a1d08 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 28 Mar 2022 20:38:16 +0200 Subject: [PATCH 19/21] Use new resources for test data --- conf/test.config | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/conf/test.config b/conf/test.config index 26972df..92a10e4 100644 --- a/conf/test.config +++ b/conf/test.config @@ -15,7 +15,7 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions - max_cpus = 8 + max_cpus = 2 max_memory = '6.GB' max_time = '6.h' @@ -29,16 +29,3 @@ params { shortread_clipmerge = true } - -process { - withName: FASTQC { - cpus = { check_max( 8 * task.attempt, 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 6.h * task.attempt, 'time' ) } - } - withName: FASTQC_PROCESSED { - cpus = { check_max( 8 * task.attempt, 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 6.h * task.attempt, 'time' ) } - } -} From 98a9688329e66814764746a9219580e841e84477 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 29 Mar 2022 15:18:06 +0200 Subject: [PATCH 20/21] Change CPUs back to template, update skiptrim fiag --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 2f0f97f..0c65574 100644 --- a/conf/base.config +++ b/conf/base.config @@ -27,7 +27,7 @@ process { // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_low { - cpus = { check_max( 4 * task.attempt, 'cpus' ) } + cpus = { check_max( 2 * task.attempt, 'cpus' ) } memory = { check_max( 12.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } } From 76e9624e3f04e573a56c1443884b2ca385488319 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 30 Mar 2022 11:31:37 +0200 Subject: [PATCH 21/21] NOW commit the adapte rtrim flag --- conf/modules.config | 2 +- nextflow.config | 2 +- nextflow_schema.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a9bc3a1..71eaa75 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -68,7 +68,7 @@ process { // collapsing options - option to retain singletons params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged", // trimming options - params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "", + params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "", params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", !{ ${meta.single_end} } && params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : !{ ${meta.single_end} } ? "--detect_adapter_for_pe" : "", // filtering options diff --git a/nextflow.config b/nextflow.config index a312d0c..6fde513 100644 --- a/nextflow.config +++ b/nextflow.config @@ -57,7 +57,7 @@ params { // FASTQ preprocessing shortread_clipmerge = false shortread_clipmerge_tool = 'fastp' - shortread_clipmerge_skiptrim = false + shortread_clipmerge_skipadaptertrim = false shortread_clipmerge_mergepairs = false shortread_clipmerge_excludeunmerged = true shortread_clipmerge_adapter1 = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 545f990..adc5a73 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -286,7 +286,7 @@ "type": "string", "default": "fastp" }, - "shortread_clipmerge_skiptrim": { + "shortread_clipmerge_skipadaptertrim": { "type": "boolean" }, "shortread_clipmerge_mergepairs": {