From b055df5ea076e2d82071eba9e3e692ce9d43e976 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 2 Apr 2022 17:02:05 +0200 Subject: [PATCH 01/21] Add bbduk complexity (entropy-based) filtering --- CITATIONS.md | 2 + conf/modules.config | 15 +++-- modules.json | 3 + modules/nf-core/modules/bbmap/bbduk/main.nf | 43 ++++++++++++ modules/nf-core/modules/bbmap/bbduk/meta.yml | 52 +++++++++++++++ nextflow.config | 7 ++ nextflow_schema.json | 21 +++++- subworkflows/local/longread_preprocessing.nf | 8 +-- .../local/shortread_adapterremoval.nf | 6 +- .../local/shortread_complexityfiltering.nf | 28 ++++++++ subworkflows/local/shortread_fastp.nf | 10 +-- subworkflows/local/shortread_preprocessing.nf | 6 +- workflows/taxprofiler.nf | 66 ++++++++++++------- 13 files changed, 222 insertions(+), 45 deletions(-) create mode 100644 modules/nf-core/modules/bbmap/bbduk/main.nf create mode 100644 modules/nf-core/modules/bbmap/bbduk/meta.yml create mode 100644 subworkflows/local/shortread_complexityfiltering.nf diff --git a/CITATIONS.md b/CITATIONS.md index 8f286b0..0eb8141 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -26,6 +26,8 @@ - [Porechop](https://github.com/rrwick/Porechop) +- [BBTools](http://sourceforge.net/projects/bbmap/) + - [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. “Improved Metagenomic Analysis with Kraken 2.” Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0. diff --git a/conf/modules.config b/conf/modules.config index dc8b138..601f915 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -132,7 +132,6 @@ process { ] } - withName: PORECHOP { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ @@ -142,11 +141,17 @@ process { ] } - withName: CAT_FASTQ { + withName: BBMAP_BBDUK { + ext.args = [ + "entropy=${params.shortread_complexityfilter_bbduk_entropy}", + "entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}", + params.shortread_complexityfilter_bbduk_mask ? "entropymask=t" : "entropymask=f" + ].join(' ').trim() + ext.prefix = { "${meta.id}-${meta.run_accession}" } publishDir = [ - path: { "${params.outdir}/prepared_sequences" }, - mode: 'copy', - pattern: '*.fastq.gz' + path: { "${params.outdir}/bbduk/" }, + mode: params.publish_dir_mode, + pattern: '*.{fastq.gz,log}' ] } diff --git a/modules.json b/modules.json index dcfbd3f..64e6c3c 100644 --- a/modules.json +++ b/modules.json @@ -6,6 +6,9 @@ "adapterremoval": { "git_sha": "f0800157544a82ae222931764483331a81812012" }, + "bbmap/bbduk": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + }, "cat/fastq": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, diff --git a/modules/nf-core/modules/bbmap/bbduk/main.nf b/modules/nf-core/modules/bbmap/bbduk/main.nf new file mode 100644 index 0000000..0ae005e --- /dev/null +++ b/modules/nf-core/modules/bbmap/bbduk/main.nf @@ -0,0 +1,43 @@ +process BBMAP_BBDUK { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? "bioconda::bbmap=38.90" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bbmap:38.90--he522d1c_1' : + 'quay.io/biocontainers/bbmap:38.90--he522d1c_1' }" + + input: + tuple val(meta), path(reads) + path contaminants + + output: + tuple val(meta), path('*.fastq.gz'), emit: reads + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def raw = meta.single_end ? "in=${reads[0]}" : "in1=${reads[0]} in2=${reads[1]}" + def trimmed = meta.single_end ? "out=${prefix}.fastq.gz" : "out1=${prefix}_1.fastq.gz out2=${prefix}_2.fastq.gz" + def contaminants_fa = contaminants ? "ref=$contaminants" : '' + """ + maxmem=\$(echo \"$task.memory\"| sed 's/ GB/g/g') + bbduk.sh \\ + -Xmx\$maxmem \\ + $raw \\ + $trimmed \\ + threads=$task.cpus \\ + $args \\ + $contaminants_fa \\ + &> ${prefix}.bbduk.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/bbmap/bbduk/meta.yml b/modules/nf-core/modules/bbmap/bbduk/meta.yml new file mode 100644 index 0000000..6abd3d9 --- /dev/null +++ b/modules/nf-core/modules/bbmap/bbduk/meta.yml @@ -0,0 +1,52 @@ +name: bbmap_bbduk +description: Adapter and quality trimming of sequencing reads +keywords: + - trimming + - adapter trimming + - quality trimming +tools: + - bbmap: + description: BBMap is a short read aligner, as well as various other bioinformatic tools. + homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + tool_dev_url: None + doi: "" + licence: ["UC-LBL license (see package)"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - contaminants: + type: file + description: | + Reference files containing adapter and/or contaminant sequences for sequence kmer matching + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified fastq reads + pattern: "*fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - log: + type: file + description: Bbduk log file + pattern: "*bbduk.log" + +authors: + - "@MGordon09" diff --git a/nextflow.config b/nextflow.config index 7be36a6..d57743a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -65,6 +65,13 @@ params { shortread_clipmerge_minlength = 15 longread_clip = false + // Complexity filtering + shortread_complexityfilter = false + shortread_complexityfilter_tool = 'bbduk' + shortread_complexityfilter_bbduk_entropy = 0.3 + shortread_complexityfilter_bbduk_windowsize = 50 + shortread_complexityfilter_bbduk_mask = false + // MALT run_malt = false malt_mode = 'BlastN' diff --git a/nextflow_schema.json b/nextflow_schema.json index fb2ca31..bd8b438 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -266,8 +266,7 @@ "type": "boolean" }, "shortread_clipmerge_excludeunmerged": { - "type": "boolean", - "default": false + "type": "boolean" }, "longread_clip": { "type": "boolean" @@ -304,6 +303,24 @@ "shortread_clipmerge_minlength": { "type": "integer", "default": 15 + }, + "shortread_complexityfilter_tool": { + "type": "string", + "default": "bbduk" + }, + "shortread_complexityfilter_bbduk_entropy": { + "type": "number", + "default": 0.3 + }, + "shortread_complexityfilter_bbduk_windowsize": { + "type": "integer", + "default": 50 + }, + "shortread_complexityfilter_bbduk_mask": { + "type": "boolean" + }, + "shortread_complexityfilter": { + "type": "boolean" } } } diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index a1515c7..2fa5f3b 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -1,6 +1,6 @@ -/* -Process long raw reads with porechop -*/ +// +// Process long raw reads with porechop +// include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main' include { PORECHOP } from '../../modules/nf-core/modules/porechop/main' @@ -25,7 +25,7 @@ workflow LONGREAD_PREPROCESSING { FASTQC_PROCESSED ( PORECHOP.out.reads ) ch_versions = ch_versions.mix(PORECHOP.out.versions.first()) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) emit: diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index 5e005db..b173e4c 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -1,6 +1,6 @@ -/* -Process short raw reads with AdapterRemoval -*/ +// +// Process short raw reads with AdapterRemoval +// include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE } from '../../modules/nf-core/modules/adapterremoval/main' include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED } from '../../modules/nf-core/modules/adapterremoval/main' diff --git a/subworkflows/local/shortread_complexityfiltering.nf b/subworkflows/local/shortread_complexityfiltering.nf new file mode 100644 index 0000000..3d69ed6 --- /dev/null +++ b/subworkflows/local/shortread_complexityfiltering.nf @@ -0,0 +1,28 @@ +// +// Check input samplesheet and get read channels +// + +include { BBMAP_BBDUK } from '../../modules/nf-core/modules/bbmap/bbduk/main' + +workflow SHORTREAD_COMPLEXITYFILTERING { + take: + reads // [ [ meta ], [ reads ] ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( params.shortread_complexityfilter_tool == 'bbduk' ) { + ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads + ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( BBMAP_BBDUK.out.log ) + } else { + ch_filtered_reads = reads + } + + emit: + reads = ch_filtered_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index 48817db..18baf17 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -1,6 +1,6 @@ -/* -Process short raw reads with FastP -*/ +// +// Process short raw reads with FastP +// include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' @@ -44,8 +44,8 @@ workflow SHORTREAD_FASTP { ch_processed_reads = ch_fastp_reads_prepped - ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) - ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json ) emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index 1d0caac..b0ac25e 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -1,5 +1,5 @@ // -// Check input samplesheet and get read channels +// Perform read trimming and merging // @@ -9,7 +9,7 @@ include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules workflow SHORTREAD_PREPROCESSING { take: - reads // file: /path/to/samplesheet.csv + reads // [ [ meta ], [ reads ] ] main: ch_versions = Channel.empty() @@ -29,7 +29,7 @@ workflow SHORTREAD_PREPROCESSING { FASTQC_PROCESSED ( ch_processed_reads ) ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index ce89a91..a403589 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -40,9 +40,10 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi // include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { DB_CHECK } from '../subworkflows/local/db_check' -include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' -include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' +include { DB_CHECK } from '../subworkflows/local/db_check' +include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' +include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' +include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -61,7 +62,6 @@ include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fas include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -98,10 +98,6 @@ workflow TAXPROFILER { ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - CUSTOM_DUMPSOFTWAREVERSIONS ( - ch_versions.unique().collectFile(name: 'collated_versions.yml') - ) - /* SUBWORKFLOW: PERFORM PREPROCESSING */ @@ -114,17 +110,26 @@ workflow TAXPROFILER { if ( params.longread_clip ) { ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads .map { it -> [ it[0], [it[1]] ] } - ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first()) } else { ch_longreads_preprocessed = INPUT_CHECK.out.nanopore } + /* + SUBWORKFLOW: COMPLEXITY FILTERING + */ + + if ( params.shortread_complexityfilter ) { + ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads + } else { + ch_shortreads_filtered = ch_shortreads_preprocessed + } + /* COMBINE READS WITH POSSIBLE DATABASES */ // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] - ch_input_for_profiling = ch_shortreads_preprocessed + ch_input_for_profiling = ch_shortreads_filtered .mix( ch_longreads_preprocessed ) .combine(DB_CHECK.out.dbs) .branch { @@ -177,6 +182,12 @@ workflow TAXPROFILER { /* MODULE: MultiQC */ + + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) + + workflow_summary = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) @@ -188,21 +199,30 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) if (params.shortread_clipmerge) { - ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_PREPROCESSING.out.mqc) - } - if (params.longread_clip) { - ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc) - } - if (params.run_kraken2) { - ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])) - ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first()) - } - if (params.run_malt) { - ch_multiqc_files = ch_multiqc_files.mix(MALT_RUN.out.log.collect{it[1]}.ifEmpty([])) - ch_versions = ch_versions.mix(MALT_RUN.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_shortclipmerge") + ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) + } + + if (params.longread_clip) { + ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_longclipmerge") + ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions ) + } + + if (params.shortread_complexityfilter){ + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_compelxity") + ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) + } + + if (params.run_kraken2) { + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_kraken") + ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + } + + if (params.run_malt) { + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_malt") + ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) } - // TODO MALT results overwriting per database? // TODO Versions for Karken/MALT not report? MULTIQC ( ch_multiqc_files.collect() From 7f1a7fb4e7be661e3a01ab0ffb0a61fe5de314ae Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 2 Apr 2022 17:05:54 +0200 Subject: [PATCH 02/21] Add complexity filtering to test profile --- conf/test.config | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/conf/test.config b/conf/test.config index 92a10e4..bce5918 100644 --- a/conf/test.config +++ b/conf/test.config @@ -24,8 +24,9 @@ params { // TODO nf-core: Give any required params for the test so that command line flags are not needed input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' - run_kraken2 = true - run_malt = true - shortread_clipmerge = true + run_kraken2 = true + run_malt = true + shortread_clipmerge = true + shortread_complexityfiltering = true } From 469e4f36825d86bf83a341d8c95bb4768f682bfd Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 2 Apr 2022 17:07:30 +0200 Subject: [PATCH 03/21] Remove remaining debugging .dumps( --- workflows/taxprofiler.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index a403589..1c86edb 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -90,7 +90,7 @@ workflow TAXPROFILER { /* MODULE: Run FastQC */ - ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq") + ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ) FASTQC ( ch_input_for_fastqc @@ -199,27 +199,27 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) if (params.shortread_clipmerge) { - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_shortclipmerge") + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) } if (params.longread_clip) { - ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_longclipmerge") + ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions ) } if (params.shortread_complexityfilter){ - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_compelxity") + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } if (params.run_kraken2) { - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_kraken") + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) } if (params.run_malt) { - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_malt") + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) } From 02d950c58c84f4e6c571cdb055154016fc78a22d Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 2 Apr 2022 17:18:20 +0200 Subject: [PATCH 04/21] Fix test so complexity filter actually executes --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index bce5918..3e7530c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -27,6 +27,6 @@ params { run_kraken2 = true run_malt = true shortread_clipmerge = true - shortread_complexityfiltering = true + shortread_complexityfilter = true } From d3572e18787eee40ed4e856bd6caf2f573246094 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 3 Apr 2022 07:28:01 +0200 Subject: [PATCH 05/21] Final MQC fix for AR2 (remove too-early collect) --- subworkflows/local/shortread_adapterremoval.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index b173e4c..bbb108e 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -77,7 +77,7 @@ workflow SHORTREAD_ADAPTERREMOVAL { ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() ) ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() ) - ch_multiqc_files = ch_multiqc_files.mix( ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]}, ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( ADAPTERREMOVAL_PAIRED.out.log, ADAPTERREMOVAL_SINGLE.out.log ) emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] From a384162810cb1fe1d800d7c3f2053f01c25083da Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 4 Apr 2022 21:16:51 +0200 Subject: [PATCH 06/21] Add prinseq as alternative complexity filtering --- CITATIONS.md | 12 ++-- conf/modules.config | 15 ++++- modules.json | 5 +- .../nf-core/modules/prinseqplusplus/main.nf | 61 +++++++++++++++++++ .../nf-core/modules/prinseqplusplus/meta.yml | 60 ++++++++++++++++++ nextflow.config | 13 ++-- nextflow_schema.json | 37 ++++++++--- .../local/shortread_complexityfiltering.nf | 6 +- 8 files changed, 189 insertions(+), 20 deletions(-) create mode 100644 modules/nf-core/modules/prinseqplusplus/main.nf create mode 100644 modules/nf-core/modules/prinseqplusplus/meta.yml diff --git a/CITATIONS.md b/CITATIONS.md index 0eb8141..ec424b1 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -18,23 +18,27 @@ - [fastp](https://doi.org/10.1093/bioinformatics/bty560) - > Chen, Shifu, Yanqing Zhou, Yaru Chen, and Jia Gu. 2018. “Fastp: An Ultra-Fast All-in-One FASTQ Preprocessor.” Bioinformatics 34 (17): i884-90. 10.1093/bioinformatics/bty560. + > Chen, Shifu, Yanqing Zhou, Yaru Chen, and Jia Gu. 2018. Fastp: An Ultra-Fast All-in-One FASTQ Preprocessor. Bioinformatics 34 (17): i884-90. 10.1093/bioinformatics/bty560. - [AdapterRemoval2](https://doi.org/10.1186/s13104-016-1900-2) - > Schubert, Mikkel, Stinus Lindgreen, and Ludovic Orlando. 2016. “AdapterRemoval v2: Rapid Adapter Trimming, Identification, and Read Merging.” BMC Research Notes 9 (February): 88. doi:10.1186/s13104-016-1900-2. + > Schubert, Mikkel, Stinus Lindgreen, and Ludovic Orlando. 2016. AdapterRemoval v2: Rapid Adapter Trimming, Identification, and Read Merging. BMC Research Notes 9 (February): 88. doi:10.1186/s13104-016-1900-2. - [Porechop](https://github.com/rrwick/Porechop) - [BBTools](http://sourceforge.net/projects/bbmap/) +- [PRINSEQ++](https://doi.org/10.7287/peerj.preprints.27553v1) + + > Cantu, Vito Adrian, Jeffrey Sadural, and Robert Edwards. 2019. PRINSEQ++, a Multi-Threaded Tool for Fast and Efficient Quality Control and Preprocessing of Sequencing Datasets. e27553v1. PeerJ Preprints. doi: 10.7287/peerj.preprints.27553v1. + - [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) - > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. “Improved Metagenomic Analysis with Kraken 2.” Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0. + > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0. - [MALT](https://doi.org/10.1038/s41559-017-0446-6) - > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. “Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico.” Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6. + > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico. Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6. ## Software packaging/containerisation tools diff --git a/conf/modules.config b/conf/modules.config index 601f915..9d2edc3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -143,7 +143,7 @@ process { withName: BBMAP_BBDUK { ext.args = [ - "entropy=${params.shortread_complexityfilter_bbduk_entropy}", + "entropy=${params.shortread_complexityfilter_entropy}", "entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}", params.shortread_complexityfilter_bbduk_mask ? "entropymask=t" : "entropymask=f" ].join(' ').trim() @@ -155,6 +155,19 @@ process { ] } + withName: PRINSEQPLUSPLUS { + ext.args = [ + params.shortread_complexityfilter_prinseqplusplus_mode == 'dust' ? "-lc_dust=${params.shortread_complexityfilter_prinseqplusplus_dustscore}" : "-lc_entropy=${params.shortread_complexityfilter_entropy}", + "-trim_qual_left=0 -trim_qual_left=0 -trim_qual_window=0 -trim_qual_step=0" + ].join(' ').trim() + ext.prefix = { "${meta.id}-${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/prinseqplusplus/" }, + mode: params.publish_dir_mode, + pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz,log}' + ] + } + withName: MALT_RUN { ext.args = { "${meta.db_params}" } ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } diff --git a/modules.json b/modules.json index 64e6c3c..355dc9e 100644 --- a/modules.json +++ b/modules.json @@ -33,9 +33,12 @@ "porechop": { "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046" }, + "prinseqplusplus": { + "git_sha": "f1c5384c31e985591716afdd732cf8c2ae29d05b" + }, "untar": { "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/prinseqplusplus/main.nf b/modules/nf-core/modules/prinseqplusplus/main.nf new file mode 100644 index 0000000..ebd8c58 --- /dev/null +++ b/modules/nf-core/modules/prinseqplusplus/main.nf @@ -0,0 +1,61 @@ +process PRINSEQPLUSPLUS { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::prinseq-plus-plus=1.2.3" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/prinseq-plus-plus:1.2.3--hc90279e_1': + 'quay.io/biocontainers/prinseq-plus-plus:1.2.3--hc90279e_1' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*_good_out*.fastq.gz") , emit: good_reads + tuple val(meta), path("*_single_out*.fastq.gz"), optional: true, emit: single_reads + tuple val(meta), path("*_bad_out*.fastq.gz") , optional: true, emit: bad_reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (meta.single_end) { + """ + prinseq++ \\ + -threads $task.cpus \\ + -fastq ${reads} \\ + -out_name ${prefix} \\ + -out_gz \\ + -VERBOSE 1 \\ + $args \\ + | tee ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prinseqplusplus: \$(echo \$(prinseq++ --version | cut -f 2 -d ' ' )) + END_VERSIONS + """ + } else { + """ + prinseq++ \\ + -threads $task.cpus \\ + -fastq ${reads[0]} \\ + -fastq2 ${reads[1]} \\ + -out_name ${prefix} \\ + -out_gz \\ + -VERBOSE 1 \\ + $args \\ + | tee ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prinseqplusplus: \$(echo \$(prinseq++ --version | cut -f 2 -d ' ' )) + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/modules/prinseqplusplus/meta.yml b/modules/nf-core/modules/prinseqplusplus/meta.yml new file mode 100644 index 0000000..8155df9 --- /dev/null +++ b/modules/nf-core/modules/prinseqplusplus/meta.yml @@ -0,0 +1,60 @@ +name: "prinseqplusplus" +description: PRINSEQ++ is a C++ implementation of the prinseq-lite.pl program. It can be used to filter, reformat or trim genomic and metagenomic sequence data +keywords: + - fastq + - fasta + - filter + - trim +tools: + - "prinseqplusplus": + description: "PRINSEQ++ - Multi-threaded C++ sequence cleaning" + homepage: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus" + documentation: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus" + tool_dev_url: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus" + doi: "10.7287/peerj.preprints.27553v1" + licence: "['GPL v2']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end + data, respectively. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - good_reads: + type: file + description: Reads passing filter(s) in gzipped FASTQ format + pattern: "*_good_out_{R1,R2}.fastq.gz" + - single_reads: + type: file + description: | + Single reads without the pair passing filter(s) in gzipped FASTQ format + pattern: "*_single_out_{R1,R2}.fastq.gz" + - bad_reads: + type: file + description: | + Reads without not passing filter(s) in gzipped FASTQ format + pattern: "*_bad_out_{R1,R2}.fastq.gz" + - log: + type: file + description: | + Verbose level 2 STDOUT information in a log file + pattern: "*.log" + +authors: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index d57743a..03dfbb9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -66,11 +66,14 @@ params { longread_clip = false // Complexity filtering - shortread_complexityfilter = false - shortread_complexityfilter_tool = 'bbduk' - shortread_complexityfilter_bbduk_entropy = 0.3 - shortread_complexityfilter_bbduk_windowsize = 50 - shortread_complexityfilter_bbduk_mask = false + shortread_complexityfilter = false + shortread_complexityfilter_tool = 'bbduk' + shortread_complexityfilter_entropy = 0.3 + shortread_complexityfilter_bbduk_windowsize = 50 + shortread_complexityfilter_bbduk_mask = false + shortread_complexityfilter_prinseqplusplus_mode = 'entropy' + shortread_complexityfilter_prinseqplusplus_dustscore = 0.5 + // MALT run_malt = false diff --git a/nextflow_schema.json b/nextflow_schema.json index bd8b438..6dee045 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -284,7 +294,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -308,10 +321,6 @@ "type": "string", "default": "bbduk" }, - "shortread_complexityfilter_bbduk_entropy": { - "type": "number", - "default": 0.3 - }, "shortread_complexityfilter_bbduk_windowsize": { "type": "integer", "default": 50 @@ -321,6 +330,18 @@ }, "shortread_complexityfilter": { "type": "boolean" + }, + "shortread_complexityfilter_entropy": { + "type": "number", + "default": 0.3 + }, + "shortread_complexityfilter_prinseqplusplus_mode": { + "type": "string", + "default": "entropy" + }, + "shortread_complexityfilter_prinseqplusplus_dustscore": { + "type": "number", + "default": 0.5 } } -} +} \ No newline at end of file diff --git a/subworkflows/local/shortread_complexityfiltering.nf b/subworkflows/local/shortread_complexityfiltering.nf index 3d69ed6..12686d7 100644 --- a/subworkflows/local/shortread_complexityfiltering.nf +++ b/subworkflows/local/shortread_complexityfiltering.nf @@ -2,7 +2,8 @@ // Check input samplesheet and get read channels // -include { BBMAP_BBDUK } from '../../modules/nf-core/modules/bbmap/bbduk/main' +include { BBMAP_BBDUK } from '../../modules/nf-core/modules/bbmap/bbduk/main' +include { PRINSEQPLUSPLUS } from '../../modules/nf-core/modules/prinseqplusplus/main' workflow SHORTREAD_COMPLEXITYFILTERING { take: @@ -16,6 +17,9 @@ workflow SHORTREAD_COMPLEXITYFILTERING { ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( BBMAP_BBDUK.out.log ) + } else if ( params.shortread_complexityfilter_tool == 'prinseqplusplus' ) { + ch_filtered_reads = PRINSEQPLUSPLUS ( reads ).good_reads + ch_versions = ch_versions.mix( PRINSEQPLUSPLUS.out.versions.first() ) } else { ch_filtered_reads = reads } From 613f6a5565dbaaf6bef06ba980a6b017506490f7 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 4 Apr 2022 21:21:29 +0200 Subject: [PATCH 07/21] Prettier --- modules.json | 2 +- nextflow_schema.json | 21 ++++----------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/modules.json b/modules.json index 355dc9e..d8ac3db 100644 --- a/modules.json +++ b/modules.json @@ -41,4 +41,4 @@ } } } -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 6dee045..75e1abe 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -294,10 +284,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -344,4 +331,4 @@ "default": 0.5 } } -} \ No newline at end of file +} From c54c32c4947df993231c364ac3fe19f2abb85eed Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 5 Apr 2022 10:18:34 +0200 Subject: [PATCH 08/21] chore: update adapterremoval --- modules.json | 4 +- .../nf-core/modules/adapterremoval/main.nf | 44 ++++++++++++++----- .../nf-core/modules/adapterremoval/meta.yml | 14 +++--- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/modules.json b/modules.json index 9953edb..51f13ea 100644 --- a/modules.json +++ b/modules.json @@ -4,7 +4,7 @@ "repos": { "nf-core/modules": { "adapterremoval": { - "git_sha": "f0800157544a82ae222931764483331a81812012" + "git_sha": "879d42c5e28661fe0a5e744c9e2c515868f9e08a" }, "cat/fastq": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" @@ -38,4 +38,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/adapterremoval/main.nf b/modules/nf-core/modules/adapterremoval/main.nf index 9d16b9c..0e17c05 100644 --- a/modules/nf-core/modules/adapterremoval/main.nf +++ b/modules/nf-core/modules/adapterremoval/main.nf @@ -12,15 +12,14 @@ process ADAPTERREMOVAL { path(adapterlist) output: - tuple val(meta), path("${prefix}.truncated.gz") , optional: true, emit: singles_truncated - tuple val(meta), path("${prefix}.discarded.gz") , optional: true, emit: discarded - tuple val(meta), path("${prefix}.pair1.truncated.gz") , optional: true, emit: pair1_truncated - tuple val(meta), path("${prefix}.pair2.truncated.gz") , optional: true, emit: pair2_truncated - tuple val(meta), path("${prefix}.collapsed.gz") , optional: true, emit: collapsed - tuple val(meta), path("${prefix}.collapsed.truncated.gz") , optional: true, emit: collapsed_truncated - tuple val(meta), path("${prefix}.paired.gz") , optional: true, emit: paired_interleaved - tuple val(meta), path('*.log') , emit: log - path "versions.yml" , emit: versions + tuple val(meta), path("${prefix}.truncated.fastq.gz") , optional: true, emit: singles_truncated + tuple val(meta), path("${prefix}.discarded.fastq.gz") , optional: true, emit: discarded + tuple val(meta), path("${prefix}.pair{1,2}.truncated.fastq.gz") , optional: true, emit: paired_truncated + tuple val(meta), path("${prefix}.collapsed.fastq.gz") , optional: true, emit: collapsed + tuple val(meta), path("${prefix}.collapsed.truncated.fastq.gz") , optional: true, emit: collapsed_truncated + tuple val(meta), path("${prefix}.paired.fastq.gz") , optional: true, emit: paired_interleaved + tuple val(meta), path('*.settings') , emit: settings + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -38,10 +37,19 @@ process ADAPTERREMOVAL { $adapterlist \\ --basename ${prefix} \\ --threads ${task.cpus} \\ - --settings ${prefix}.log \\ --seed 42 \\ --gzip + ensure_fastq() { + if [ -f "\${1}" ]; then + mv "\${1}" "\${1::-3}.fastq.gz" + fi + + } + + ensure_fastq '${prefix}.truncated.gz' + ensure_fastq '${prefix}.discarded.gz' + cat <<-END_VERSIONS > versions.yml "${task.process}": adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g") @@ -56,10 +64,24 @@ process ADAPTERREMOVAL { $adapterlist \\ --basename ${prefix} \\ --threads $task.cpus \\ - --settings ${prefix}.log \\ --seed 42 \\ --gzip + ensure_fastq() { + if [ -f "\${1}" ]; then + mv "\${1}" "\${1::-3}.fastq.gz" + fi + + } + + ensure_fastq '${prefix}.truncated.gz' + ensure_fastq '${prefix}.discarded.gz' + ensure_fastq '${prefix}.pair1.truncated.gz' + ensure_fastq '${prefix}.pair2.truncated.gz' + ensure_fastq '${prefix}.collapsed.gz' + ensure_fastq '${prefix}.collapsed.truncated.gz' + ensure_fastq '${prefix}.paired.gz' + cat <<-END_VERSIONS > versions.yml "${task.process}": adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g") diff --git a/modules/nf-core/modules/adapterremoval/meta.yml b/modules/nf-core/modules/adapterremoval/meta.yml index 5faad04..77273f6 100644 --- a/modules/nf-core/modules/adapterremoval/meta.yml +++ b/modules/nf-core/modules/adapterremoval/meta.yml @@ -43,43 +43,43 @@ output: Adapter trimmed FastQ files of either single-end reads, or singleton 'orphaned' reads from merging of paired-end data (i.e., one of the pair was lost due to filtering thresholds). - pattern: "*.truncated.gz" + pattern: "*.truncated.fastq.gz" - discarded: type: file description: | Adapter trimmed FastQ files of reads that did not pass filtering thresholds. - pattern: "*.discarded.gz" + pattern: "*.discarded.fastq.gz" - pair1_truncated: type: file description: | Adapter trimmed R1 FastQ files of paired-end reads that did not merge with their respective R2 pair due to long templates. The respective pair is stored in 'pair2_truncated'. - pattern: "*.pair1.truncated.gz" + pattern: "*.pair1.truncated.fastq.gz" - pair2_truncated: type: file description: | Adapter trimmed R2 FastQ files of paired-end reads that did not merge with their respective R1 pair due to long templates. The respective pair is stored in 'pair1_truncated'. - pattern: "*.pair2.truncated.gz" + pattern: "*.pair2.truncated.fastq.gz" - collapsed: type: file description: | Collapsed FastQ of paired-end reads that successfully merged with their respective R1 pair but were not trimmed. - pattern: "*.collapsed.gz" + pattern: "*.collapsed.fastq.gz" - collapsed_truncated: type: file description: | Collapsed FastQ of paired-end reads that successfully merged with their respective R1 pair and were trimmed of adapter due to sufficient overlap. - pattern: "*.collapsed.truncated.gz" + pattern: "*.collapsed.truncated.fastq.gz" - log: type: file description: AdapterRemoval log file - pattern: "*.log" + pattern: "*.settings" - versions: type: file description: File containing software versions From 0e9b2989e3fd59b6e05a77a8faf6ed89d6ddea03 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 5 Apr 2022 10:53:04 +0200 Subject: [PATCH 09/21] refactor: remove superfluous ENSURE_FASTQ_EXTENSION --- .../local/shortread_adapterremoval.nf | 75 ++++--------------- 1 file changed, 16 insertions(+), 59 deletions(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index e522a1a..8c02f6f 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -5,11 +5,6 @@ Process short raw reads with AdapterRemoval include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE } from '../../modules/nf-core/modules/adapterremoval/main' include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED } from '../../modules/nf-core/modules/adapterremoval/main' include { CAT_FASTQ } from '../../modules/nf-core/modules/cat/fastq/main' -include { - ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION1; - ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION2; - ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION3; -} from '../../modules/local/ensure_fastq_extension' workflow SHORTREAD_ADAPTERREMOVAL { @@ -36,34 +31,25 @@ workflow SHORTREAD_ADAPTERREMOVAL { if ( params.shortread_clipmerge_mergepairs && !params.shortread_clipmerge_excludeunmerged ) { - ENSURE_FASTQ_EXTENSION1( - Channel.empty().mix( + ch_concat_fastq = Channel.empty() + .mix( ADAPTERREMOVAL_PAIRED.out.collapsed, ADAPTERREMOVAL_PAIRED.out.collapsed_truncated, ADAPTERREMOVAL_PAIRED.out.singles_truncated, - ADAPTERREMOVAL_PAIRED.out.pair1_truncated, - ADAPTERREMOVAL_PAIRED.out.pair2_truncated + ADAPTERREMOVAL_PAIRED.out.paired_truncated ) - .map { meta, reads -> - meta.single_end = true - [meta, reads] - } - ) + .groupTuple() + .map { [it.head(), it.tail().flatten()] } - CAT_FASTQ( - ENSURE_FASTQ_EXTENSION1.out.reads - .groupTuple() - ) - - ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated) + CAT_FASTQ(ch_concat_fastq) ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads - .mix(ENSURE_FASTQ_EXTENSION2.out.reads) + .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated) } else if ( params.shortread_clipmerge_mergepairs && params.shortread_clipmerge_excludeunmerged ) { - ENSURE_FASTQ_EXTENSION1( - Channel.empty().mix( + ch_concat_fastq = Channel.empty() + .mix( ADAPTERREMOVAL_PAIRED.out.collapsed, ADAPTERREMOVAL_PAIRED.out.collapsed_truncated ) @@ -71,54 +57,25 @@ workflow SHORTREAD_ADAPTERREMOVAL { meta.single_end = true [meta, reads] } - ) + .groupTuple() - CAT_FASTQ( - ENSURE_FASTQ_EXTENSION1.out.reads - .groupTuple() - ) - - ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated) + CAT_FASTQ(ch_concat_fastq) ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads - .mix(ENSURE_FASTQ_EXTENSION2.out.reads) + .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated) } else { - ENSURE_FASTQ_EXTENSION1( - ADAPTERREMOVAL_PAIRED.out.pair1_truncated - .map { meta, reads -> - meta.single_end = true - [meta, reads] - } - ) - - ENSURE_FASTQ_EXTENSION2( - ADAPTERREMOVAL_PAIRED.out.pair2_truncated - .map { meta, reads -> - meta.single_end = true - [meta, reads] - } - ) - - ENSURE_FASTQ_EXTENSION3(ADAPTERREMOVAL_SINGLE.out.singles_truncated) - - ch_adapterremoval_reads_prepped = ENSURE_FASTQ_EXTENSION1.out.reads - .join(ENSURE_FASTQ_EXTENSION2.out.reads) - .groupTuple() - .map { meta, pair1, pair2 -> - meta.single_end = false - [ meta, [ pair1, pair2 ].flatten() ] - } - .mix(ENSURE_FASTQ_EXTENSION3.out.reads) + ch_adapterremoval_reads_prepped = ADAPTERREMOVAL_PAIRED.out.paired_truncated + .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated) } ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() ) ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( - ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]}, - ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]} + ADAPTERREMOVAL_PAIRED.out.settings.collect{it[1]}, + ADAPTERREMOVAL_SINGLE.out.settings.collect{it[1]} ) emit: From d46ddd972c2fc2c3ce0e7fe03b3ad42a4821332a Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 5 Apr 2022 11:04:30 +0200 Subject: [PATCH 10/21] Expand tests and fix schema after review --- .github/workflows/ci.yml | 8 ++++++++ conf/test.config | 13 +++++++------ nextflow.config | 2 +- nextflow_schema.json | 3 ++- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b8975b5..79148f0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,8 +29,16 @@ jobs: - NXF_VER: "" NXF_EDGE: "1" parameters: + - "--longread_clip false" + - "--shortread_clip false" - "--shortread_clipmerge_tool fastp" + - "--shortread_clipmerge_tool fastp --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged" + - "--shortread_clipmerge_tool fastp --shortread_clipmerge_mergepairs" - "--shortread_clipmerge_tool adapterremoval" + - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged" + - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs" + - "--shortread_complexityfilter_tool bbduk" + - "--shortread_complexityfilter_tool prinseq" steps: - name: Check out pipeline code diff --git a/conf/test.config b/conf/test.config index ff92fe7..bdad2c1 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,11 +22,12 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' - run_kraken2 = true - run_malt = true - run_metaphlan3 = true - shortread_clipmerge = true + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + run_kraken2 = true + run_malt = true + run_metaphlan3 = true + shortread_clipmerge = true + longread_clip = false shortread_complexityfilter = true } diff --git a/nextflow.config b/nextflow.config index cd81f08..e6d6307 100644 --- a/nextflow.config +++ b/nextflow.config @@ -51,7 +51,7 @@ params { max_cpus = 16 max_time = '240.h' - // Databaess + // Databases databases = null // FASTQ preprocessing diff --git a/nextflow_schema.json b/nextflow_schema.json index bec9ea4..6f6125e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -328,7 +328,8 @@ }, "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", - "default": "entropy" + "default": "entropy", + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", From 59b2088aa183a58523f712ec5ee85cbdab8e1647 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 5 Apr 2022 11:16:43 +0200 Subject: [PATCH 11/21] style: apply prettier --- modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 51f13ea..becb501 100644 --- a/modules.json +++ b/modules.json @@ -38,4 +38,4 @@ } } } -} \ No newline at end of file +} From 39e5e802c7af4e7e36927afa471801ff3b8aab6e Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 5 Apr 2022 11:29:39 +0200 Subject: [PATCH 12/21] refactor: make code more explicit, add comment --- subworkflows/local/shortread_adapterremoval.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index 8c02f6f..906a33f 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -39,7 +39,9 @@ workflow SHORTREAD_ADAPTERREMOVAL { ADAPTERREMOVAL_PAIRED.out.paired_truncated ) .groupTuple() - .map { [it.head(), it.tail().flatten()] } + // Paired-end reads cause a nested tuple during grouping. + // We want to present a flat list of files to `CAT_FASTQ`. + .map { meta, fastq -> [meta, fastq.flatten()] } CAT_FASTQ(ch_concat_fastq) From 245a4d1f5dae7e29ec76aa1265154edbbe3eb3db Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 5 Apr 2022 13:08:44 +0200 Subject: [PATCH 13/21] Fix MQC staging and remove debugging dump --- subworkflows/local/shortread_adapterremoval.nf | 4 ++-- workflows/taxprofiler.nf | 13 ++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index 9d49b10..473a05f 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -117,8 +117,8 @@ workflow SHORTREAD_ADAPTERREMOVAL { ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() ) ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( - ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]}, - ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]} + ADAPTERREMOVAL_PAIRED.out.log, + ADAPTERREMOVAL_SINGLE.out.log ) emit: diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index ddd5914..eb2b461 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -171,7 +171,6 @@ workflow TAXPROFILER { } ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 - .dump(tag: "input_metaphlan3") .multiMap { it -> reads: [it[0] + it[2], it[1]] @@ -213,34 +212,34 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) if (params.shortread_clipmerge) { - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]).dump(tag: "clipmerge") ) ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) } if (params.longread_clip) { - ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) + ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]).dump(tag: "clip") ) ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions ) } if (params.shortread_complexityfilter){ - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]).dump(tag: "complex") ) ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } if (params.run_kraken2) { - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]).dump(tag: "kraken") ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) } if (params.run_malt) { - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]).dump(tag: "malt") ) ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) } // TODO Versions for Karken/MALT not report? // TODO create multiQC module for metaphlan MULTIQC ( - ch_multiqc_files.collect() + ch_multiqc_files.collect().dump(tag: "input_to_mqc") ) multiqc_report = MULTIQC.out.report.toList() ch_versions = ch_versions.mix(MULTIQC.out.versions) From 5b9355725dea7af56745a28e6807ae5768798e8b Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 5 Apr 2022 13:14:06 +0200 Subject: [PATCH 14/21] Whoops --- workflows/taxprofiler.nf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index eb2b461..8dfe996 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -17,7 +17,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } -if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not except uncollapsed paired-reads. Pairs will be profiled as separate files." +if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "[nf-core/taxprofiler] error: cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" /* @@ -212,34 +212,34 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) if (params.shortread_clipmerge) { - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]).dump(tag: "clipmerge") ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) } if (params.longread_clip) { - ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]).dump(tag: "clip") ) + ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions ) } if (params.shortread_complexityfilter){ - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]).dump(tag: "complex") ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } if (params.run_kraken2) { - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]).dump(tag: "kraken") ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) } if (params.run_malt) { - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]).dump(tag: "malt") ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) } // TODO Versions for Karken/MALT not report? // TODO create multiQC module for metaphlan MULTIQC ( - ch_multiqc_files.collect().dump(tag: "input_to_mqc") + ch_multiqc_files.collect() ) multiqc_report = MULTIQC.out.report.toList() ch_versions = ch_versions.mix(MULTIQC.out.versions) From 98f082d7b6df9e7ef8da41b98bafab63770d3582 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 5 Apr 2022 13:17:54 +0200 Subject: [PATCH 15/21] Fix mistake in previous upstream merge with AR2 output channel for settings file --- subworkflows/local/shortread_adapterremoval.nf | 5 +++-- workflows/taxprofiler.nf | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index 6512596..bfed76a 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -75,9 +75,10 @@ workflow SHORTREAD_ADAPTERREMOVAL { ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() ) ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( - ADAPTERREMOVAL_PAIRED.out.log, - ADAPTERREMOVAL_SINGLE.out.log + ADAPTERREMOVAL_PAIRED.out.settings, + ADAPTERREMOVAL_SINGLE.out.settings ) emit: diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 8dfe996..3b08402 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -222,7 +222,7 @@ workflow TAXPROFILER { } if (params.shortread_complexityfilter){ - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } From 82aa89ad63d8769989b533da4695dc9387d81355 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 5 Apr 2022 13:55:11 +0200 Subject: [PATCH 16/21] re add missing switch meta of merged reads to true --- subworkflows/local/shortread_adapterremoval.nf | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index bfed76a..b573be9 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -38,11 +38,17 @@ workflow SHORTREAD_ADAPTERREMOVAL { ADAPTERREMOVAL_PAIRED.out.singles_truncated, ADAPTERREMOVAL_PAIRED.out.paired_truncated ) + .map { meta, reads -> + def meta_new = meta.clone() + meta_new.single_end = true + [meta_new, reads] + } .groupTuple() // Paired-end reads cause a nested tuple during grouping. // We want to present a flat list of files to `CAT_FASTQ`. .map { meta, fastq -> [meta, fastq.flatten()] } + CAT_FASTQ(ch_concat_fastq) ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads @@ -56,10 +62,13 @@ workflow SHORTREAD_ADAPTERREMOVAL { ADAPTERREMOVAL_PAIRED.out.collapsed_truncated ) .map { meta, reads -> - meta.single_end = true - [meta, reads] + def meta_new = meta.clone() + meta_new.single_end = true + [meta_new, reads] } .groupTuple() + .map { meta, fastq -> [meta, fastq.flatten()] } + CAT_FASTQ(ch_concat_fastq) From c40a9a00113c3e178bf20a4f1e0870d0994d9383 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 5 Apr 2022 12:13:45 +0200 Subject: [PATCH 17/21] refactor: avoid publishing unpacked databases --- conf/modules.config | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 2a8789f..22db8d0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,11 +35,7 @@ process { } withName: UNTAR { - publishDir = [ - path: { "${params.outdir}/databases" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + publishDir = null } withName: FASTQC { From af12435fa5e3adf240d48bf9f764a063c62593b8 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 5 Apr 2022 12:16:26 +0200 Subject: [PATCH 18/21] fix: use parameter for publish mode everywhere --- conf/modules.config | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 22db8d0..a910b8e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -43,7 +43,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } publishDir = [ path: { "${params.outdir}/fastqc/raw" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.html' ] } @@ -53,7 +53,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } publishDir = [ path: { "${params.outdir}/fastqc/processed" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.html' ] } @@ -69,7 +69,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ path: { "${params.outdir}/fastp" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.fastq.gz' ] } @@ -88,7 +88,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ path: { "${params.outdir}/fastp" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.fastq.gz' ] } @@ -104,7 +104,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ path: { "${params.outdir}/adapterremoval" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.fastq.gz' ] } @@ -123,7 +123,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ path: { "${params.outdir}/adapterremoval" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.fastq.gz' ] } @@ -132,7 +132,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ path: { "${params.outdir}/porechop" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.fastq.gz' ] } @@ -169,7 +169,7 @@ process { ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/malt/${meta.db_name}" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.{rma6,tab,text,sam,log}' ] } @@ -179,7 +179,7 @@ process { ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/kraken2/${meta.db_name}" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.{fastq.gz,txt}' ] } From 8827b143a22b07c21c7cbe5776fb112015f5ae8b Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 5 Apr 2022 12:30:06 +0200 Subject: [PATCH 19/21] fix: disable publishing correctly --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index a910b8e..ce9597b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,7 +35,7 @@ process { } withName: UNTAR { - publishDir = null + publishDir = [enabled: false] } withName: FASTQC { From 018b9a8ea60afd0c5d123e4420fdccaecafbc9f7 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 6 Apr 2022 10:21:43 +0200 Subject: [PATCH 20/21] refactor: do not generally publish output --- conf/modules.config | 6 ------ 1 file changed, 6 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ce9597b..f9cfe3b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,12 +12,6 @@ process { - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - withName: SAMPLESHEET_CHECK { publishDir = [ path: { "${params.outdir}/pipeline_info" }, From 7ab3aa546b8e1d0388981757d259284bf67f5fc8 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 6 Apr 2022 10:27:11 +0200 Subject: [PATCH 21/21] refactor: use flags to publish reads --- conf/modules.config | 25 ++++++++++++++----------- nextflow.config | 2 ++ nextflow_schema.json | 8 ++++++++ 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f9cfe3b..b59850f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -28,10 +28,6 @@ process { ] } - withName: UNTAR { - publishDir = [enabled: false] - } - withName: FASTQC { ext.args = '--quiet' ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } @@ -64,7 +60,8 @@ process { publishDir = [ path: { "${params.outdir}/fastp" }, mode: params.publish_dir_mode, - pattern: '*.fastq.gz' + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads ] } @@ -83,7 +80,8 @@ process { publishDir = [ path: { "${params.outdir}/fastp" }, mode: params.publish_dir_mode, - pattern: '*.fastq.gz' + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads ] } @@ -99,7 +97,8 @@ process { publishDir = [ path: { "${params.outdir}/adapterremoval" }, mode: params.publish_dir_mode, - pattern: '*.fastq.gz' + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads ] } @@ -118,7 +117,8 @@ process { publishDir = [ path: { "${params.outdir}/adapterremoval" }, mode: params.publish_dir_mode, - pattern: '*.fastq.gz' + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads ] } @@ -127,7 +127,8 @@ process { publishDir = [ path: { "${params.outdir}/porechop" }, mode: params.publish_dir_mode, - pattern: '*.fastq.gz' + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads ] } @@ -141,7 +142,8 @@ process { publishDir = [ path: { "${params.outdir}/bbduk/" }, mode: params.publish_dir_mode, - pattern: '*.{fastq.gz,log}' + pattern: '*.{fastq.gz,log}', + enabled: params.save_complexityfiltered_reads ] } @@ -154,7 +156,8 @@ process { publishDir = [ path: { "${params.outdir}/prinseqplusplus/" }, mode: params.publish_dir_mode, - pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz,log}' + pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz,log}', + enabled: params.save_complexityfiltered_reads ] } diff --git a/nextflow.config b/nextflow.config index e6d6307..19cc823 100644 --- a/nextflow.config +++ b/nextflow.config @@ -64,6 +64,7 @@ params { shortread_clipmerge_adapter2 = null shortread_clipmerge_minlength = 15 longread_clip = false + save_preprocessed_reads = false // Complexity filtering shortread_complexityfilter = false @@ -73,6 +74,7 @@ params { shortread_complexityfilter_bbduk_mask = false shortread_complexityfilter_prinseqplusplus_mode = 'entropy' shortread_complexityfilter_prinseqplusplus_dustscore = 0.5 + save_complexityfiltered_reads = false // MALT diff --git a/nextflow_schema.json b/nextflow_schema.json index 6f6125e..6858409 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -308,6 +308,10 @@ "type": "integer", "default": 15 }, + "save_preprocessed_reads": { + "type": "boolean", + "default": false + }, "shortread_complexityfilter_tool": { "type": "string", "default": "bbduk" @@ -334,6 +338,10 @@ "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", "default": 0.5 + }, + "save_complexityfiltered_reads": { + "type": "boolean", + "default": false } } }