From 87a1d8051979040f7dd1b6f976b2bd0224a7bf24 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 7 May 2022 05:22:35 +0200 Subject: [PATCH 1/9] Add working output for Kraken2/Centrifuge/DIAMOND --- conf/modules.config | 4 +- conf/test.config | 4 ++ modules.json | 6 +- .../nf-core/modules/diamond/blastx/main.nf | 33 ++++++--- .../nf-core/modules/diamond/blastx/meta.yml | 30 +++++++- .../nf-core/modules/kraken2/kraken2/main.nf | 23 ++++-- .../nf-core/modules/kraken2/kraken2/meta.yml | 25 +++++-- nextflow.config | 14 ++-- nextflow_schema.json | 71 ++++++++++++++----- subworkflows/local/profiling.nf | 34 ++++++--- workflows/taxprofiler.nf | 2 + 11 files changed, 181 insertions(+), 65 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index cd0fb04..31d0fca 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -271,7 +271,7 @@ process { publishDir = [ path: { "${params.outdir}/kraken2/${meta.db_name}" }, mode: params.publish_dir_mode, - pattern: '*.{txt}' + pattern: '*.{txt,report,fastq.gz}' ] } @@ -289,7 +289,7 @@ process { publishDir = [ path: { "${params.outdir}/centrifuge/${meta.db_name}" }, mode: params.publish_dir_mode, - pattern: '*.txt' + pattern: '*.{txt,sam,gz}' ] ext.args = { "${meta.db_params}" } ext.prefix = params.perform_runmerging ? 
{ "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } diff --git a/conf/test.config b/conf/test.config index a5244f9..573db42 100644 --- a/conf/test.config +++ b/conf/test.config @@ -36,6 +36,10 @@ params { run_metaphlan3 = true run_centrifuge = true run_diamond = true + malt_save_reads = true + kraken2_save_reads = true + centrifuge_save_reads = true + diamond_save_reads = true } process { diff --git a/modules.json b/modules.json index a55c88b..5cad32e 100644 --- a/modules.json +++ b/modules.json @@ -28,7 +28,7 @@ "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, "diamond/blastx": { - "git_sha": "42564565b934eeb2449e35ec97ed13ff2a67f1de" + "git_sha": "bd3bfe0817246082525ab93707976676b1fe208b" }, "fastp": { "git_sha": "d0a1cbb703a130c19f6796c3fce24fbe7dfce789" @@ -43,7 +43,7 @@ "git_sha": "538dbac98ba9c8f799536cd5a617195501439457" }, "kraken2/kraken2": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + "git_sha": "abe025677cdd805cc93032341ab19885473c1a07" }, "malt/run": { "git_sha": "72b96f4e504eef673f2b5c13560a9d90b669129b" @@ -80,4 +80,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/diamond/blastx/main.nf b/modules/nf-core/modules/diamond/blastx/main.nf index 6703c1e..d327227 100644 --- a/modules/nf-core/modules/diamond/blastx/main.nf +++ b/modules/nf-core/modules/diamond/blastx/main.nf @@ -2,21 +2,26 @@ process DIAMOND_BLASTX { tag "$meta.id" label 'process_medium' - // Dimaond is limited to v2.0.9 because there is not a - // singularity version higher than this at the current time. - conda (params.enable_conda ? "bioconda::diamond=2.0.9" : null) + conda (params.enable_conda ? "bioconda::diamond=2.0.15" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/diamond:2.0.9--hdcc8f71_0' : - 'quay.io/biocontainers/diamond:2.0.9--hdcc8f71_0' }" + 'https://depot.galaxyproject.org/singularity/diamond:2.0.15--hb97b32f_0' : + 'quay.io/biocontainers/diamond:2.0.15--hb97b32f_0' }" input: tuple val(meta), path(fasta) path db - val outext + val out_ext + val blast_columns output: - tuple val(meta), path('*.{blast,xml,txt,daa,sam,tsv,paf}'), emit: output - path "versions.yml" , emit: versions + tuple val(meta), path('*.blast'), optional: true, emit: blast + tuple val(meta), path('*.xml') , optional: true, emit: xml + tuple val(meta), path('*.txt') , optional: true, emit: txt + tuple val(meta), path('*.daa') , optional: true, emit: daa + tuple val(meta), path('*.sam') , optional: true, emit: sam + tuple val(meta), path('*.tsv') , optional: true, emit: tsv + tuple val(meta), path('*.paf') , optional: true, emit: paf + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -24,7 +29,8 @@ process DIAMOND_BLASTX { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - switch ( outext ) { + def columns = blast_columns ? 
"${blast_columns}" : '' + switch ( out_ext ) { case "blast": outfmt = 0; break case "xml": outfmt = 5; break case "txt": outfmt = 6; break @@ -32,6 +38,11 @@ process DIAMOND_BLASTX { case "sam": outfmt = 101; break case "tsv": outfmt = 102; break case "paf": outfmt = 103; break + default: + outfmt = '6'; + out_ext = 'txt'; + log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); + break } """ DB=`find -L ./ -name "*.dmnd" | sed 's/.dmnd//'` @@ -41,9 +52,9 @@ process DIAMOND_BLASTX { --threads $task.cpus \\ --db \$DB \\ --query $fasta \\ - --outfmt ${outfmt} \\ + --outfmt ${outfmt} ${columns} \\ $args \\ - --out ${prefix}.${outext} + --out ${prefix}.${out_ext} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/modules/diamond/blastx/meta.yml b/modules/nf-core/modules/diamond/blastx/meta.yml index 5ee2d55..2dcd7bc 100644 --- a/modules/nf-core/modules/diamond/blastx/meta.yml +++ b/modules/nf-core/modules/diamond/blastx/meta.yml @@ -28,7 +28,7 @@ input: type: directory description: Directory containing the nucelotide blast database pattern: "*" - - outext: + - out_ext: type: string description: | Specify the type of output file to be generated. `blast` corresponds to @@ -38,10 +38,34 @@ input: pattern: "blast|xml|txt|daa|sam|tsv|paf" output: + - blast: + type: file + description: File containing blastp hits + pattern: "*.{blast}" + - xml: + type: file + description: File containing blastp hits + pattern: "*.{xml}" - txt: type: file - description: File containing blastx hits - pattern: "*.{blastx.txt}" + description: File containing hits in tabular BLAST format. 
+ pattern: "*.{txt}" + - daa: + type: file + description: File containing hits DAA format + pattern: "*.{daa}" + - sam: + type: file + description: File containing aligned reads in SAM format + pattern: "*.{sam}" + - tsv: + type: file + description: Tab separated file containing taxonomic classification of hits + pattern: "*.{tsv}" + - paf: + type: file + description: File containing aligned reads in pairwise mapping format format + pattern: "*.{paf}" - versions: type: file description: File containing software versions diff --git a/modules/nf-core/modules/kraken2/kraken2/main.nf b/modules/nf-core/modules/kraken2/kraken2/main.nf index 3ec5df5..d400023 100644 --- a/modules/nf-core/modules/kraken2/kraken2/main.nf +++ b/modules/nf-core/modules/kraken2/kraken2/main.nf @@ -10,12 +10,15 @@ process KRAKEN2_KRAKEN2 { input: tuple val(meta), path(reads) path db + val save_output_fastqs + val save_reads_assignment output: - tuple val(meta), path('*classified*') , emit: classified - tuple val(meta), path('*unclassified*'), emit: unclassified - tuple val(meta), path('*report.txt') , emit: txt - path "versions.yml" , emit: versions + tuple val(meta), path('*classified*') , optional:true, emit: classified_reads_fastq + tuple val(meta), path('*unclassified*') , optional:true, emit: unclassified_reads_fastq + tuple val(meta), path('*classifiedreads*'), optional:true, emit: classified_reads_assignment + tuple val(meta), path('*report.txt') , emit: report + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -26,19 +29,25 @@ process KRAKEN2_KRAKEN2 { def paired = meta.single_end ? "" : "--paired" def classified = meta.single_end ? "${prefix}.classified.fastq" : "${prefix}.classified#.fastq" def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq" + def classified_command = save_output_fastqs ? "--classified-out ${classified}" : "" + def unclassified_command = save_output_fastqs ? 
"--unclassified-out ${unclassified}" : "" + def readclassification_command = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "" + def compress_reads_command = save_output_fastqs ? "pigz -p $task.cpus *.fastq" : "" + """ kraken2 \\ --db $db \\ --threads $task.cpus \\ - --unclassified-out $unclassified \\ - --classified-out $classified \\ --report ${prefix}.kraken2.report.txt \\ --gzip-compressed \\ + $unclassified_command \\ + $classified_command \\ + $readclassification_command \\ $paired \\ $args \\ $reads - pigz -p $task.cpus *.fastq + $compress_reads_command cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/modules/kraken2/kraken2/meta.yml b/modules/nf-core/modules/kraken2/kraken2/meta.yml index 9d6a385..7129fe3 100644 --- a/modules/nf-core/modules/kraken2/kraken2/meta.yml +++ b/modules/nf-core/modules/kraken2/kraken2/meta.yml @@ -27,25 +27,40 @@ input: - db: type: directory description: Kraken2 database + - save_output_fastqs: + type: boolean + description: | + If true, optional commands are added to save classified and unclassified reads + as fastq files + - save_reads_assignment: + type: boolean + description: | + If true, an optional command is added to save a file reporting the taxonomic + classification of each input read output: - meta: type: map description: | Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - - classified: + - classified_reads_fastq: type: file description: | - Reads classified to belong to any of the taxa + Reads classified as belonging to any of the taxa on the Kraken2 database. pattern: "*{fastq.gz}" - - unclassified: + - unclassified_reads_fastq: type: file description: | - Reads not classified to belong to any of the taxa + Reads not classified to any of the taxa on the Kraken2 database. 
pattern: "*{fastq.gz}" - - txt: + - classified_reads_assignment: + type: file + description: | + Kraken2 output file indicating the taxonomic assignment of + each input read + - report: type: file description: | Kraken2 report containing stats about classified diff --git a/nextflow.config b/nextflow.config index ca9e280..3f76d53 100644 --- a/nextflow.config +++ b/nextflow.config @@ -94,16 +94,17 @@ params { // MALT run_malt = false malt_mode = 'BlastN' - malt_generatemegansummary = false + malt_generate_megansummary = false + malt_save_reads = false // kraken2 - run_kraken2 = false + run_kraken2 = false + kraken2_save_reads = false + kraken2_save_readclassification = false // centrifuge run_centrifuge = false - centrifuge_save_unaligned = false - centrifuge_save_aligned = false - centrifuge_sam_format = false + centrifuge_save_reads = false // metaphlan3 run_metaphlan3 = false @@ -114,7 +115,8 @@ params { // diamond run_diamond = false - diamond_output_format = 'txt' + diamond_output_format = 'tsv' // TSV is only format with taxonomic information apparently + diamond_save_reads = false // this will override diamound output format so no taxonomic profile is generated! } // Load base.config by default for all pipelines diff --git a/nextflow_schema.json b/nextflow_schema.json index ab2108e..bb4b759 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -278,15 +288,6 @@ "run_centrifuge": { "type": "boolean" }, - "centrifuge_save_unaligned": { - "type": "boolean" - }, - "centrifuge_save_aligned": { - "type": "boolean" - }, - "centrifuge_sam_format": { - "type": "boolean" - }, "run_metaphlan3": { "type": "boolean", "description": "Enable MetaPhlAn for taxonomic profiling" @@ -294,7 +295,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -335,7 +339,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -385,13 +392,20 @@ "run_kaiju": { "type": "boolean" }, - "malt_generatemegansummary": { + "malt_generate_megansummary": { "type": "boolean" }, "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": ["phylum", "class", "order", "family", "genus", "species"] + "enum": [ + "phylum", + "class", + "order", + "family", + "genus", + "species" + ] }, "run_diamond": { "type": "boolean" @@ -399,11 +413,34 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] + "enum": [ + "blast", + "xml", + "txt", + "daa", + "sam", + "tsv", + "paf" + ] }, "longread_hostremoval_index": { "type": "string", "default": "None" + }, + "malt_save_reads": { + "type": "boolean" + }, + "kraken2_save_reads": { + "type": "boolean" + }, + "kraken2_save_readclassification": { + "type": 
"boolean" + }, + "centrifuge_save_reads": { + "type": "boolean" + }, + "diamond_save_reads": { + "type": "boolean" } } -} +} \ No newline at end of file diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 7fb3ce9..18ea7fa 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -65,10 +65,18 @@ workflow PROFILING { ch_input_for_malt = ch_input_for_profiling.malt .filter { it[0]['instrument_platform'] == 'ILLUMINA' } .map { - it -> - def temp_meta = [ id: it[2]['db_name']] + it[2] - def db = it[3] - [ temp_meta, it[1], db ] + meta, reads, db_meta, db -> + def sam_format = params.malt_save_reads ? ' --alignments' : "" + // TODO No MALT SAM? + // TODO check all aligned reads published + // TODO try turning on/off aligned reads + // TODO wut? [9a/a441d6] Submitted process > NFCORE_TAXPROFILER:TAXPROFILER:PROFILING:MALT_RUN (null) + def temp_meta = [ id: meta['db_name'] ] + def new_db_meta = db_meta.clone() + new_db_meta['db_params'] = db_meta['db_params'] + sam_format + def new_meta = temp_meta + new_db_meta + + [ new_meta, reads, db ] } .groupTuple(by: [0,2]) .multiMap { @@ -92,7 +100,7 @@ workflow PROFILING { [ meta_new, rma ] } - MEGAN_RMA2INFO (ch_maltrun_for_megan, params.malt_generatemegansummary ) + MEGAN_RMA2INFO (ch_maltrun_for_megan, params.malt_generate_megansummary ) ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( MALT_RUN.out.versions.first(), MEGAN_RMA2INFO.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( MEGAN_RMA2INFO.out.txt ) @@ -108,10 +116,10 @@ workflow PROFILING { db: it[3] } - KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db, params.kraken2_save_reads, params.kraken2_save_readclassification ) + ch_multiqc_files = 
ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) - ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.txt ) + ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.report ) } @@ -128,7 +136,7 @@ workflow PROFILING { db: it[3] } - CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format ) + CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_reads, params.centrifuge_save_reads, params.centrifuge_save_reads ) CENTRIFUGE_KREPORT (CENTRIFUGE_CENTRIFUGE.out.results, ch_input_for_centrifuge.db) ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( CENTRIFUGE_KREPORT.out.kreport ) @@ -180,9 +188,13 @@ workflow PROFILING { db: it[3] } - DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, params.diamond_output_format ) + // diamond only accepts single output file specification, therefore + // this will replace output file! + ch_diamond_reads_format = params.diamond_save_reads ? 
'sam' : params.diamond_output_format + + DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, params.diamond_output_format, [] ) ch_versions = ch_versions.mix( DIAMOND_BLASTX.out.versions.first() ) - ch_raw_profiles = ch_raw_profiles.mix( DIAMOND_BLASTX.out.output ) + ch_raw_profiles = ch_raw_profiles.mix( DIAMOND_BLASTX.out.tsv ) } diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 7a6cd09..c319296 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -29,6 +29,8 @@ if (params.hostremoval_reference ) { ch_reference = file(params.hostre if (params.shortread_hostremoval_index ) { ch_shortread_reference_index = file(params.shortread_hostremoval_index ) } else { ch_shortread_reference_index = [] } if (params.longread_hostremoval_index ) { ch_longread_reference_index = file(params.longread_hostremoval_index ) } else { ch_longread_reference_index = [] } +if (params.diamond_save_reads ) log.warn "[nf-core/taxprofiler] DIAMOND only allows output of a single format. Only aligned reads in SAM format will be produced, no taxonomic profiles will be available." 
+ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES From 2838c136bda47ad24d36b5ebb174998d58c57348 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 2 Jun 2022 12:51:39 +0200 Subject: [PATCH 2/9] Prettier --- modules.json | 2 +- nextflow_schema.json | 45 +++++++------------------------------------- 2 files changed, 8 insertions(+), 39 deletions(-) diff --git a/modules.json b/modules.json index 5cad32e..9d918b3 100644 --- a/modules.json +++ b/modules.json @@ -80,4 +80,4 @@ } } } -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index e7b495d..f0093b1 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -295,10 +285,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -340,10 +327,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -399,14 +383,7 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": [ - "phylum", - "class", - "order", - "family", - "genus", - "species" - ] + "enum": ["phylum", "class", "order", "family", "genus", "species"] }, "run_diamond": { "type": "boolean" @@ -414,15 +391,7 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": [ - "blast", - "xml", - "txt", - "daa", - "sam", - "tsv", - "paf" - ] + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] }, "longread_hostremoval_index": { "type": "string", @@ -448,4 +417,4 @@ "type": "boolean" } } -} \ No newline at end of file +} From 3313b90453ed0e4bf2571a28d1436427a05b90f7 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 3 Jun 2022 13:50:48 +0200 Subject: [PATCH 3/9] Fix JSON schema --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index a82ff9e..2c00348 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -418,7 +418,7 @@ "longread_qc_target_bases": { "type": "integer", "default": 500000000 - } + }, "malt_save_reads": { "type": "boolean" }, From 
d48b3be5a780d87b30c8f9ce189a062676cc601a Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 3 Jun 2022 13:55:17 +0200 Subject: [PATCH 4/9] Tweak DIAMOND save_reads message --- nextflow.config | 2 +- workflows/taxprofiler.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 11cbfc0..e5b8a1d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -124,7 +124,7 @@ params { // diamond run_diamond = false diamond_output_format = 'tsv' // TSV is only format with taxonomic information apparently - diamond_save_reads = false // this will override diamound output format so no taxonomic profile is generated! + diamond_save_reads = false // this will override default diamond output format so no taxonomic profile is generated! } // Load base.config by default for all pipelines diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 6a05dbe..f29a366 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -33,7 +33,7 @@ if (params.hostremoval_reference ) { ch_reference = file(params.hostre if (params.shortread_hostremoval_index ) { ch_shortread_reference_index = file(params.shortread_hostremoval_index ) } else { ch_shortread_reference_index = [] } if (params.longread_hostremoval_index ) { ch_longread_reference_index = file(params.longread_hostremoval_index ) } else { ch_longread_reference_index = [] } -if (params.diamond_save_reads ) log.warn "[nf-core/taxprofiler] DIAMOND only allows output of a single format. Only aligned reads in SAM format will be produced, no taxonomic profiles will be available." +if (params.diamond_save_reads ) log.warn "[nf-core/taxprofiler] DIAMOND only allows output of a single format. As --diamond_save_reads supplied, only aligned reads in SAM format will be produced, no taxonomic profiles will be available." 
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 9462032d00824f16c55fe4a313404282eddab91d Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 3 Jun 2022 22:29:04 +0200 Subject: [PATCH 5/9] Fix MALT save_alignments reads and some clean up --- conf/test.config | 3 +++ nextflow.config | 2 +- subworkflows/local/longread_preprocessing.nf | 2 +- subworkflows/local/profiling.nf | 10 ++++------ 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/conf/test.config b/conf/test.config index 996d5de..e9fa62e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -47,4 +47,7 @@ process { withName: MALT_RUN { maxForks = 1 } + withName: MEGAN_RMA2INFO { + maxForks = 1 + } } diff --git a/nextflow.config b/nextflow.config index e5b8a1d..73fd0b3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -196,7 +196,7 @@ profiles { test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } test_noprofiling { includeConfig 'conf/test_noprofiling.config' } - test_nopreprocessing { includeConfig 'conf/test_preprocessing.config' } + test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' } } // Load igenomes.config if required diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index 5ae5417..6a23b0e 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -48,7 +48,7 @@ workflow LONGREAD_PREPROCESSING { } - FASTQC_PROCESSED ( ch_processed_reads.dump(tag: "filtlong") ) + FASTQC_PROCESSED ( ch_processed_reads ) ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) emit: diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 18ea7fa..b83c78f 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -66,19 +66,17 @@ workflow PROFILING { .filter { it[0]['instrument_platform'] == 'ILLUMINA' } .map { 
meta, reads, db_meta, db -> - def sam_format = params.malt_save_reads ? ' --alignments' : "" - // TODO No MALT SAM? - // TODO check all aligned reads published - // TODO try turning on/off aligned reads + def sam_format = params.malt_save_reads ? ' --alignments ./ -za false' : "" // TODO wut? [9a/a441d6] Submitted process > NFCORE_TAXPROFILER:TAXPROFILER:PROFILING:MALT_RUN (null) def temp_meta = [ id: meta['db_name'] ] def new_db_meta = db_meta.clone() new_db_meta['db_params'] = db_meta['db_params'] + sam_format def new_meta = temp_meta + new_db_meta - + new_meta['id'] = new_meta['db_name'] [ new_meta, reads, db ] } .groupTuple(by: [0,2]) + .dump(tag: "into_malt") .multiMap { it -> reads: [ it[0], it[1].flatten() ] @@ -192,7 +190,7 @@ workflow PROFILING { // this will replace output file! ch_diamond_reads_format = params.diamond_save_reads ? 'sam' : params.diamond_output_format - DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, params.diamond_output_format, [] ) + DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, ch_diamond_reads_format , [] ) ch_versions = ch_versions.mix( DIAMOND_BLASTX.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( DIAMOND_BLASTX.out.tsv ) From ebdd5683b248d2664fbd41fdc32cb5bf06e2e4ab Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 3 Jun 2022 22:33:48 +0200 Subject: [PATCH 6/9] Apply suggestions from code review --- subworkflows/local/profiling.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index b83c78f..9bae127 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -67,7 +67,6 @@ workflow PROFILING { .map { meta, reads, db_meta, db -> def sam_format = params.malt_save_reads ? ' --alignments ./ -za false' : "" - // TODO wut? 
[9a/a441d6] Submitted process > NFCORE_TAXPROFILER:TAXPROFILER:PROFILING:MALT_RUN (null) def temp_meta = [ id: meta['db_name'] ] def new_db_meta = db_meta.clone() new_db_meta['db_params'] = db_meta['db_params'] + sam_format @@ -76,7 +75,6 @@ workflow PROFILING { [ new_meta, reads, db ] } .groupTuple(by: [0,2]) - .dump(tag: "into_malt") .multiMap { it -> reads: [ it[0], it[1].flatten() ] From 490a8a8a840566a53ac252b107c3c1fff207c61c Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 9 Jun 2022 08:15:58 +0200 Subject: [PATCH 7/9] Remove dump --- subworkflows/local/profiling.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index b83c78f..03f0bf5 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -76,7 +76,6 @@ workflow PROFILING { [ new_meta, reads, db ] } .groupTuple(by: [0,2]) - .dump(tag: "into_malt") .multiMap { it -> reads: [ it[0], it[1].flatten() ] From 621b6a3d092e00a11086444083c95544ad8781b8 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 10 Jun 2022 11:27:51 +0200 Subject: [PATCH 8/9] Add comments about MALT id replacement and refactor for simplicity --- subworkflows/local/profiling.nf | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 9bae127..7d35837 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -66,13 +66,20 @@ workflow PROFILING { .filter { it[0]['instrument_platform'] == 'ILLUMINA' } .map { meta, reads, db_meta, db -> - def sam_format = params.malt_save_reads ? ' --alignments ./ -za false' : "" - def temp_meta = [ id: meta['db_name'] ] + def new_meta = meta.clone() def new_db_meta = db_meta.clone() + + // Add the saving of alignments in SAM format to params + def sam_format = params.malt_save_reads ? 
' --alignments ./ -za false' : "" new_db_meta['db_params'] = db_meta['db_params'] + sam_format - def new_meta = temp_meta + new_db_meta - new_meta['id'] = new_meta['db_name'] - [ new_meta, reads, db ] + + // As MALT has huge databases, we don't run on a per-sample basis but multiple + // samples at once. This replaces the ID of the particular process with the + // db_name instead to prevent `null` in job name, and in publishDir) + def updated_meta = new_meta + new_db_meta + updated_meta['id'] = updated_meta['db_name'] + + [ updated_meta, reads, db ] } .groupTuple(by: [0,2]) .multiMap { From e0ad49ebc9f22120ff001c08f3b21facf6038c22 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 10 Jun 2022 19:33:37 +0200 Subject: [PATCH 9/9] Fix metadata manipulation for malt --- subworkflows/local/profiling.nf | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 7d35837..de5bea1 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -61,25 +61,29 @@ workflow PROFILING { // MALT: We groupTuple to have all samples in one channel for MALT as database // loading takes a long time, so we only want to run it once per database - // TODO document somewhere we only accept illumina short reads for MALT? ch_input_for_malt = ch_input_for_profiling.malt .filter { it[0]['instrument_platform'] == 'ILLUMINA' } .map { meta, reads, db_meta, db -> - def new_meta = meta.clone() + + // Reset entire input meta for MALT to just database name, + // as we don't run on a per-sample basis due to huge databases + // so all samples are in one run and so sample-specific metadata + // is unnecessary. Set as database name to prevent `null` job ID and prefix. 
+ def temp_meta = [ id: meta['db_name'] ] + + // Extend database parameters to specify whether to save alignments or not def new_db_meta = db_meta.clone() - - // Add the saving of alignments in SAM format to params def sam_format = params.malt_save_reads ? ' --alignments ./ -za false' : "" new_db_meta['db_params'] = db_meta['db_params'] + sam_format - - // As MALT has huge databases, we don't run on a per-sample basis but multiple - // samples at once. This replaces the ID of the particular process with the - // db_name instead to prevent `null` in job name, and in publishDir) - def updated_meta = new_meta + new_db_meta - updated_meta['id'] = updated_meta['db_name'] - [ updated_meta, reads, db ] + // Combine reduced sample metadata with updated database parameters metadata, + // make sure id is db_name for publishing purposes. + def new_meta = temp_meta + new_db_meta + new_meta['id'] = new_meta['db_name'] + + [ new_meta, reads, db ] + } .groupTuple(by: [0,2]) .multiMap {