From de5734052601ce6bedd7be2389fef4bacd0affb1 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 13 Apr 2022 11:49:35 +0200 Subject: [PATCH 1/7] Start work --- modules.json | 5 +- .../nf-core/modules/megan/rma2info/main.nf | 38 ++++++++++++++ .../nf-core/modules/megan/rma2info/meta.yml | 51 +++++++++++++++++++ nextflow.config | 1 + subworkflows/local/profiling.nf | 22 ++++---- .../local/shortread_postprocessing.nf | 39 ++++++++++++++ 6 files changed, 146 insertions(+), 10 deletions(-) create mode 100644 modules/nf-core/modules/megan/rma2info/main.nf create mode 100644 modules/nf-core/modules/megan/rma2info/meta.yml create mode 100644 subworkflows/local/shortread_postprocessing.nf diff --git a/modules.json b/modules.json index 7fbc65c..e921454 100644 --- a/modules.json +++ b/modules.json @@ -30,6 +30,9 @@ "malt/run": { "git_sha": "72b96f4e504eef673f2b5c13560a9d90b669129b" }, + "megan/rma2info": { + "git_sha": "2d38566eca4cc15142b2ffa7c11837569b39aece" + }, "metaphlan3": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, @@ -47,4 +50,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/megan/rma2info/main.nf b/modules/nf-core/modules/megan/rma2info/main.nf new file mode 100644 index 0000000..80d1975 --- /dev/null +++ b/modules/nf-core/modules/megan/rma2info/main.nf @@ -0,0 +1,38 @@ +process MEGAN_RMA2INFO { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::megan=6.21.7" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/megan:6.21.7--h9ee0642_0': + 'quay.io/biocontainers/megan:6.21.7--h9ee0642_0' }" + + input: + tuple val(meta), path(rma6) + val(megan_summary) + + output: + tuple val(meta), path("*.txt.gz") , emit: txt + tuple val(meta), path("*.megan"), optional: true, emit: megan_summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def summary = megan_summary ? "-es ${prefix}.megan" : "" + """ + rma2info \\ + -i ${rma6} \\ + -o ${prefix}.txt.gz \\ + ${summary} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + megan: \$(echo \$(rma2info 2>&1) | grep version | sed 's/.*version //g;s/, built.*//g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/megan/rma2info/meta.yml b/modules/nf-core/modules/megan/rma2info/meta.yml new file mode 100644 index 0000000..0f2d5a9 --- /dev/null +++ b/modules/nf-core/modules/megan/rma2info/meta.yml @@ -0,0 +1,51 @@ +name: "megan_rma2info" +description: Analyses an RMA file and exports information in text format +keywords: + - megan + - rma6 + - classification + - conversion +tools: + - "megan": + description: "A tool for studying the taxonomic content of a set of DNA reads" + homepage: "https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/" + documentation: "https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html" + tool_dev_url: "https://github.com/husonlab/megan-ce" + doi: "10.1371/journal.pcbi.1004957" + licence: "['GPL >=3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - rma6: + type: file + description: RMA6 file from MEGAN or MALT + pattern: "*.rma6" + - megan_summary: + type: boolean + description: Specify whether to generate an MEGAN summary file + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: Compressed text file + pattern: "*.txt.gz" + - megan_summary: + type: file + description: Optionally generated MEGAN summary file + pattern: "*.megan" + +authors: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index b4a8d91..8ddf365 100644 --- a/nextflow.config +++ b/nextflow.config @@ -89,6 +89,7 @@ params { centrifuge_save_unaligned = false centrifuge_save_aligned = false centrifuge_sam_format = false + // metaphlan3 run_metaphlan3 = false } diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index c74c583..59b0dc0 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -14,8 +14,9 @@ workflow PROFILING { databases // [ [ meta ], path ] main: - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + ch_raw_profiles = Channel.empty() /* COMBINE READS WITH POSSIBLE DATABASES @@ -89,30 +90,33 @@ workflow PROFILING { if ( params.run_malt ) { MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( MALT_RUN.out.rma6 ) } if ( params.run_kraken2 ) { KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.txt ) } if ( params.run_centrifuge ) { CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format ) - ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) + ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( CENTRIFUGE_CENTRIFUGE.out.report ) } if ( params.run_metaphlan3 ) { METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN3.out.biom ) } emit: - // TODO work out if there is enough standardisation of output to export as one? 
- //output = ch_filtered_reads // channel: [ val(meta), [ reads ] ] + profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] versions = ch_versions // channel: [ versions.yml ] mqc = ch_multiqc_files } diff --git a/subworkflows/local/shortread_postprocessing.nf b/subworkflows/local/shortread_postprocessing.nf new file mode 100644 index 0000000..7fb0d70 --- /dev/null +++ b/subworkflows/local/shortread_postprocessing.nf @@ -0,0 +1,39 @@ +// +// Perform read trimming and merging +// + + +include { SHORTREAD_FASTP } from './shortread_fastp' +include { SHORTREAD_ADAPTERREMOVAL } from './shortread_adapterremoval' +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main' + +workflow SHORTREAD_POSTPROCESSING { + take: + input // [ [ meta ], [ taxon_table/file ] ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( params.shortread_clipmerge_tool == "fastp" ) { + ch_processed_reads = SHORTREAD_FASTP ( reads ).reads + ch_versions = ch_versions.mix( SHORTREAD_FASTP.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc ) + } else if ( params.shortread_clipmerge_tool == "adapterremoval" ) { + ch_processed_reads = SHORTREAD_ADAPTERREMOVAL ( reads ).reads + ch_versions = ch_versions.mix( SHORTREAD_ADAPTERREMOVAL.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_ADAPTERREMOVAL.out.mqc ) + } else { + ch_processed_reads = reads + } + + FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + + emit: + output = output // channel: [ val(meta), taxon_table ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + From cc73cdd51d26330d831ec306f8a5da081fb8665a Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 16 Apr 2022 07:42:30 +0200 Subject: [PATCH 2/7] Add generation of taxon-table like output for MALT --- CITATIONS.md | 10 ++++- conf/modules.config | 15 ++++++- nextflow.config | 1 + nextflow_schema.json | 7 +++- subworkflows/local/profiling.nf | 32 +++++++++++---- .../local/shortread_postprocessing.nf | 39 ------------------- 6 files changed, 52 insertions(+), 52 deletions(-) delete mode 100644 subworkflows/local/shortread_postprocessing.nf diff --git a/CITATIONS.md b/CITATIONS.md index e7bcf47..02621d9 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -40,9 +40,17 @@ > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico. Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6. +- [MEGAN](https://doi.org/10.1371/journal.pcbi.1004957) + + > Huson, Daniel H., Sina Beier, Isabell Flade, Anna Górska, Mohamed El-Hadidi, Suparna Mitra, Hans-Joachim Ruscheweyh, and Rewati Tappu. 2016. “MEGAN Community Edition - Interactive Exploration and Analysis of Large-Scale Microbiome Sequencing Data.” PLoS Computational Biology 12 (6): e1004957. doi: 10.1371/journal.pcbi.1004957. + - [MetaPhlAn3](https://doi.org/10.7554/eLife.65088) - > Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. 
ELife 10 (May): e65088. + > Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088. doi: 10.7554/eLife.65088 + +- [Centrifuge](https://doi.org/10.1101/gr.210641.116) + + > Kim, Daehwan, Li Song, Florian P. Breitwieser, and Steven L. Salzberg. 2016. “Centrifuge: Rapid and Sensitive Classification of Metagenomic Sequences.” Genome Research 26 (12): 1721-29. doi: 10.1101/gr.210641.116. ## Software packaging/containerisation tools diff --git a/conf/modules.config b/conf/modules.config index ccd1748..23455a4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -191,11 +191,22 @@ process { withName: MALT_RUN { ext.args = { "${meta.db_params}" } - ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + // one run with multiple samples, so fix ID to just db name to ensure clean log name + ext.prefix = { "${meta.db_name}" } publishDir = [ path: { "${params.outdir}/malt/${meta.db_name}" }, mode: params.publish_dir_mode, - pattern: '*.{log}' + pattern: '*.{rma6,log,sam}' + ] + } + + withName: MEGAN_RMA2INFO { + ext.args = "-c2c Taxonomy" + ext.prefix = { "${meta.id}" } + publishDir = [ + path: { "${params.outdir}/malt/${meta.db_name}" }, + mode: params.publish_dir_mode, + pattern: '*.{txt.gz,megan}' ] } diff --git a/nextflow.config b/nextflow.config index bf3ca92..a618f66 100644 --- a/nextflow.config +++ b/nextflow.config @@ -88,6 +88,7 @@ params { // MALT run_malt = false malt_mode = 'BlastN' + malt_generatemegansummary = false // kraken2 run_kraken2 = false diff --git a/nextflow_schema.json b/nextflow_schema.json index cf0edab..2cbfb0e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -364,11 +364,14 @@ }, "shortread_hostremoval_reference": { "type": "string", - "default": null + "default": "None" }, "shortread_hostremoval_index": { "type": "string", - "default": null + "default": "None" + }, + "malt_generatemegansummary": { + "type": "boolean" } } } diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 23b96e7..d7532b7 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -3,6 +3,7 @@ // include { MALT_RUN } from '../../modules/nf-core/modules/malt/run/main' +include { MEGAN_RMA2INFO } from '../../modules/nf-core/modules/megan/rma2info/main' include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/modules/kraken2/kraken2/main' include { CENTRIFUGE_CENTRIFUGE } from '../../modules/nf-core/modules/centrifuge/centrifuge/main' include { METAPHLAN3 } from '../../modules/nf-core/modules/metaphlan3/main' @@ -95,33 +96,48 @@ workflow PROFILING { if ( params.run_malt ) { MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) - ch_raw_profiles = ch_raw_profiles.mix( MALT_RUN.out.rma6 ) + + ch_maltrun_for_megan = MALT_RUN.out.rma6 + .transpose() + .map{ + meta, rma -> + // re-extract meta from file names, use filename without rma to + // ensure we keep paired-end information in downstream filenames + // when no pair-merging + def meta_new = meta.clone() + meta_new['db_name'] = meta.id + meta_new['id'] 
= rma.name - ( '.' + rma.extension ) + [ meta_new, rma ] + } + + MEGAN_RMA2INFO (ch_maltrun_for_megan, params.malt_generatemegansummary ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( MEGAN_RMA2INFO.out.txt ) } if ( params.run_kraken2 ) { KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.txt ) } if ( params.run_centrifuge ) { CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format ) - ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) + ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( CENTRIFUGE_CENTRIFUGE.out.report ) } if ( params.run_metaphlan3 ) { METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) - ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() ) + ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN3.out.biom ) } emit: - profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] + profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom versions = ch_versions // channel: [ versions.yml ] mqc = ch_multiqc_files } diff --git a/subworkflows/local/shortread_postprocessing.nf b/subworkflows/local/shortread_postprocessing.nf deleted file mode 100644 index 7fb0d70..0000000 --- a/subworkflows/local/shortread_postprocessing.nf +++ /dev/null @@ -1,39 +0,0 @@ -// -// Perform read trimming and merging -// - - -include { SHORTREAD_FASTP } from './shortread_fastp' -include { SHORTREAD_ADAPTERREMOVAL } from './shortread_adapterremoval' -include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main' - -workflow SHORTREAD_POSTPROCESSING { - take: - input // [ [ meta ], [ taxon_table/file ] ] - - main: - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() - - if ( params.shortread_clipmerge_tool == "fastp" ) { - ch_processed_reads = SHORTREAD_FASTP ( reads ).reads - ch_versions = ch_versions.mix( SHORTREAD_FASTP.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc ) - } else if ( params.shortread_clipmerge_tool == "adapterremoval" ) { - ch_processed_reads = SHORTREAD_ADAPTERREMOVAL ( reads ).reads - ch_versions = ch_versions.mix( SHORTREAD_ADAPTERREMOVAL.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_ADAPTERREMOVAL.out.mqc ) - } else { - ch_processed_reads = reads - } - - FASTQC_PROCESSED ( ch_processed_reads ) - ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) - - emit: - output = output // channel: [ val(meta), taxon_table ] - versions = ch_versions // channel: [ versions.yml ] - mqc = ch_multiqc_files -} - From d0256ec9378eac0cad715fe364d997a84e9b312b Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 16 Apr 2022 
07:45:24 +0200
Subject: [PATCH 3/7] Prettier

---
 modules.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules.json b/modules.json
index 18c0e69..18dea60 100644
--- a/modules.json
+++ b/modules.json
@@ -56,4 +56,4 @@
       }
     }
   }
-}
\ No newline at end of file
+}

From 818d3bd053beb2a33730d91d8556c544ff94b417 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Sun, 17 Apr 2022 17:53:22 +0200
Subject: [PATCH 4/7] Complete documentation for preprocessing subworkflows

---
 docs/usage.md | 107 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 103 insertions(+), 4 deletions(-)

diff --git a/docs/usage.md b/docs/usage.md
index 239a55d..528aeef 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -10,6 +10,8 @@

 ## Samplesheet inputs

+nf-core/profiler can accept as input raw or preprocessed single- or paired-end short-read (e.g. Illumina) FASTQ files, long-read FASTQ files (e.g. Oxford Nanopore), or FASTA sequences (when accepted by a profiler).
+
 You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below. Furthermore, nf-core/taxprofiler also requires a second comma-separated file of 3 columns with a header row as in the examples below.

 This samplesheet is then specified on the command line as follows:

@@ -20,7 +22,7 @@ This samplesheet is then specified on the command line as follows:

 ### Multiple runs of the same sample

-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will process reads before performing profiling. Below is an example for the same sample sequenced across 3 lanes:
+The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the FASTQ files of different runs of the same sample before performing profiling, when `--perform_runmerging` is supplied. Below is an example for the same sample sequenced across 3 lanes:

 ```console
 sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
 2612,run1,ILLUMINA,2612_run1_R1.fq.gz,2612_run1_R2.fq.gz,
 2612,run2,ILLUMINA,2612_run2_R1.fq.gz,2612_run2_R2.fq.gz,
 2612,run3,ILLUMINA,2612_run3_R1.fq.gz,2612_run3_R2.fq.gz,
 ```

 ### Full samplesheet

 The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 6 columns to match those defined in the table below.

 A final samplesheet file consisting of both single- and paired-end data, as well as long-read FASTA files may look something like the one below. This is for 6 samples, where `2612` has been sequenced twice.

 ```console
 2611,ERR5766174,ILLUMINA,,,///fasta/ERX5474930_ERR5766174_1.fa.gz
 ```

 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

 ### Full database sheet

 nf-core/taxprofiler supports multiple databases being profiled in parallel for each tool.
+Databases can be supplied either be in the form of a compressed `.tar.gz` archive of a folder containing all relevant database files or the path to an (uncompressed) directory.
+The pipelines takes the locations these of databases as input, and specific parameters for each, via a 4 column comma-separated sheet.

 > ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user. See below for more information of the expected database files.

 An example database sheet can look as follows, where 4 tools are being used, and `malt` and `kraken2` will be used against two databases each.

 Column specifications are as follows:

 > 💡 You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during profiling.

+nf-core/taxprofiler will automatically decompress and extract any compressed archives for you.
+
+Expected (uncompressed) database files for each tool are as follows:
+
+- **MALT** output of `malt-build`. A directory containing:
+  - `ref.idx`
+  - `taxonomy.idx`
+  - `taxonomy.map`
+  - `index0.idx`
+  - `table0.idx`
+  - `table0.db`
+  - `ref.inf`
+  - `ref.db`
+  - `taxonomy.tre`
+- **Kraken2** output of `kraken2-build` command(s). A directory containing:
+  - `opts.k2d`
+  - `hash.k2d`
+  - `taxo.k2d`
+- **Centrifuge** output of `centrifuge-build`. A directory containing:
+  - `<database name>.<number>.cf`
+  - `<database name>.<number>.cf`
+  - `<database name>.<number>.cf`
+  - `<database name>.<number>.cf`
+- **MetaPhlAn3** generated with `metaphlan --install` or downloaded from links on the [MetaPhlAn3 wiki](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-3.0#customizing-the-database). A directory containing:
+  - `mpa_v30_CHOCOPhlAn_201901.pkl`
+  - `mpa_v30_CHOCOPhlAn_201901.fasta`
+  - `mpa_v30_CHOCOPhlAn_201901.3.bt2`
+  - `mpa_v30_CHOCOPhlAn_201901.4.bt2`
+  - `mpa_v30_CHOCOPhlAn_201901.1.bt2`
+  - `mpa_v30_CHOCOPhlAn_201901.2.bt2`
+  - `mpa_v30_CHOCOPhlAn_201901.rev.1.bt2`
+  - `mpa_v30_CHOCOPhlAn_201901.rev.2.bt2`
+  - `mpa_latest`

 ## Running the pipeline

 The typical command for running the pipeline supplies the samplesheet via `--input` and the database sheet via `--databases`, together with a choice of `-profile`. Note that the pipeline will create the following files in your working directory:

 ```console
 work    # Directory containing the nextflow working files
 # Other nextflow hidden files, eg. history of pipeline runs and old logs.
 ```

+### Preprocessing Steps
+
+nf-core/taxprofiler offers four main preprocessing steps:
+
+- Read processing: adapter clipping and pair-merging.
+- Complexity filtering: removal of low-sequence complexity reads.
+- Host read-removal: removal of reads aligning to reference genome(s) of a host.
+- Run merging: concatenation of multiple FASTQ chunks/sequencing runs/libraries of a sample.
+
+#### Read Processing
+
+Raw sequencing read processing in the form of adapter clipping and paired-end read merging can be activated via the `--perform_shortread_clipmerge` or `--perform_longread_clip` flags.
+
+It is highly recommended to run this on raw reads to remove artefacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles.
+
+There are currently two options for short-read preprocessing: `fastp` or `adapterremoval`.
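As an illustrative sketch only (file names such as `samplesheet.csv` and `database.csv` are placeholders, `--input` and `--databases` are assumed to be the pipeline's standard input parameters, and the minimum-length value is arbitrary), a run with fastp-based read processing enabled might look like the following:

```console
nextflow run nf-core/taxprofiler \
    --input samplesheet.csv \
    --databases database.csv \
    --run_kraken2 \
    --perform_shortread_clipmerge \
    --shortread_clipmerge_tool fastp \
    --shortread_clipmerge_minlength 15 \
    --save_preprocessed_reads \
    -profile docker
```

The same sketch with `--shortread_clipmerge_tool adapterremoval` would swap in AdapterRemoval for fastp with no other changes.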
+For adapter clipping, you can either rely on tool default adapter sequences, or supply your own adapters (`--shortread_clipmerge_adapter1` and `--shortread_clipmerge_adapter2`).
+By default, paired-end merging is not activated and paired-end profiling is performed where supported, otherwise pairs will be independently profiled. If paired-end merging is activated you can also specify whether to exclude unmerged reads in the reads sent for profiling (`--shortread_clipmerge_mergepairs` and `--shortread_clipmerge_excludeunmerged`).
+You can also turn off clipping and only perform paired-end merging, if requested. This can be useful when processing data downloaded from the ENA, SRA, or DDBJ (--shortread_clipmerge_skipadaptertrim).
+Both tools support length filtering of reads and can be tuned with `--shortread_clipmerge_minlength`. Performing length filtering can be useful to remove short (often low sequencing complexity) sequences that result in unspecific classification and therefore slow down runtime during profiling, with minimal gain.
+
+There is currently one option for long-read Oxford Nanopore processing: `porechop`.
+
+For both short-read and long-read preprocessing, you can optionally save the resulting processed reads with `--save_preprocessed_reads`.
+
+#### Complexity Filtering
+
+Complexity filtering can be activated via the `--perform_shortread_complexityfilter` flag.
+
+Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informatic taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources.
+
+There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus).
+
+The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset.
+
+You can optionally save the FASTQ output of the complexity filtering with `--save_complexityfiltered_reads`.
+
+#### Host Removal
+
+Removal of possible-host reads from FASTQ files prior to profiling can be activated with `--perform_shortread_hostremoval`.
+
+Similarly to complexity filtering, host-removal can be useful for runtime optimisation and reduction in misclassified reads. It is not always necessary to report classification of reads from a host when you already know the host of the sample; therefore, you can gain a run-time and computational advantage by removing these reads with more efficient methods prior to the typically resource-heavy profiling. Furthermore, particularly with human samples, you can reduce the number of false positives during profiling that occur due to host-sequence contamination in reference genomes on public databases.
+
+nf-core/taxprofiler currently offers host-removal via alignment against a reference genome with Bowtie2, and the use of the unaligned reads for downstream profiling.
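A comparable sketch for host removal, again with placeholder file names and the same assumed standard input parameters (the reference and optional pre-built index options are described in the next paragraph), might be:

```console
nextflow run nf-core/taxprofiler \
    --input samplesheet.csv \
    --databases database.csv \
    --run_kraken2 \
    --perform_shortread_hostremoval \
    --shortread_hostremoval_reference host_genome.fasta \
    -profile docker
```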
+ +You can supply your reference genome in FASTA format with `--shortread_hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index`, however nf-core/taxprofiler will generate this for you if necessary. + +> 💡 If you have multiple taxa or sequences you wish to remove (e.g., the host genome and then also PhiX - common quality-control reagent during sequencing) you can simply concatenate the FASTAs of each taxa or sequences into a single reference file. + +#### Run Merging + +For samples that may have been sequenced over multiple runs, or for FASTQ files split into multiple chunks, you can activate the ability to merge across all runs or chunks with `--perform_runmerging`. + +For more information how to set up your input samplesheet, see [Multiple runs of the same sample](#multiple-runs-of-the-same-sample). + +Activating this functionality will concatenate togther the FASTQ files with the same sample name _after_ the optional preprocessing steps and prior profiling. Note that libraries with runs of different pairment types will **not** the different types merged together, and output files will indicate with a `_se` or `_pe` suffix to the sample name accordingly. + +You can optionally save the FASTQ output of the run merging with the `--save_runmerged_reads`. + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: From 457c13e8b74f4fef01f8631bbbdfb9583777760c Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 17 Apr 2022 17:55:23 +0200 Subject: [PATCH 5/7] Add additional tips for host removal --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 528aeef..919569c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -190,7 +190,7 @@ Similarly to complexity filtering, host-removal can be useful for runtime optimi nf-core/taxprofiler currently offers host-removal via alignment against a reference genome with Bowtie2, and the use of the unaligned reads for downstream profiling. -You can supply your reference genome in FASTA format with `--shortread_hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index`, however nf-core/taxprofiler will generate this for you if necessary. +You can supply your reference genome in FASTA format with `--shortread_hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index`, however nf-core/taxprofiler will generate this for you if necessary. Pre-supplying the directory of index files can greatly speed up the process, and these can be re-used. > 💡 If you have multiple taxa or sequences you wish to remove (e.g., the host genome and then also PhiX - common quality-control reagent during sequencing) you can simply concatenate the FASTAs of each taxa or sequences into a single reference file. From 87777a428959bc89f4f6cfda956c43312d4b6d16 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Tue, 19 Apr 2022 12:54:57 +0200 Subject: [PATCH 6/7] Update subworkflows/local/profiling.nf Co-authored-by: Moritz E. Beber --- subworkflows/local/profiling.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index d7532b7..5f9c111 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -106,7 +106,7 @@ workflow PROFILING { // when no pair-merging def meta_new = meta.clone() meta_new['db_name'] = meta.id - meta_new['id'] = rma.name - ( '.' + rma.extension ) + meta_new['id'] = rma.baseName [ meta_new, rma ] } From ac714dd30f758a0f419ad4155b296cdba5e8a80a Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 20 Apr 2022 09:22:10 +0200 Subject: [PATCH 7/7] Apply suggestions from code review Co-authored-by: Moritz E. Beber --- docs/usage.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 919569c..f4f11fb 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,7 +10,7 @@ ## Samplesheet inputs -nf-core/profiler can accept as input raw or preprocessed single- or paired-end short-read (e.g. Illumina) FASTQ files, long-read FASTQ files (e.g. Oxford Nanopore), or FASTA sequences (when accepted by a profiler). +nf-core/taxprofiler can accept as input raw or preprocessed single- or paired-end short-read (e.g. Illumina) FASTQ files, long-read FASTQ files (e.g. Oxford Nanopore), or FASTA sequences (available for a subset of profilers). You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below. Furthermother, nf-core/taxprofiler also requires a second comma-separated file of 3 columns with a header row as in the examples below. @@ -62,8 +62,8 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p ### Full database sheet nf-core/taxprofiler supports multiple databases being profiled in parallel for each tool. -Databases can be supplied either be in the form of a compressed `.tar.gz` archive of a folder containing all relevant database files or the path to an (uncompressed) directory. -The pipelines takes the locations these of databases as input, and specific parameters for each, via a 4 column comma-separated sheet. +Databases can be supplied either in the form of a compressed `.tar.gz` archive of a directory containing all relevant database files or the path to a directory on the filesystem. +The pipeline takes the locations and specific parameters of these databases as input via a four column comma-separated sheet. > ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user. See below for more information of the expected database files. @@ -163,7 +163,7 @@ There are currently two options for short-read preprocessing: `fastp` or `adapte For adapter clipping, you can either rely on tool default adapter sequences, or supply your own adapters (`--shortread_clipmerge_adapter1` and `--shortread_clipmerge_adapter2`) By default, paired-end merging is not activated and paired-end profiling is performed where supported otherwise pairs will be independently profiled. 
If paired-end merging is activated you can also specify whether to exclude unmerged reads in the reads sent for profiling (`--shortread_clipmerge_mergepairs` and `--shortread_clipmerge_excludeunmerged`). -You can also turn off clipping and only perform paired-end merging, if requested. This can be useful when processing data downloaded from the ENA, SRA, or DDBJ (--shortread_clipmerge_skipadaptertrim). +You can also turn off clipping and only perform paired-end merging, if requested. This can be useful when processing data downloaded from the ENA, SRA, or DDBJ (`--shortread_clipmerge_skipadaptertrim`). Both tools support length filtering of reads and can be tuned with `--shortread_clipmerge_minlength`. Performing length filtering can be useful to remove short (often low sequencing complexity) sequences that result in unspecific classification and therefore slow down runtime during profiling, with minimal gain. There is currently one option for long-read Oxford Nanopore processing: `porechop`. @@ -174,7 +174,7 @@ For both short-read and long-read preprocessing, you can optionally save the res Complexity filtering can be activated via the `--perform_shortread_complexityfilter` flag. -Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informatic taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources. +Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources. There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/). @@ -200,7 +200,7 @@ For samples that may have been sequenced over multiple runs, or for FASTQ files For more information how to set up your input samplesheet, see [Multiple runs of the same sample](#multiple-runs-of-the-same-sample). -Activating this functionality will concatenate togther the FASTQ files with the same sample name _after_ the optional preprocessing steps and prior profiling. Note that libraries with runs of different pairment types will **not** the different types merged together, and output files will indicate with a `_se` or `_pe` suffix to the sample name accordingly. +Activating this functionality will concatenate the FASTQ files with the same sample name _after_ the optional preprocessing steps and _before_ profiling. Note that libraries with runs of different pairing types will **not** be merged and this will be indicated on output files with a `_se` or `_pe` suffix to the sample name accordingly. You can optionally save the FASTQ output of the run merging with the `--save_runmerged_reads`.
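Finally, as a sketch tying the new MALT/MEGAN output of this patch series to the preprocessing and run-merging options above (placeholder file names and an assumed `-profile docker`, as in the earlier sketches):

```console
nextflow run nf-core/taxprofiler \
    --input samplesheet.csv \
    --databases database.csv \
    --perform_shortread_clipmerge \
    --perform_runmerging \
    --save_runmerged_reads \
    --run_malt \
    --malt_generatemegansummary \
    -profile docker
```

Here MALT's RMA6 output is converted by the new `MEGAN_RMA2INFO` step into a `-c2c Taxonomy` text table (`*.txt.gz`), while `--malt_generatemegansummary` additionally requests the optional `.megan` summary file.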