diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5cea5b7..11db52e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,18 +18,20 @@ jobs: if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/taxprofiler') }}" runs-on: ubuntu-latest strategy: + fail-fast: false matrix: NXF_VER: - "21.10.3" - "latest-everything" parameters: + - "--preprocessing_qc_tool falco" - "--perform_longread_qc false" - "--perform_shortread_qc false" - "--shortread_qc_tool fastp" - - "--shortread_qc_tool fastp --shortread_qc_mergepairs --shortread_qc_excludeunmerged" + - "--shortread_qc_tool fastp --shortread_qc_mergepairs --shortread_qc_includeunmerged" - "--shortread_qc_tool fastp --shortread_qc_mergepairs" - "--shortread_qc_tool adapterremoval" - - "--shortread_qc_tool adapterremoval --shortread_qc_mergepairs --shortread_qc_excludeunmerged" + - "--shortread_qc_tool adapterremoval --shortread_qc_mergepairs --shortread_qc_includeunmerged" - "--shortread_qc_tool adapterremoval --shortread_qc_mergepairs" - "--shortread_complexityfilter_tool bbduk" - "--shortread_complexityfilter_tool prinseqplusplus" @@ -99,3 +101,69 @@ jobs: with: command: nextflow run ${GITHUB_WORKSPACE} -profile test_motus,docker --outdir ./results --databases ./database_motus.csv attempt_limit: 3 + + krakenuniq: + name: Test KrakenUniq with workflow parameters + if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/taxprofiler') }} + runs-on: ubuntu-latest + strategy: + matrix: + NXF_VER: + - "21.10.3" + - "latest-everything" + + steps: + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v1 + with: + version: "${{ matrix.NXF_VER }}" + + - name: Show current locale + run: locale + + - name: Set UTF-8 enabled locale + run: | + sudo locale-gen en_US.UTF-8 + sudo update-locale LANG=en_US.UTF-8 + + - name: Run pipeline 
with test data + uses: Wandalen/wretry.action@v1.0.11 + with: + command: nextflow run ${GITHUB_WORKSPACE} -profile test_krakenuniq,docker --outdir ./results + attempt_limit: 3 + + malt: + name: Test MALT with workflow parameters + if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/taxprofiler') }} + runs-on: ubuntu-latest + strategy: + matrix: + NXF_VER: + - "21.10.3" + - "latest-everything" + + steps: + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v1 + with: + version: "${{ matrix.NXF_VER }}" + + - name: Show current locale + run: locale + + - name: Set UTF-8 enabled locale + run: | + sudo locale-gen en_US.UTF-8 + sudo update-locale LANG=en_US.UTF-8 + + - name: Run pipeline with test data + uses: Wandalen/wretry.action@v1.0.11 + with: + command: nextflow run ${GITHUB_WORKSPACE} -profile test_nothing,docker --run_malt --outdir ./results + attempt_limit: 3 diff --git a/CITATIONS.md b/CITATIONS.md index 1ce4ec2..daf9022 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -36,6 +36,14 @@ > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0. +- [KrakenUniq](https://doi.org/10.1186/s13059-018-1568-0) + + > Breitwieser, Florian P., Daniel N. Baker, and Steven L. Salzberg. 2018. KrakenUniq: confident and fast metagenomics classification using unique k-mer counts. Genome Biology 19 (1): 198. doi: 10.1186/s13059-018-1568-0 + +- [Bracken](https://doi.org/10.7717/peerj-cs.104) + + > Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: Estimating species abundance in metagenomics data. PeerJ Computer Science, 3, e104. doi: 10.7717/peerj-cs.104 + - [Krona](https://doi.org/10.1186/1471-2105-12-385) > Ondov, Brian D., Nicholas H. Bergman, and Adam M. Phillippy. 2011. Interactive metagenomic visualization in a Web browser. 
BMC Bioinformatics 12 (1): 385. doi: 10.1186/1471-2105-12-385. @@ -62,6 +70,10 @@ - [FILTLONG](https://github.com/rrwick/Filtlong) +- [falco](https://doi.org/10.12688/f1000research.21142.2) + +> de Sena Brandine G and Smith AD. Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Research 2021, 8:1874 + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index 11eb9a3..5a8884e 100644 --- a/README.md +++ b/README.md @@ -30,13 +30,14 @@ On release, automated continuous integration tests run the pipeline on a full-si ![](docs/images/taxprofiler_tube.png) -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) or [`falco`](https://github.com/smithlabcode/falco) as an alternative option) 2. Performs optional read pre-processing - - Adapter clipping and merging (short read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long read: [porechop](https://github.com/rrwick/Porechop)) - - Low complexity filtering ([bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus)) - - Host read removal ([BowTie2](http://bowtie-bio.sourceforge.net/bowtie2/)) + - Adapter clipping and merging (short-read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long-read: [porechop](https://github.com/rrwick/Porechop)) + - Low complexity and quality filtering (short-read: [bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus); long-read: [Filtlong](https://github.com/rrwick/Filtlong)) + - Host-read removal (short-read: [BowTie2](http://bowtie-bio.sourceforge.net/bowtie2/); long-read: 
[Minimap2](https://github.com/lh3/minimap2)) - Run merging -3. Performs taxonomic profiling using one or more of: +3. Supports statistics for host-read removal ([Samtools](http://www.htslib.org/)) +4. Performs taxonomic profiling using one or more of: - [Kraken2](https://ccb.jhu.edu/software/kraken2/) - [MetaPhlAn3](https://huttenhower.sph.harvard.edu/metaphlan/) - [MALT](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/malt/) @@ -44,15 +45,16 @@ On release, automated continuous integration tests run the pipeline on a full-si - [Centrifuge](https://ccb.jhu.edu/software/centrifuge/) - [Kaiju](https://kaiju.binf.ku.dk/) - [mOTUs](https://motu-tool.org/) - - [MetaMaps](https://github.com/DiltheyLab/MetaMaps) -4. Perform optional post-processing with: + - [KrakenUniq](https://github.com/fbreitwieser/krakenuniq) +5. Perform optional post-processing with: - [bracken](https://ccb.jhu.edu/software/bracken/) -5. Standardises output tables -6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +6. Standardises output tables +7. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +8. Plotting Kraken2, Centrifuge, Kaiju and MALT results ([`Krona`](https://hpc.nih.gov/apps/kronatools.html)) ## Quick Start -1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=21.10.3`) +1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=21.10.3`). 2. 
Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_. @@ -87,7 +89,7 @@ nf-core/taxprofiler was originally written by nf-core community. We thank the following people for their extensive assistance in the development of this pipeline: -[James A. Fellows Yates](https://github.com/jfy133), [Moritz Beber](https://github.com/Midnighter), [Lauri Mesilaakso](https://github.com/ljmesi), [Sofia Stamouli](https://github.com/sofsam), [Maxime Borry](https://github.com/maxibor). +[James A. Fellows Yates](https://github.com/jfy133), [Moritz Beber](https://github.com/Midnighter), [Lauri Mesilaakso](https://github.com/ljmesi), [Sofia Stamouli](https://github.com/sofsam), [Maxime Borry](https://github.com/maxibor), [Thomas A. Christensen II](https://github.com/MillironX), [Jianhong Ou](https://github.com/jianhong), [Rafal Stepien](https://github.com/rafalstepien), [Mahwash Jamy](https://github.com/mjamy).
## Contributions and Support diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index e2b5a6e..e4a04a9 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -21,6 +21,7 @@ run_modules: - adapterRemoval - fastp - bowtie2 + - samtools - kraken - malt - custom_content diff --git a/conf/modules.config b/conf/modules.config index d2a0051..dd85c0c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -40,11 +40,29 @@ process { ] } + withName: FALCO { + ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } + publishDir = [ + path: { "${params.outdir}/falco/raw" }, + mode: params.publish_dir_mode, + pattern: '*.{html,txt}' + ] + } + + withName: FALCO_PROCESSED { + ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } + publishDir = [ + path: { "${params.outdir}/falco/processed" }, + mode: params.publish_dir_mode, + pattern: '*.{html,txt}' + ] + } + withName: FASTP_SINGLE { ext.args = [ // trimming options params.shortread_qc_skipadaptertrim ? "--disable_adapter_trimming" : "", - params.shortread_qc_adapter1 ? "--adapter_sequence ${params.shortread_qc_adapter1}" : "", + params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter1 ? "--adapter_sequence ${params.shortread_qc_adapter1}" : "", // filtering options "--length_required ${params.shortread_qc_minlength}", (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp') ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' @@ -61,11 +79,11 @@ process { withName: FASTP_PAIRED { ext.args = [ // collapsing options - option to retain singletons - params.shortread_qc_excludeunmerged ? '' : "--include_unmerged", + params.shortread_qc_includeunmerged ? '--include_unmerged' : '', // trimming options params.shortread_qc_skipadaptertrim ? "--disable_adapter_trimming" : "", - params.shortread_qc_adapter1 ? 
"--adapter_sequence ${params.shortread_qc_adapter1}" : "", - params.shortread_qc_adapter2 ? "--adapter_sequence_r2 ${params.shortread_qc_adapter2}" : "--detect_adapter_for_pe", + params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter1 ? "--adapter_sequence ${params.shortread_qc_adapter1}" : "", + params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter2 ? "--adapter_sequence_r2 ${params.shortread_qc_adapter2}" : "--detect_adapter_for_pe", // filtering options "--length_required ${params.shortread_qc_minlength}", params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' @@ -82,8 +100,7 @@ process { withName: ADAPTERREMOVAL_SINGLE { ext.args = [ // trimming options - params.shortread_qc_skipadaptertrim ? "--adapter1 '' --adapter2 ''" : "", - params.shortread_qc_adapter1 ? "--adapter1 ${params.shortread_qc_adapter1}" : "", + params.shortread_qc_skipadaptertrim ? "--adapter1 ''" : params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter1 ? "--adapter1 ${params.shortread_qc_adapter1}" : "", // filtering options "--minlength ${params.shortread_qc_minlength}" ].join(' ').trim() @@ -101,9 +118,8 @@ process { // collapsing options params.shortread_qc_mergepairs ? "--collapse" : "", // trimming options - params.shortread_qc_skipadaptertrim ? "--adapter1 '' --adapter2 ''" : "", - params.shortread_qc_adapter1 ? "--adapter1 ${params.shortread_qc_adapter1}" : "", - params.shortread_qc_adapter2 ? "--adapter2 ${params.shortread_qc_adapter2}" : "", + params.shortread_qc_skipadaptertrim ? "--adapter1 ''" : params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter1 ? "--adapter1 ${params.shortread_qc_adapter1}" : "", // adding adapter list happens at module input channel level + params.shortread_qc_skipadaptertrim ? "--adapter2 ''" : params.shortread_qc_adapterlist ? 
"" : params.shortread_qc_adapter2 ? "--adapter2 ${params.shortread_qc_adapter2}" : "", // filtering options "--minlength ${params.shortread_qc_minlength}" ].join(' ').trim() @@ -116,7 +132,7 @@ process { ] } - withName: PORECHOP { + withName: PORECHOP_PORECHOP { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ path: { "${params.outdir}/porechop" }, @@ -215,6 +231,15 @@ process { ] } + withName: SAMTOOLS_STATS { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/samtools/stats" }, + mode: params.publish_dir_mode, + pattern: '*stats' + ] + } + withName: BBMAP_BBDUK { ext.args = [ "entropy=${params.shortread_complexityfilter_entropy}", @@ -256,7 +281,7 @@ process { } withName: MALT_RUN { - ext.args = { "${meta.db_params}" } + ext.args = { "${meta.db_params} -m ${params.malt_mode}" } // one run with multiple samples, so fix ID to just db name to ensure clean log name ext.prefix = { "${meta.db_name}" } publishDir = [ @@ -277,7 +302,7 @@ process { } withName: KRAKEN2_KRAKEN2 { - ext.args = { "${meta.db_params}" } + ext.args = params.kraken2_save_minimizers ? { "${meta.db_params} --report-minimizer-data" } : { "${meta.db_params}" } ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/kraken2/${meta.db_name}/" }, @@ -286,6 +311,16 @@ process { ] } + withName: BRACKEN_BRACKEN { + errorStrategy = 'ignore' + ext.prefix = params.perform_runmerging ? 
{ "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/bracken/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ] + } + withName: KRAKENTOOLS_COMBINEKREPORTS { ext.prefix = { "kraken2_${meta.id}_combined_reports" } publishDir = [ @@ -295,6 +330,17 @@ process { ] } + withName: KRAKENUNIQ_PRELOADEDKRAKENUNIQ { + ext.args = { "${meta.db_params}" } + // one run with multiple samples, so fix ID to just db name to ensure clean log name + ext.prefix = { "${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/krakenuniq/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt,report,fastq.gz}' + ] + } + withName: KRONA_CLEANUP { ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ @@ -411,6 +457,13 @@ process { } withName: MOTUS_PROFILE { + ext.args = { + [ + params.motus_remove_ncbi_ids ? "" : "-p", + params.motus_use_relative_abundance ? "" : "-c", + params.motus_save_mgc_read_counts ? "-M ${task.ext.prefix}.mgc" : "" + ].join(',').replaceAll(','," ") + } ext.prefix = params.perform_runmerging ? 
{ "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/motus/${meta.db_name}/" }, diff --git a/conf/test.config b/conf/test.config index d5dcd67..db9f81d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -34,10 +34,12 @@ params { hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' run_kaiju = true run_kraken2 = true - run_malt = true + run_bracken = true + run_malt = false run_metaphlan3 = true run_centrifuge = true run_diamond = true + run_krakenuniq = true run_motus = false run_krona = true krona_taxonomy_directory = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab' @@ -50,6 +52,7 @@ params { process { withName: MALT_RUN { maxForks = 1 + ext.args = { "-m ${params.malt_mode} -J-Xmx12G" } } withName: MEGAN_RMA2INFO_TSV { maxForks = 1 diff --git a/conf/test_krakenuniq.config b/conf/test_krakenuniq.config new file mode 100644 index 0000000..67b559e --- /dev/null +++ b/conf/test_krakenuniq.config @@ -0,0 +1,72 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// +// Separate test as KrakenUniq database can sometimes be too big for GHA +// + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test to check KrakenUniq function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_krakenuniq.csv' + perform_shortread_qc = true + perform_longread_qc = true + shortread_qc_mergepairs = true + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_bracken = false + run_malt = false + run_metaphlan3 = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = true + run_motus = false + run_krona = true + krona_taxonomy_directory = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab' + malt_save_reads = true + kraken2_save_reads = true + centrifuge_save_reads = true + diamond_save_reads = true +} + +process { + withName: MALT_RUN { + maxForks = 1 + } + withName: MEGAN_RMA2INFO_TSV { + maxForks = 1 + } + withName: MEGAN_RMA2INFO_KRONA { + maxForks = 1 + } + withName: 'EIDO_VALIDATE' { + ext.args = '--st-index sample' + } + 
withName: 'EIDO_CONVERT' { + ext.args = '--st-index sample' + } +} diff --git a/conf/test_motus.config b/conf/test_motus.config index d5eb8f8..ee41447 100644 --- a/conf/test_motus.config +++ b/conf/test_motus.config @@ -10,6 +10,10 @@ ---------------------------------------------------------------------------------------- */ +// +// Separate test as mOTUs database download can be flaky +// + params { config_profile_name = 'mOTUs Test profile' config_profile_description = 'Minimal test to check mOTUs function' @@ -33,10 +37,15 @@ params { hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' run_kaiju = false run_kraken2 = false + run_bracken = false run_malt = false run_metaphlan3 = false run_centrifuge = false run_diamond = false + run_krakenuniq = false run_motus = true + motus_save_mgc_read_counts = false + motus_remove_ncbi_ids = false + motus_use_relative_abundance = false run_profile_standardisation = true } diff --git a/conf/test_nopreprocessing.config b/conf/test_nopreprocessing.config index 3908b56..1a36159 100644 --- a/conf/test_nopreprocessing.config +++ b/conf/test_nopreprocessing.config @@ -33,10 +33,12 @@ params { hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' run_kaiju = true run_kraken2 = true + run_bracken = true run_malt = true run_metaphlan3 = true run_centrifuge = true run_diamond = true + run_krakenuniq = true run_motus = false run_krona = true } @@ -44,5 +46,6 @@ params { process { withName: MALT_RUN { maxForks = 1 + ext.args = { "-m ${params.malt_mode} -J-Xmx12G" } } } diff --git a/conf/test_noprofiling.config b/conf/test_noprofiling.config index 12c7185..3ca715b 100644 --- a/conf/test_noprofiling.config +++ b/conf/test_noprofiling.config @@ -34,10 +34,12 @@ params { hostremoval_reference = 
'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' run_kaiju = false run_kraken2 = false + run_bracken = false run_malt = false run_metaphlan3 = false run_centrifuge = false run_diamond = false + run_krakenuniq = false run_motus = false } diff --git a/conf/test_nothing.config b/conf/test_nothing.config index c0ecece..b95deb4 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -33,15 +33,18 @@ params { hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' run_kaiju = false run_kraken2 = false + run_bracken = false run_malt = false run_metaphlan3 = false run_centrifuge = false run_diamond = false + run_krakenuniq = false run_motus = false } process { withName: MALT_RUN { maxForks = 1 + ext.args = { "-m ${params.malt_mode} -J-Xmx12G" } } } diff --git a/conf/test_pep.config b/conf/test_pep.config index 7f8c95d..6ce788d 100644 --- a/conf/test_pep.config +++ b/conf/test_pep.config @@ -19,6 +19,7 @@ params { hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' run_kaiju = true run_kraken2 = true + run_bracken = true run_malt = true run_metaphlan3 = true run_centrifuge = true @@ -36,6 +37,7 @@ params { process { withName: MALT_RUN { maxForks = 1 + ext.args = { "-m ${params.malt_mode} -J-Xmx12G" } } withName: MEGAN_RMA2INFO { maxForks = 1 diff --git a/docs/images/taxprofiler_tube.pdf b/docs/images/taxprofiler_tube.pdf index 63d8c06..b0b10bb 100644 Binary files a/docs/images/taxprofiler_tube.pdf and b/docs/images/taxprofiler_tube.pdf differ diff --git a/docs/images/taxprofiler_tube.png b/docs/images/taxprofiler_tube.png index 0d0a4aa..1a3b464 100644 Binary files a/docs/images/taxprofiler_tube.png and b/docs/images/taxprofiler_tube.png differ diff --git a/docs/images/taxprofiler_tube.svg b/docs/images/taxprofiler_tube.svg index 
0ea3460..c4d2403 100644 --- a/docs/images/taxprofiler_tube.svg +++ b/docs/images/taxprofiler_tube.svg @@ -7,7 +7,7 @@ viewBox="0 0 555.62502 211.66668" version="1.1" id="svg5" - inkscape:version="1.2 (1:1.2.1+202207142221+cd75a1ee6d)" + inkscape:version="1.2.1 (1:1.2.1+202210291243+9c6d41e410)" sodipodi:docname="taxprofiler_tube.svg" xml:space="preserve" inkscape:export-filename="taxprofiler_tube.png" @@ -26,7 +26,7 @@ inkscape:pageopacity="0.0" inkscape:pagecheckerboard="true" inkscape:document-units="mm" - showgrid="true" + showgrid="false" inkscape:snap-bbox="true" inkscape:bbox-nodes="true" inkscape:snap-bbox-edge-midpoints="false" @@ -36,11 +36,11 @@ fit-margin-left="0" fit-margin-right="0" fit-margin-bottom="0" - inkscape:zoom="0.71542514" - inkscape:cx="808.61011" - inkscape:cy="440.29764" + inkscape:zoom="1.0117639" + inkscape:cx="596.48301" + inkscape:cy="463.05268" inkscape:window-width="1920" - inkscape:window-height="1043" + inkscape:window-height="1016" inkscape:window-x="0" inkscape:window-y="0" inkscape:window-maximized="1" @@ -2184,8 +2184,8 @@ id="rect2500" width="555.625" height="211.66667" - x="-793.74982" - y="42.333302" + x="-793.74988" + y="42.333309" ry="0" />FILTERING)(HOST REMOVAL)(HOST REMOVAL)(RUN MERGING)(RUN MERGING)Bowtie2samtoolsstatssamtoolsstatscatFastQCFastQCfalcoFastQCfalcoFastQCFastQCfalcoFastQCFastQCfalcotaxprofiler + id="path1461-29-9-8-6-6-7-69" /> diff --git a/docs/usage.md b/docs/usage.md index 5769bd9..8ec435a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -47,6 +47,7 @@ The pipeline will auto-detect whether a sample is single- or paired-end using th A final samplesheet file consisting of both single- and paired-end data, as well as long-read FASTA files may look something like the one below. This is for 6 samples, where `2612` has been sequenced twice. 
```console +sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta 2611,ERR5766174,ILLUMINA,,,///fasta/ERX5474930_ERR5766174_1.fa.gz 2612,ERR5766176,ILLUMINA,///fastq/ERX5474932_ERR5766176_1.fastq.gz,///fastq/ERX5474932_ERR5766176_2.fastq.gz, 2612,ERR5766180,ILLUMINA,///fastq/ERX5474936_ERR5766180_1.fastq.gz,, @@ -73,14 +74,15 @@ The pipeline takes the locations and specific profiling parameters of the tool o > ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user. See below for more information of the expected database files. -An example database sheet can look as follows, where 4 tools are being used, and `malt` and `kraken2` will be used against two databases each. +An example database sheet can look as follows, where 5 tools are being used, and `malt` and `kraken2` will be used against two databases each. This is because specifying `bracken` implies first running `kraken2` on the same database. 
```console tool,db_name,db_params,db_path malt,malt85,-id 85,///malt/testdb-malt/ malt,malt95,-id 90,///malt/testdb-malt.tar.gz -kraken2,db1,,///kraken2/testdb-kraken2.tar.gz +bracken,db1,,///bracken/testdb-bracken.tar.gz kraken2,db2,--quick,///kraken2/testdb-kraken2.tar.gz +krakenuniq,db3,,///krakenuniq/testdb-krakenuniq.tar.gz centrifuge,db1,,///centrifuge/minigut_cf.tar.gz metaphlan3,db1,,///metaphlan3/metaphlan_database/ motus,db_mOTU,,///motus/motus_database/ @@ -90,8 +92,8 @@ Column specifications are as follows: | Column | Description | | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tool` | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. | -| `db_name` | A unique name of the particular database [required]. | +| `tool` | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. Please note that `bracken` also implies running `kraken2` on the same database. | +| `db_name` | A unique name per tool for the particular database [required]. Please note that names need to be unique across both `kraken2` and `bracken` as well, even if re-using the same database. | | `db_params` | Any parameters of the given taxonomic profiler that you wish to specify that the taxonomic profiling tool should use when profiling against this specific. Can be empty to use taxonomic profiler defaults. Must not be surrounded by quotes [required]. 
We generally do not recommend specifying parameters here that turn on/off saving of output files or specifying particular file extensions - this should be already addressed via pipeline parameters. | | `db_path` | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required]. | @@ -115,6 +117,21 @@ Expected (uncompressed) database files for each tool are as follows: - `opts.k2d` - `hash.k2d` - `taxo.k2d` +- **Bracken** output of a combined `kraken2-` and `bracken-build` process. Please see the [documentation on Bracken](https://github.com/jenniferlu717/Bracken#running-bracken-easy-version) for details. The output is a directory containing files per expected sequencing read length similarly to: + - `hash.k2d` + - `opts.k2d` + - `taxo.k2d` + - `database.kraken` + - `database100mers.kmer_distrib` + - `database100mers.kraken` + - `database150mers.kmer_distrib` + - `database150mers.kraken` +- **KrakenUniq** output of `krakenuniq-build` command(s). A directory containing: + - `opts.k2d` + - `hash.k2d` + - `taxo.k2d` + - `database.idx` + - `taxDB` - **Centrifuge** output of `centrifuge-build`. A directory containing: - `..cf` - `..cf` @@ -166,6 +183,10 @@ work # Directory containing the nextflow working files # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` +### Sequencing quality control + +nf-core/taxprofiler offers [`falco`](https://github.com/smithlabcode/falco) as an alternative option to [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
+ ### Preprocessing Steps nf-core/taxprofiler offers four main preprocessing steps @@ -179,12 +200,12 @@ nf-core/taxprofiler offers four main preprocessing steps Raw sequencing read processing in the form of adapter clipping and paired-end read merging can be activated via the `--perform_shortread_qc` or `--perform_longread_qc` flags. -It is highly recommended to run this on raw reads to remove artefacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles. +It is highly recommended to run this on raw reads to remove artifacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles. There are currently two options for short-read preprocessing: `fastp` or `adapterremoval`. For adapter clipping, you can either rely on tool default adapter sequences, or supply your own adapters (`--shortread_qc_adapter1` and `--shortread_qc_adapter2`) -By default, paired-end merging is not activated and paired-end profiling is performed where supported otherwise pairs will be independently profiled. If paired-end merging is activated you can also specify whether to exclude unmerged reads in the reads sent for profiling (`--shortread_qc_mergepairs` and `--shortread_qc_excludeunmerged`). +By default, paired-end merging is not activated and paired-end profiling is performed where supported otherwise pairs will be independently profiled. If paired-end merging is activated you can also specify whether to include unmerged reads in the reads sent for profiling (`--shortread_qc_mergepairs` and `--shortread_qc_includeunmerged`). You can also turn off clipping and only perform paired-end merging, if requested. This can be useful when processing data downloaded from the ENA, SRA, or DDBJ (`--shortread_qc_skipadaptertrim`). 
Both tools support length filtering of reads and can be tuned with `--shortread_qc_minlength`. Performing length filtering can be useful to remove short (often low sequencing complexity) sequences that result in unspecific classification and therefore slow down runtime during profiling, with minimal gain. @@ -230,9 +251,31 @@ You can optionally save the FASTQ output of the run merging with the `--save_run #### Profiling +###### Bracken + +It is unclear whether Bracken is suitable for running long reads, as it makes certain assumptions about read lengths. Furthermore, during testing we found issues where Bracken would fail on the long-read test data. Therefore nf-core/taxprofiler does not run Bracken on data specified as being sequenced with `OXFORD_NANOPORE` in the input samplesheet. If you believe this to be wrong, please contact us on the nf-core slack and we can discuss this. + +###### Centrifuge + +Centrifuge currently does not accept FASTA files as input, therefore no output will be produced for these input files. + +###### DIAMOND + +DIAMOND only allows output of a single format at a time, therefore supplying parameters such as `--diamond_save_reads` will result in only aligned reads in SAM format being produced; no taxonomic profiles will be available. Be aware of this when setting up your pipeline runs, depending on your particular use case. + ###### MALT -nf-core/taxprofiler uses MALT 0.4.1, which is a compatively old version. However it has been found that the most recent version of MALT (0.5.\*), at the time of writing, is broken. [The the LCA step appears not to be executed](http://megan.informatik.uni-tuebingen.de/t/lca-placement-failure-with-malt-v-0-5-2-and-0-5-3/1996/3), pushing all hits to the leaves of the taxonomy. However, if you need to use a more recent taxonomy map file with your databases, the output of `malt-build` from MALT 0.5.3 should be still be compatible with `malt-run` of 0.4.1.
+MALT does not support paired-end read alignment (unlike other tools), therefore nf-core/taxprofiler aligns these as independent files if read-merging is skipped. If you skip merging, you can sum or average the results of the counts of the pairs. + +Krona can only be run on MALT output if the path to a Krona taxonomy database is supplied to `--krona_taxonomy_directory`. Therefore, if you do not supply a Krona directory, Krona plots will not be produced for MALT. + +###### MetaPhlAn3 + +MetaPhlAn3 currently does not accept FASTA files as input, therefore no output will be produced for these input files. + +###### mOTUs + +mOTUs currently does not accept FASTA files as input, therefore no output will be produced for these input files. ### Updating the pipeline @@ -461,7 +504,7 @@ malt-build -i path/to/fasta/files/*.{fna,fa} -s DNA -d index -t 8 -st 4 -a2t meg ## Troubleshooting and FAQs -### I get a warning during centrifuge_kreport process with exit status 255. +### I get a warning during centrifuge_kreport process with exit status 255 When a sample has insufficient hits for abundance estimation, the resulting `report.txt` file will be empty.
diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 7883d70..02d4347 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -12,9 +12,9 @@ class WorkflowMain { // TODO nf-core: Add Zenodo DOI for pipeline after first release //"* The pipeline\n" + //" https://doi.org/10.5281/zenodo.XXXXXXX\n\n" + - "* The nf-core framework\n" + - " https://doi.org/10.1038/s41587-020-0439-x\n\n" + - "* Software dependencies\n" + + '* The nf-core framework\n' + + ' https://doi.org/10.1038/s41587-020-0439-x\n\n' + + '* Software dependencies\n' + " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" } @@ -53,15 +53,15 @@ class WorkflowMain { System.exit(0) } + // Print parameter summary log to screen + + log.info paramsSummaryLog(workflow, params, log) + // Validate workflow parameters via the JSON schema if (params.validate_params) { NfcoreSchema.validateParameters(workflow, params, log) } - // Print parameter summary log to screen - - log.info paramsSummaryLog(workflow, params, log) - // Check that a -profile or Nextflow config has been provided to run the pipeline NfcoreTemplate.checkConfigProvided(workflow, log) @@ -90,4 +90,5 @@ class WorkflowMain { } return null } + } diff --git a/modules.json b/modules.json index 10d6c74..fb721a8 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "adapterremoval": { "branch": "master", - "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + "git_sha": "ce7cf27e377fdacf7ebe8e75903ec70405ea1659" }, "bbmap/bbduk": { "branch": "master", @@ -21,6 +21,10 @@ "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" }, + "bracken/bracken": { + "branch": "master", + "git_sha": "8cab56516076b23c6f8eb1ac20ba4ce9692c85e1" + }, "cat/fastq": { "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" @@ -49,9 +53,13 @@ "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" }, + "falco": { + "branch": "master", + "git_sha": 
"fc959214036403ad83efe7a41d43d0606c445cda" + }, "fastp": { "branch": "master", - "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + "git_sha": "1e49f31e93c56a3832833eef90a02d3cde5a3f7e" }, "fastqc": { "branch": "master", @@ -89,6 +97,10 @@ "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" }, + "krakenuniq/preloadedkrakenuniq": { + "branch": "master", + "git_sha": "05649975c6611c6e007537a7984e186e12ae03af" + }, "krona/ktimporttaxonomy": { "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" @@ -99,7 +111,7 @@ }, "malt/run": { "branch": "master", - "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + "git_sha": "6d9712f03ec2de8264a50ee4541a617e1e063b51" }, "megan/rma2info": { "branch": "master", @@ -133,9 +145,9 @@ "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" }, - "porechop": { + "porechop/porechop": { "branch": "master", - "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + "git_sha": "2a4e85eb81875a572bb58133e37f84ba3cc484d7" }, "prinseqplusplus": { "branch": "master", @@ -145,6 +157,14 @@ "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" }, + "samtools/index": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "samtools/stats": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, "samtools/view": { "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" diff --git a/modules/local/kraken2_standard_report.nf b/modules/local/kraken2_standard_report.nf new file mode 100644 index 0000000..09a98c1 --- /dev/null +++ b/modules/local/kraken2_standard_report.nf @@ -0,0 +1,32 @@ +process KRAKEN2_STANDARD_REPORT { + tag "$meta.id" + label 'process_single' + + conda (params.enable_conda ? 'conda-forge::sed=4.8' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv2/biocontainers_v1.2.0_cv2.img' : + 'biocontainers/biocontainers:v1.2.0_cv2' }" + + input: + tuple val(meta), path(report) + + output: + tuple val(meta), path(result), emit: report + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + result = "${prefix}_standardized.kraken2.report.txt" + """ + cut -f1-3,6-8 '${report}' > '${result}' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cut: \$(echo \$(cut --version 2>&1) | sed 's/^.*(GNU coreutils) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} + diff --git a/modules/nf-core/adapterremoval/main.nf b/modules/nf-core/adapterremoval/main.nf index 0e17c05..643c141 100644 --- a/modules/nf-core/adapterremoval/main.nf +++ b/modules/nf-core/adapterremoval/main.nf @@ -34,7 +34,7 @@ process ADAPTERREMOVAL { AdapterRemoval \\ --file1 $reads \\ $args \\ - $adapterlist \\ + $list \\ --basename ${prefix} \\ --threads ${task.cpus} \\ --seed 42 \\ @@ -61,7 +61,7 @@ process ADAPTERREMOVAL { --file1 ${reads[0]} \\ --file2 ${reads[1]} \\ $args \\ - $adapterlist \\ + $list \\ --basename ${prefix} \\ --threads $task.cpus \\ --seed 42 \\ diff --git a/modules/nf-core/bracken/bracken/main.nf b/modules/nf-core/bracken/bracken/main.nf new file mode 100644 index 0000000..ac7d1af --- /dev/null +++ b/modules/nf-core/bracken/bracken/main.nf @@ -0,0 +1,42 @@ +process BRACKEN_BRACKEN { + tag "$meta.id" + label 'process_low' + + // WARN: Version information not provided by tool on CLI. + // Please update version string below when bumping container versions. + conda (params.enable_conda ? "bioconda::bracken=2.7" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bracken:2.7--py39hc16433a_0': + 'quay.io/biocontainers/bracken:2.7--py39hc16433a_0' }" + + input: + tuple val(meta), path(kraken_report) + path database + + output: + tuple val(meta), path(bracken_report), emit: reports + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + bracken_report = "${prefix}.tsv" + // WARN: Version information not provided by tool on CLI. + // Please update version string below when bumping container versions. + def VERSION = '2.7' + """ + bracken \\ + ${args} \\ + -d '${database}' \\ + -i '${kraken_report}' \\ + -o '${bracken_report}' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bracken: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/bracken/bracken/meta.yml b/modules/nf-core/bracken/bracken/meta.yml new file mode 100644 index 0000000..4a05edd --- /dev/null +++ b/modules/nf-core/bracken/bracken/meta.yml @@ -0,0 +1,48 @@ +name: bracken_bracken +description: Re-estimate taxonomic abundance of metagenomic samples analyzed by kraken. +keywords: + - bracken + - metagenomics + - abundance + - kraken2 +tools: + - bracken: + description: Bracken (Bayesian Reestimation of Abundance with KrakEN) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample. + homepage: https://ccb.jhu.edu/software/bracken/ + documentation: https://ccb.jhu.edu/software/bracken/index.shtml?t=manual + tool_dev_url: https://github.com/jenniferlu717/Bracken + doi: "10.7717/peerj-cs.104" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - kraken_report: + type: file + description: TSV file with six columns coming from kraken2 output + pattern: "*.{tsv}" + - database: + type: file + description: Directory containing the kraken2/Bracken files for analysis + pattern: "*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reports: + type: file + description: TSV output report of the re-estimated abundances + pattern: "*.{tsv}" + +authors: + - "@Midnighter" diff --git a/modules/nf-core/falco/main.nf b/modules/nf-core/falco/main.nf new file mode 100644 index 0000000..1688162 --- /dev/null +++ b/modules/nf-core/falco/main.nf @@ -0,0 +1,57 @@ +process FALCO { + tag "$meta.id" + label 'process_single' + + + conda (params.enable_conda ? "bioconda::falco=1.2.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/falco:1.2.1--h867801b_3': + 'quay.io/biocontainers/falco:1.2.1--h867801b_3' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.html"), emit: html + tuple val(meta), path("*.txt") , emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ( reads.toList().size() == 1 ) { + """ + falco $args --threads $task.cpus ${reads} -D ${prefix}_data.txt -S ${prefix}_summary.txt -R ${prefix}_report.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco:\$( falco --version | sed -e "s/falco//g" ) + END_VERSIONS + """ + } else { + """ + falco $args --threads $task.cpus ${reads} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco:\$( falco --version | sed -e "s/falco//g" ) + END_VERSIONS + """ + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_data.txt + touch ${prefix}_fastqc_data.html + touch ${prefix}_summary.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco: \$( falco --version | sed -e "s/falco v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/falco/meta.yml b/modules/nf-core/falco/meta.yml new file mode 100644 index 0000000..6f77fb1 --- /dev/null +++ b/modules/nf-core/falco/meta.yml @@ -0,0 +1,52 @@ +name: falco +description: Run falco on sequenced reads +keywords: + - quality control + - qc + - adapters + - fastq +tools: + - fastqc: + description: "falco is a drop-in C++ implementation of FastQC to assess the quality of sequence reads." + + homepage: "https://falco.readthedocs.io/" + documentation: "https://falco.readthedocs.io/" + tool_dev_url: "None" + doi: "" + licence: "['GPL v3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - html: + type: file + description: FastQC like report + pattern: "*_{fastqc_report.html}" + - txt: + type: file + description: falco report data + pattern: "*_{data.txt}" + - txt: + type: file + description: falco summary file + pattern: "*_{summary.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@lucacozzuto" diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf index 11ea4db..207258a 100644 --- a/modules/nf-core/fastp/main.nf +++ b/modules/nf-core/fastp/main.nf @@ -9,6 +9,7 @@ process FASTP { input: tuple val(meta), path(reads) + path adapter_fasta val save_trimmed_fail val save_merged @@ -27,6 +28,7 @@ process FASTP { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def adapter_list = adapter_fasta ? "--adapter_fasta ${adapter_fasta}" : "" def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' // Added soft-links to original fastqs for consistent naming in MultiQC // Use single ended for interleaved. Add --interleaved_in in config. 
@@ -40,6 +42,7 @@ process FASTP { --thread $task.cpus \\ --json ${prefix}.fastp.json \\ --html ${prefix}.fastp.html \\ + $adapter_list \\ $fail_fastq \\ $args \\ 2> ${prefix}.fastp.log \\ @@ -61,6 +64,7 @@ process FASTP { --thread $task.cpus \\ --json ${prefix}.fastp.json \\ --html ${prefix}.fastp.html \\ + $adapter_list \\ $fail_fastq \\ $args \\ 2> ${prefix}.fastp.log @@ -82,6 +86,7 @@ process FASTP { --out2 ${prefix}_2.fastp.fastq.gz \\ --json ${prefix}.fastp.json \\ --html ${prefix}.fastp.html \\ + $adapter_list \\ $fail_fastq \\ $merge_fastq \\ --thread $task.cpus \\ diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml index 2368fde..6f6fad7 100644 --- a/modules/nf-core/fastp/meta.yml +++ b/modules/nf-core/fastp/meta.yml @@ -23,6 +23,10 @@ input: List of input FastQ files of size 1 and 2 for single-end and paired-end data, respectively. If you wish to run interleaved paired-end data, supply as single-end data but with `--interleaved_in` in your `modules.conf`'s `ext.args` for the module. + - adapter_fasta: + type: file + description: File in FASTA format containing possible adapters to remove. + pattern: "*.{fasta,fna,fas,fa}" - save_trimmed_fail: type: boolean description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz` diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf b/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf new file mode 100644 index 0000000..0ecacee --- /dev/null +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf @@ -0,0 +1,224 @@ +process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::krakenuniq=1.0.0" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/krakenuniq:1.0.0--pl5321h19e8d03_0': + 'quay.io/biocontainers/krakenuniq:1.0.0--pl5321h19e8d03_0' }" + + input: + tuple val(meta), path(fastqs) + path db + val ram_chunk_size + val save_output_fastqs + val report_file + val save_output + + output: + tuple val(meta), path('*.classified{.,_}*') , optional:true, emit: classified_reads_fastq + tuple val(meta), path('*.unclassified{.,_}*') , optional:true, emit: unclassified_reads_fastq + tuple val(meta), path('*classified.txt') , optional:true, emit: classified_assignment + tuple val(meta), path('*report.txt') , emit: report + + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args ?: '' + + def classified = meta.single_end ? '"\${PREFIX}.classified.fastq"' : '"\${PREFIX}.classified#.fastq"' + def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fastq"' : '"\${PREFIX}.unclassified#.fastq"' + def classified_option = save_output_fastqs ? "--classified-out ${classified}" : '' + def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : '' + def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : '' + def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : '' + def compress_reads_command = save_output_fastqs ? 'gzip --no-name *.fastq' : '' + if (meta.single_end) { + """ + krakenuniq \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus \\ + $args + + strip_suffix() { + local result=\$1 + # Strip any file extensions. 
+ echo "\${result%%.*}" + } + + printf "%s\\n" ${fastqs} | while read FASTQ; do \\ + PREFIX="\$(strip_suffix "\${FASTQ}")" + + krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $output_option \\ + $args2 \\ + "\${FASTQ}" + done + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + krakenuniq \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus \\ + $args + + strip_suffix() { + local result + read result + # Strip any trailing dot or underscore. + result="\${result%_}" + echo "\${result%.}" + } + + printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\ + read -r -a FASTQ <<< "\${FASTQ}" + PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)" + + krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $output_option \\ + --paired \\ + $args2 \\ + "\${FASTQ[@]}" + done + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } + + stub: + def args = task.ext.args ?: '' + def args2 = task.ext.args ?: '' + + def classified = meta.single_end ? '"\${PREFIX}.classified.fastq"' : '"\${PREFIX}.classified#.fastq"' + def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fastq"' : '"\${PREFIX}.unclassified#.fastq"' + def classified_option = save_output_fastqs ? "--classified-out ${classified}" : '' + def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : '' + def output_option = save_output ? 
'--output "\${PREFIX}.krakenuniq.classified.txt"' : '' + def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : '' + def compress_reads_command = save_output_fastqs ? 'gzip --no-name *.fastq' : '' + if (meta.single_end) { + """ + echo krakenuniq \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus \\ + $args + + strip_suffix() { + local result=\$1 + # Strip any file extensions. + echo "\${result%%.*}" + } + + printf "%s\\n" ${fastqs} | while read FASTQ; do \\ + echo "\${FASTQ}" + PREFIX="\$(strip_suffix "\${FASTQ}")" + echo "\${PREFIX}" + + echo krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $output_option \\ + $args2 \\ + "\${FASTQ}" + + touch "\${PREFIX}.classified.fastq.gz" + touch "\${PREFIX}.krakenuniq.classified.txt" + touch "\${PREFIX}.krakenuniq.report.txt" + touch "\${PREFIX}.unclassified.fastq.gz" + done + + echo $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + echo krakenuniq \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus \\ + $args + + strip_suffix() { + local result + read result + # Strip any trailing dot or underscore. 
+ result="\${result%_}" + echo "\${result%.}" + } + + printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\ + read -r -a FASTQ <<< "\${FASTQ}" + echo "\${FASTQ[@]}" + PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)" + echo "\${PREFIX}" + + echo krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $output_option \\ + --paired \\ + $args2 \\ + "\${FASTQ[@]}" + + touch "\${PREFIX}.classified_1.fastq.gz" "\${PREFIX}.classified_2.fastq.gz" + touch "\${PREFIX}.krakenuniq.classified.txt" + touch "\${PREFIX}.krakenuniq.report.txt" + touch "\${PREFIX}.unclassified_1.fastq.gz" "\${PREFIX}.unclassified_2.fastq.gz" + done + + echo $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml b/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml new file mode 100644 index 0000000..4ac645c --- /dev/null +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml @@ -0,0 +1,78 @@ +name: "krakenuniq_preloadedkrakenuniq" +description: Classifies metagenomic sequence data using unique k-mer counts +keywords: + - classify + - metagenomics + - kmers + - fastq + - db +tools: + - "krakenuniq": + description: "Metagenomics classifier with unique k-mer counting for more specific results" + homepage: https://github.com/fbreitwieser/krakenuniq + documentation: https://github.com/fbreitwieser/krakenuniq + doi: 10.1186/s13059-018-1568-0 + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fastqs: + type: file + description: List of input FastQ files + - db: + type: directory + description: KrakenUniq database + - ram_chunk_size: + type: val + description: Amount of maximum amount of RAM each chunk of database that should be loaded at any one time + pattern: "*GB" + - save_output_fastqs: + type: boolean + description: | + If true, optional commands are added to save classified and unclassified reads + as fastq files + - save_reads_assignment: + type: boolean + description: | + If true, an optional command is added to save a file reporting the taxonomic + classification of each input read +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified_reads_fastq: + type: file + description: | + Reads classified as belonging to any of the taxa + on the KrakenUniq database. + pattern: "*.fastq.gz" + - unclassified_reads_fastq: + type: file + description: | + Reads not classified to any of the taxa + on the KrakenUniq database. + pattern: "*.fastq.gz" + - classified_assignment: + type: file + description: | + KrakenUniq output file indicating the taxonomic assignment of + each input read ## DOUBLE CHECK!! + - report: + type: file + description: | + KrakenUniq report containing stats about classified + and not classifed reads. + pattern: "*.report.txt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@mjamy" + - "@Midnighter" diff --git a/modules/nf-core/malt/run/main.nf b/modules/nf-core/malt/run/main.nf index 2b91d90..2e75b4c 100644 --- a/modules/nf-core/malt/run/main.nf +++ b/modules/nf-core/malt/run/main.nf @@ -2,14 +2,13 @@ process MALT_RUN { tag "$meta.id" label 'process_high' - conda (params.enable_conda ? "bioconda::malt=0.41" : null) + conda (params.enable_conda ? 
"bioconda::malt=0.61" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/malt:0.41--1' : - 'quay.io/biocontainers/malt:0.41--1' }" + 'https://depot.galaxyproject.org/singularity/malt:0.61--hdfd78af_0' : + 'quay.io/biocontainers/malt:0.61--hdfd78af_0' }" input: tuple val(meta), path(fastqs) - val mode path index output: @@ -38,7 +37,6 @@ process MALT_RUN { -o . \\ $args \\ --inFile ${fastqs.join(' ')} \\ - -m $mode \\ --index $index/ |&tee ${prefix}-malt-run.log cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/malt/run/meta.yml b/modules/nf-core/malt/run/meta.yml index 66f2d7a..8fa1958 100644 --- a/modules/nf-core/malt/run/meta.yml +++ b/modules/nf-core/malt/run/meta.yml @@ -28,10 +28,6 @@ input: type: file description: Input FASTQ files pattern: "*.{fastq.gz,fq.gz}" - - mode: - type: string - description: Program mode - pattern: "Unknown|BlastN|BlastP|BlastX|Classifier" - index: type: directory description: Index/database directory from malt-build diff --git a/modules/nf-core/porechop/main.nf b/modules/nf-core/porechop/porechop/main.nf similarity index 97% rename from modules/nf-core/porechop/main.nf rename to modules/nf-core/porechop/porechop/main.nf index 77050bc..f946417 100644 --- a/modules/nf-core/porechop/main.nf +++ b/modules/nf-core/porechop/porechop/main.nf @@ -1,4 +1,4 @@ -process PORECHOP { +process PORECHOP_PORECHOP { tag "$meta.id" label 'process_medium' @@ -28,7 +28,6 @@ process PORECHOP { $args \\ -o ${prefix}.fastq.gz \\ > ${prefix}.log - cat <<-END_VERSIONS > versions.yml "${task.process}": porechop: \$( porechop --version ) diff --git a/modules/nf-core/porechop/meta.yml b/modules/nf-core/porechop/porechop/meta.yml similarity index 98% rename from modules/nf-core/porechop/meta.yml rename to modules/nf-core/porechop/porechop/meta.yml index e526317..98b838f 100644 --- a/modules/nf-core/porechop/meta.yml +++ 
b/modules/nf-core/porechop/porechop/meta.yml @@ -1,4 +1,4 @@ -name: porechop +name: "porechop_porechop" description: Adapter removal and demultiplexing of Oxford Nanopore reads keywords: - adapter diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf new file mode 100644 index 0000000..e04e63e --- /dev/null +++ b/modules/nf-core/samtools/index/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : + 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${input}.bai + touch ${input}.crai + touch ${input}.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml new file mode 100644 index 0000000..e5cadbc --- /dev/null +++ b/modules/nf-core/samtools/index/meta.yml @@ -0,0 +1,53 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + 
description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: hhttp://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/stats/main.nf b/modules/nf-core/samtools/stats/main.nf new file mode 100644 index 0000000..9b0c386 --- /dev/null +++ b/modules/nf-core/samtools/stats/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_STATS { + tag "$meta.id" + label 'process_single' + + conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : + 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + + input: + tuple val(meta), path(input), path(input_index) + path fasta + + output: + tuple val(meta), path("*.stats"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + """ + samtools \\ + stats \\ + --threads ${task.cpus} \\ + ${reference} \\ + ${input} \\ + > ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml new file mode 100644 index 0000000..cac50b1 --- /dev/null +++ b/modules/nf-core/samtools/stats/meta.yml @@ -0,0 +1,53 @@ +name: samtools_stats +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: hhttp://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - fasta: + type: optional file + description: Reference file the CRAM was created with + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - stats: + type: file + description: File containing samtools stats output + pattern: "*.{stats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@FriederikeHanssen" diff --git a/nextflow.config b/nextflow.config index efb5aff..00edd9b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -60,13 +60,16 @@ params { databases = null // FASTQ preprocessing + preprocessing_qc_tool = 'fastqc' + perform_shortread_qc = false shortread_qc_tool = 'fastp' shortread_qc_skipadaptertrim = false - shortread_qc_mergepairs = true - shortread_qc_excludeunmerged = false + shortread_qc_mergepairs = false + shortread_qc_includeunmerged = false shortread_qc_adapter1 = null shortread_qc_adapter2 = null + shortread_qc_adapterlist = null shortread_qc_minlength = 15 perform_longread_qc = false @@ -114,6 +117,16 @@ params { run_kraken2 = false kraken2_save_reads = false // added directly to module in profiling.nf kraken2_save_readclassification = false // added directly to module in profiling.nf + kraken2_save_minimizers = false + + //krakenuniq + run_krakenuniq = false + krakenuniq_ram_chunk_size = '16G' + krakenuniq_save_reads = false // added directly to module in profiling.nf + krakenuniq_save_readclassifications = false // added directly to module in profiling.nf + + // Bracken + run_bracken = false // centrifuge run_centrifuge = false @@ -132,7 +145,10 @@ params { diamond_save_reads = false // this will override default diamond output 
format so no taxonomic profile is generated! added directly to module in profiling.nf // mOTUs - run_motus = false + run_motus = false + motus_use_relative_abundance = false + motus_remove_ncbi_ids = false + motus_save_mgc_read_counts = false // krona run_krona = false @@ -229,6 +245,7 @@ profiles { test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' } test_nothing { includeConfig 'conf/test_nothing.config' } test_motus { includeConfig 'conf/test_motus.config' } + test_krakenuniq { includeConfig 'conf/test_krakenuniq.config' } test_pep { includeConfig 'conf/test_pep.config' } } diff --git a/nextflow_schema.json b/nextflow_schema.json index f88443f..d6debb3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir", "databases"], + "required": ["input", "databases", "outdir"], "properties": { "input": { "type": "string", @@ -56,6 +56,14 @@ "description": "Common options across both long and short read preprocessing QC steps", "default": "", "properties": { + "preprocessing_qc_tool": { + "type": "string", + "default": "fastqc", + "enum": ["fastqc", "falco"], + "help_text": "Falco is designed as a drop-in replacement for FastQC but written in C++ for faster computation. We particularly recommend using falco when using long reads (due to reduced memory constraints), however is also applicable for short reads.", + "description": "Specify the tool used for quality control of raw sequencing reads", + "fa_icon": "fas fa-tools" + }, "save_preprocessed_reads": { "type": "boolean", "fa_icon": "fas fa-save", @@ -104,18 +112,23 @@ "description": "Specify adapter 2 nucleotide sequence", "help_text": "Specify a custom reverse or R2 adapter sequence to be removed from reads. 
\n\nIf not set, the selected short-read QC tool's defaults will be used.\n\n> Modifies tool parameter(s):\n> - fastp: `--adapter_sequence`. fastp default: `AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT`\n> - AdapterRemoval: `--adapter1`. AdapteRemoval2 default: `AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT`" }, + "shortread_qc_adapterlist": { + "type": "string", + "default": "None", + "description": "Specify a list of all possible adapters to trim. Overrides --shortread_qc_adapter1/2. Formats: .txt (AdapterRemoval) or .fasta. (fastp).", + "help_text": "Allows to supply a file with a list of adapter (combinations) to remove from all files. \n\nOverrides the --shortread_qc_adapter1/--shortread_qc_adapter2 parameters . \n\nFor AdapterRemoval this consists of a two column table with a `.txt` extension: first column represents forward strand, second column for reverse strand. You must supply all possible combinations, one per line, and this list is applied to all files. See AdapterRemoval documentation for more information.\n\nFor fastp this consists of a standard FASTA format with a `.fasta`/`.fa`/`.fna`/`.fas` extension. The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. fastp trims the adapters present in the FASTA file one by one.\n\n> Modifies AdapterRemoval parameter: --adapter-list\n> Modifies fastp parameter: --adapter_fasta" + }, "shortread_qc_mergepairs": { "type": "boolean", "fa_icon": "fas fa-toggle-on", "description": "Turn on merging of read pairs for paired-end data", - "default": true, - "help_text": "Turn on the merging of read-pairs of paired-end short read sequencing data for AdapterRemoval (this is performed automatically with fastp).\n\n> Modifies tool parameter(s):\n> - AdapterRemoval: `--collapse`\n" + "help_text": "Turn on the merging of read-pairs of paired-end short read sequencing data. 
\n\n> Modifies tool parameter(s):\n> - AdapterRemoval: `--collapse`\n> - fastp: `-m --merged_out`\n" }, - "shortread_qc_excludeunmerged": { + "shortread_qc_includeunmerged": { "type": "boolean", "fa_icon": "far fa-times-circle", - "description": "Discard unmerged reads from paired-end merging", - "help_text": "Turns off the inclusion of unmerged reads in resulting processing FASTQ file of paired-end sequencing data when using `fastp`.\n\nThis can be useful in cases where you prefer to have very short reads (e.g. aDNA), thus excluding longer-reads or possibly faulty reads where one of the pair was discarded.\n\n> Modifies tool parameter(s):\n> - removed from reads `--include_unmerged`\n" + "description": "Include unmerged reads from paired-end merging in the downstream analysis", + "help_text": "Turns on the inclusion of unmerged reads in resulting FASTQ file from merging paired-end sequencing data when using `fastp` and/or `AdapterRemoval`. For `fastp` this means the unmerged read pairs are directly included in the output FASTQ file. For `AdapterRemoval`, additional output files containing unmerged reads are all concatenated into one file by the workflow.\n\nExcluding unmerged reads can be useful in cases where you prefer to have very short reads (e.g. 
aDNA), thus excluding longer-reads or possibly faulty reads where one of the pair was discarded.\n\n> Adds `fastp` option: `--include_unmerged`\n" }, "shortread_qc_minlength": { "type": "integer", @@ -382,6 +395,41 @@ "description": "Turn on saving of Kraken2 per-read taxonomic assignment file", "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - kraken2: `--output`" }, + "kraken2_save_minimizers": { + "type": "boolean", + "description": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.", + "fa_icon": "fas fa-save", + "help_text": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.\n\nAdds `--report-minimizer-data` to the kraken2 command." + }, + "run_krakenuniq": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with KrakenUniq. 
Requires database to be present CSV file passed to --databases" + }, + "krakenuniq_save_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of KrakenUniq-aligned reads", + "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--classified-out` and `--unclassified-out`" + }, + "krakenuniq_ram_chunk_size": { + "type": "string", + "default": "16G", + "description": "Specify how large to chunk database when loading into memory for KrakenUniq", + "fa_icon": "fas fa-database", + "help_text": "nf-core/taxprofiler utilises a 'low memory' option for KrakenUniq that can reduce the amount of RAM the process requires using the `--preloaded` option.\n\nA further extension to this option is that you can specify how large each chunk of the database should be that gets loaded into memory at any one time. You can specify the amount of RAM to chunk the database to with this parameter, and is particularly useful for people with limited computational resources.\n\nMore information about this parameter can be seen [here](https://github.com/fbreitwieser/krakenuniq/blob/master/README.md#new-release-v07).\n\n> Modifies KrakenUniq parameter: --preload\n\n> \n\n" + }, + "krakenuniq_save_readclassifications": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of KrakenUniq per-read taxonomic assignment file", + "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on the specific taxonomic assignment that that read received.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`" + }, + "run_bracken": { + "type": "boolean", + "description": "Post-process kraken2 reports with Bracken.", + "fa_icon": "fas fa-toggle-on" + }, "run_malt": { "type": "boolean", "fa_icon": "fas fa-toggle-on", @@ -415,6 +463,18 @@ "type": "boolean",
"fa_icon": "fas fa-toggle-on", "description": "Turn on profiling with mOTUs. Requires database to be present CSV file passed to --databases" + }, + "motus_use_relative_abundance": { + "type": "boolean", + "description": "Turn on printing relative abundance instead of counts." + }, + "motus_save_mgc_read_counts": { + "type": "boolean", + "description": "Turn on saving the mgc reads count." + }, + "motus_remove_ncbi_ids": { + "type": "boolean", + "description": "Turn on removing NCBI taxonomic IDs." } }, "fa_icon": "fas fa-align-center" diff --git a/subworkflows/local/longread_hostremoval.nf b/subworkflows/local/longread_hostremoval.nf index 82ea8ca..3f9fdef 100644 --- a/subworkflows/local/longread_hostremoval.nf +++ b/subworkflows/local/longread_hostremoval.nf @@ -6,6 +6,8 @@ include { MINIMAP2_INDEX } from '../../modules/nf-core/minimap2/inde include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main' include { SAMTOOLS_BAM2FQ } from '../../modules/nf-core/samtools/bam2fq/main' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_STATS } from '../../modules/nf-core/samtools/stats/main' workflow LONGREAD_HOSTREMOVAL { take: @@ -39,9 +41,21 @@ workflow LONGREAD_HOSTREMOVAL { SAMTOOLS_BAM2FQ ( SAMTOOLS_VIEW.out.bam, false ) ch_versions = ch_versions.mix( SAMTOOLS_BAM2FQ.out.versions.first() ) + SAMTOOLS_INDEX ( SAMTOOLS_VIEW.out.bam ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions.first() ) + + bam_bai = MINIMAP2_ALIGN.out.bam + .join(SAMTOOLS_INDEX.out.bai, remainder: true) + + SAMTOOLS_STATS ( bam_bai, reference ) + ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_STATS.out.stats ) + emit: + stats = SAMTOOLS_STATS.out.stats //channel: [val(meta), [reads ] ] reads = SAMTOOLS_BAM2FQ.out.reads // channel: [ val(meta), [ reads ] ] versions = ch_versions // 
channel: [ versions.yml ] + mqc = ch_multiqc_files } diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index ce537e8..961417d 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -3,7 +3,9 @@ // include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' -include { PORECHOP } from '../../modules/nf-core/porechop/main' +include { FALCO as FALCO_PROCESSED } from '../../modules/nf-core/falco/main' + +include { PORECHOP_PORECHOP } from '../../modules/nf-core/porechop/porechop/main' include { FILTLONG } from '../../modules/nf-core/filtlong/main' workflow LONGREAD_PREPROCESSING { @@ -15,9 +17,9 @@ workflow LONGREAD_PREPROCESSING { ch_multiqc_files = Channel.empty() if ( !params.longread_qc_skipadaptertrim && params.longread_qc_skipqualityfilter) { - PORECHOP ( reads ) + PORECHOP_PORECHOP ( reads ) - ch_processed_reads = PORECHOP.out.reads + ch_processed_reads = PORECHOP_PORECHOP.out.reads .map { meta, reads -> def meta_new = meta.clone() @@ -25,8 +27,8 @@ workflow LONGREAD_PREPROCESSING { [ meta_new, reads ] } - ch_versions = ch_versions.mix(PORECHOP.out.versions.first()) - ch_multiqc_files = ch_multiqc_files.mix( PORECHOP.out.log ) + ch_versions = ch_versions.mix(PORECHOP_PORECHOP.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( PORECHOP_PORECHOP.out.log ) } else if ( params.longread_qc_skipadaptertrim && !params.longread_qc_skipqualityfilter) { @@ -35,8 +37,8 @@ workflow LONGREAD_PREPROCESSING { ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log ) } else { - PORECHOP ( reads ) - ch_clipped_reads = PORECHOP.out.reads + PORECHOP_PORECHOP ( reads ) + ch_clipped_reads = PORECHOP_PORECHOP.out.reads .map { meta, reads -> def meta_new = meta.clone() @@ -46,14 +48,22 @@ workflow LONGREAD_PREPROCESSING { ch_processed_reads = FILTLONG ( ch_clipped_reads.map{ meta, reads -> [meta, [], reads ]} ).reads - ch_versions = 
ch_versions.mix(PORECHOP.out.versions.first()) + ch_versions = ch_versions.mix(PORECHOP_PORECHOP.out.versions.first()) ch_versions = ch_versions.mix(FILTLONG.out.versions.first()) - ch_multiqc_files = ch_multiqc_files.mix( PORECHOP.out.log ) + ch_multiqc_files = ch_multiqc_files.mix( PORECHOP_PORECHOP.out.log ) ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log ) } - FASTQC_PROCESSED ( ch_processed_reads ) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + if (params.preprocessing_qc_tool == 'fastqc') { + FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + + } else if (params.preprocessing_qc_tool == 'falco') { + FALCO_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) + } emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 11c4a72..f5c970c 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -5,12 +5,15 @@ include { MALT_RUN } from '../../modules/nf-core/malt/run/main' include { MEGAN_RMA2INFO as MEGAN_RMA2INFO_TSV } from '../../modules/nf-core/megan/rma2info/main' include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/kraken2/kraken2/main' +include { KRAKEN2_STANDARD_REPORT } from '../../modules/local/kraken2_standard_report' +include { BRACKEN_BRACKEN } from '../../modules/nf-core/bracken/bracken/main' include { CENTRIFUGE_CENTRIFUGE } from '../../modules/nf-core/centrifuge/centrifuge/main' include { CENTRIFUGE_KREPORT } from '../../modules/nf-core/centrifuge/kreport/main' include { METAPHLAN3_METAPHLAN3 } from '../../modules/nf-core/metaphlan3/metaphlan3/main' include { KAIJU_KAIJU } from '../../modules/nf-core/kaiju/kaiju/main' include { DIAMOND_BLASTX } from 
'../../modules/nf-core/diamond/blastx/main' include { MOTUS_PROFILE } from '../../modules/nf-core/motus/profile/main' +include { KRAKENUNIQ_PRELOADEDKRAKENUNIQ } from '../../modules/nf-core/krakenuniq/preloadedkrakenuniq/main' workflow PROFILING { take: @@ -39,12 +42,13 @@ workflow PROFILING { .combine(databases) .branch { malt: it[2]['tool'] == 'malt' - kraken2: it[2]['tool'] == 'kraken2' + kraken2: it[2]['tool'] == 'kraken2' || it[2]['tool'] == 'bracken' // to reuse the kraken module to produce the input data for bracken metaphlan3: it[2]['tool'] == 'metaphlan3' centrifuge: it[2]['tool'] == 'centrifuge' kaiju: it[2]['tool'] == 'kaiju' diamond: it[2]['tool'] == 'diamond' motus: it[2]['tool'] == 'motus' + krakenuniq: it[2]['tool'] == 'krakenuniq' unknown: true } @@ -93,7 +97,7 @@ workflow PROFILING { db: it[2] } - MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) + MALT_RUN ( ch_input_for_malt.reads, ch_input_for_malt.db ) ch_maltrun_for_megan = MALT_RUN.out.rma6 .transpose() @@ -129,7 +133,46 @@ workflow PROFILING { ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment ) - ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.report ) + ch_raw_profiles = ch_raw_profiles.mix( + KRAKEN2_KRAKEN2.out.report + // Set the tool to be strictly 'kraken2' instead of potentially 'bracken' for downstream use. + // Will remain distinct from 'pure' Kraken2 results due to distinct database names in file names. + .map { meta, report -> [meta + [tool: 'kraken2'], report]} + ) + + } + + if ( params.run_kraken2 && params.run_bracken ) { + // Remove files from 'pure' kraken2 runs, so only those aligned against Bracken & kraken2 database are used. 
+ def ch_kraken2_output = KRAKEN2_KRAKEN2.out.report + .filter { + meta, report -> + if ( meta['instrument_platform'] == 'OXFORD_NANOPORE' ) log.warn "[nf-core/taxprofiler] Bracken has not been evaluated for Nanopore data. Skipping Bracken for sample ${meta.id}." + meta['tool'] == 'bracken' && meta['instrument_platform'] != 'OXFORD_NANOPORE' + } + + // If necessary, convert the eight column output to six column output. + if (params.kraken2_save_minimizers) { + ch_kraken2_output = KRAKEN2_STANDARD_REPORT(ch_kraken2_output).report + } + + // Extract the database name to combine by. + ch_bracken_databases = databases + .filter { meta, db -> meta['tool'] == 'bracken' } + .map { meta, db -> [meta['db_name'], meta, db] } + + // Extract the database name to combine by. + ch_input_for_bracken = ch_kraken2_output + .map { meta, report -> [meta['db_name'], meta, report] } + .combine(ch_bracken_databases, by: 0) + .multiMap { key, meta, report, db_meta, db -> + report: [meta + db_meta, report] + db: db + } + + BRACKEN_BRACKEN(ch_input_for_bracken.report, ch_input_for_bracken.db) + ch_versions = ch_versions.mix(BRACKEN_BRACKEN.out.versions.first()) + ch_raw_profiles = ch_raw_profiles.mix(BRACKEN_BRACKEN.out.reports) } @@ -228,6 +271,28 @@ workflow PROFILING { ch_multiqc_files = ch_multiqc_files.mix( MOTUS_PROFILE.out.log ) } + if ( params.run_krakenuniq ) { + ch_input_for_krakenuniq = ch_input_for_profiling.krakenuniq + .map { + meta, reads, db_meta, db -> + [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db] + } + .groupTuple(by: [0,2,3]) + .dump(tag: "krakenuniq_premultimap") + .multiMap { + single_meta, reads, db_meta, db -> + reads: [ single_meta + db_meta, reads.flatten() ] + db: db + } + // Hardcode to _always_ produce the report file (which is our basic output, and goes into the MultiQC and raw-profile channels) + KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads.dump(tag: "krakenuniq_input"), ch_input_for_krakenuniq.db.dump(tag: "krakenuniq_db"),
params.krakenuniq_ram_chunk_size, params.krakenuniq_save_reads, true, params.krakenuniq_save_readclassifications ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) + ch_versions = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment ) + ch_raw_profiles = ch_raw_profiles.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) + + } + emit: classifications = ch_raw_classifications profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index a5a43fe..7f5a0fb 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -10,6 +10,7 @@ workflow SHORTREAD_ADAPTERREMOVAL { take: reads // [[meta], [reads]] + adapterlist // file main: ch_versions = Channel.empty() @@ -21,15 +22,15 @@ workflow SHORTREAD_ADAPTERREMOVAL { paired: !it[0].single_end } - ADAPTERREMOVAL_SINGLE ( ch_input_for_adapterremoval.single, [] ) - ADAPTERREMOVAL_PAIRED ( ch_input_for_adapterremoval.paired, [] ) + ADAPTERREMOVAL_SINGLE ( ch_input_for_adapterremoval.single, adapterlist ) + ADAPTERREMOVAL_PAIRED ( ch_input_for_adapterremoval.paired, adapterlist ) /* * Due to the ~slightly~ very ugly output implementation of the current AdapterRemoval2 version, each file * has to be exported in a separate channel and we must manually recombine when necessary. 
*/ - if ( params.shortread_qc_mergepairs && !params.shortread_qc_excludeunmerged ) { + if ( params.shortread_qc_mergepairs && params.shortread_qc_includeunmerged ) { ch_concat_fastq = Channel.empty() .mix( @@ -39,9 +40,7 @@ workflow SHORTREAD_ADAPTERREMOVAL { ADAPTERREMOVAL_PAIRED.out.paired_truncated ) .map { meta, reads -> - def meta_new = meta.clone() - meta_new.single_end = true - [meta_new, reads] + [meta + [single_end: true], reads] } .groupTuple() // Paired-end reads cause a nested tuple during grouping. @@ -54,7 +53,7 @@ workflow SHORTREAD_ADAPTERREMOVAL { ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated) - } else if ( params.shortread_qc_mergepairs && params.shortread_qc_excludeunmerged ) { + } else if ( params.shortread_qc_mergepairs && !params.shortread_qc_includeunmerged ) { ch_concat_fastq = Channel.empty() .mix( @@ -62,9 +61,7 @@ workflow SHORTREAD_ADAPTERREMOVAL { ADAPTERREMOVAL_PAIRED.out.collapsed_truncated ) .map { meta, reads -> - def meta_new = meta.clone() - meta_new.single_end = true - [meta_new, reads] + [meta + [single_end: true], reads] } .groupTuple() .map { meta, fastq -> [meta, fastq.flatten()] } diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index d466041..cac5a27 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -8,6 +8,7 @@ include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/fastp/main' workflow SHORTREAD_FASTP { take: reads // [[meta], [reads]] + adapterlist main: ch_versions = Channel.empty() @@ -19,9 +20,9 @@ workflow SHORTREAD_FASTP { paired: it[0]['single_end'] == false } - FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + FASTP_SINGLE ( ch_input_for_fastp.single, adapterlist, false, false ) // Last parameter here turns on merging of PE data - FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_qc_mergepairs ) + FASTP_PAIRED ( ch_input_for_fastp.paired, 
adapterlist, false, params.shortread_qc_mergepairs ) if ( params.shortread_qc_mergepairs ) { ch_fastp_reads_prepped_pe = FASTP_PAIRED.out.reads_merged diff --git a/subworkflows/local/shortread_hostremoval.nf b/subworkflows/local/shortread_hostremoval.nf index d181a34..04c8556 100644 --- a/subworkflows/local/shortread_hostremoval.nf +++ b/subworkflows/local/shortread_hostremoval.nf @@ -4,6 +4,9 @@ include { BOWTIE2_BUILD } from '../../modules/nf-core/bowtie2/build/main' include { BOWTIE2_ALIGN } from '../../modules/nf-core/bowtie2/align/main' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_STATS } from '../../modules/nf-core/samtools/stats/main' +include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main' workflow SHORTREAD_HOSTREMOVAL { take: @@ -22,11 +25,31 @@ workflow SHORTREAD_HOSTREMOVAL { ch_bowtie2_index = index.first() } - BOWTIE2_ALIGN ( reads, ch_bowtie2_index, true, false ) + BOWTIE2_ALIGN ( reads, ch_bowtie2_index, true, true) ch_versions = ch_versions.mix( BOWTIE2_ALIGN.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( BOWTIE2_ALIGN.out.log ) + ch_bowtie2_mapped = BOWTIE2_ALIGN.out.bam + .map { + meta, reads -> + [ meta, reads, [] ] + } + + SAMTOOLS_VIEW ( ch_bowtie2_mapped, [], [] ) + ch_versions = ch_versions.mix( SAMTOOLS_VIEW.out.versions.first() ) + + SAMTOOLS_INDEX ( SAMTOOLS_VIEW.out.bam ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions.first() ) + + bam_bai = BOWTIE2_ALIGN.out.bam + .join(SAMTOOLS_INDEX.out.bai, remainder: true) + + SAMTOOLS_STATS ( bam_bai, reference ) + ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_STATS.out.stats ) + emit: + stats = SAMTOOLS_STATS.out.stats reads = BOWTIE2_ALIGN.out.fastq // channel: [ val(meta), [ reads ] ] versions = ch_versions // channel: [ versions.yml ] mqc = ch_multiqc_files diff --git a/subworkflows/local/shortread_preprocessing.nf 
b/subworkflows/local/shortread_preprocessing.nf index 859c1d5..c823e3d 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -5,31 +5,39 @@ include { SHORTREAD_FASTP } from './shortread_fastp' include { SHORTREAD_ADAPTERREMOVAL } from './shortread_adapterremoval' -include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' +include { FALCO as FALCO_PROCESSED } from '../../modules/nf-core/falco/main' workflow SHORTREAD_PREPROCESSING { take: reads // [ [ meta ], [ reads ] ] + adapterlist // file main: ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() if ( params.shortread_qc_tool == "fastp" ) { - ch_processed_reads = SHORTREAD_FASTP ( reads ).reads + ch_processed_reads = SHORTREAD_FASTP ( reads, adapterlist ).reads ch_versions = ch_versions.mix( SHORTREAD_FASTP.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc ) } else if ( params.shortread_qc_tool == "adapterremoval" ) { - ch_processed_reads = SHORTREAD_ADAPTERREMOVAL ( reads ).reads + ch_processed_reads = SHORTREAD_ADAPTERREMOVAL ( reads, adapterlist ).reads ch_versions = ch_versions.mix( SHORTREAD_ADAPTERREMOVAL.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_ADAPTERREMOVAL.out.mqc ) } else { ch_processed_reads = reads } - FASTQC_PROCESSED ( ch_processed_reads ) - ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + if (params.preprocessing_qc_tool == 'fastqc') { + FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + } else if (params.preprocessing_qc_tool == 'falco') { + FALCO_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions ) + ch_multiqc_files 
= ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) + } emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf index cbb0fab..8c73472 100644 --- a/subworkflows/local/standardisation_profiles.nf +++ b/subworkflows/local/standardisation_profiles.nf @@ -3,7 +3,7 @@ // include { KAIJU_KAIJU2TABLE } from '../../modules/nf-core/kaiju/kaiju2table/main' -include { KRAKENTOOLS_COMBINEKREPORTS } from '../../modules/nf-core/krakentools/combinekreports/main' +include { KRAKENTOOLS_COMBINEKREPORTS as KRAKENTOOLS_COMBINEKREPORTS_KRAKEN } from '../../modules/nf-core/krakentools/combinekreports/main' include { KRAKENTOOLS_COMBINEKREPORTS as KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE } from '../../modules/nf-core/krakentools/combinekreports/main' include { METAPHLAN3_MERGEMETAPHLANTABLES } from '../../modules/nf-core/metaphlan3/mergemetaphlantables/main' include { MOTUS_MERGE } from '../../modules/nf-core/motus/merge/main' @@ -93,10 +93,10 @@ workflow STANDARDISATION_PROFILES { [[id:it[0]], it[1]] } - KRAKENTOOLS_COMBINEKREPORTS ( ch_profiles_for_kraken2 ) - ch_standardised_tables = ch_standardised_tables.mix( KRAKENTOOLS_COMBINEKREPORTS.out.txt ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS.out.txt ) - ch_versions = ch_versions.mix( KRAKENTOOLS_COMBINEKREPORTS.out.versions ) + KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 ) + ch_standardised_tables = ch_standardised_tables.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt ) + ch_versions = ch_versions.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.versions ) // MetaPhlAn3 diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 8b9edb7..68fc47a 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -12,7 +12,8 @@ WorkflowTaxprofiler.initialise(params, log) // 
TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist def checkPathParamList = [ params.input, params.databases, params.hostremoval_reference, - params.shortread_hostremoval_index, params.multiqc_config + params.shortread_hostremoval_index, params.multiqc_config, + params.shortread_qc_adapterlist ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } @@ -27,7 +28,7 @@ if ( params.input ) { if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } if (params.shortread_qc_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." -if (params.shortread_qc_excludeunmerged && !params.shortread_qc_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_qc_mergepairs" +if (params.shortread_qc_includeunmerged && !params.shortread_qc_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging is not turned on. Please specify --shortread_qc_mergepairs" if (params.shortread_complexityfilter_tool == 'fastp' && ( params.perform_shortread_qc == false || params.shortread_qc_tool != 'fastp' )) exit 1, "ERROR: [nf-core/taxprofiler] cannot use fastp complexity filtering if preprocessing not turned on and/or tool is not fastp. Please specify --perform_shortread_qc and/or --shortread_qc_tool 'fastp'" @@ -41,6 +42,7 @@ if (params.longread_hostremoval_index ) { ch_longread_reference_index = fi if (params.diamond_save_reads ) log.warn "[nf-core/taxprofiler] DIAMOND only allows output of a single format. As --diamond_save_reads supplied, only aligned reads in SAM format will be produced, no taxonomic profiles will be available." 
if (params.run_malt && params.run_krona && !params.krona_taxonomy_directory) log.warn "[nf-core/taxprofiler] Krona can only be run on MALT output if path to Krona taxonomy database supplied to --krona_taxonomy_directory. Krona will not be executed in this run for MALT." +if (params.run_bracken && !params.run_kraken2) exit 1, 'ERROR: [nf-core/taxprofiler] You are attempting to run Bracken without running kraken2. This is not possible! Please set --run_kraken2 as well.' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -84,6 +86,7 @@ include { STANDARDISATION_PROFILES } from '../subworkflows/local/standardis // MODULE: Installed directly from nf-core/modules // include { FASTQC } from '../modules/nf-core/fastqc/main' +include { FALCO } from '../modules/nf-core/falco/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' @@ -101,6 +104,12 @@ workflow TAXPROFILER { ch_versions = Channel.empty() ch_multiqc_logo= Channel.fromPath("$projectDir/docs/images/nf-core-taxprofiler_logo_custom_light.png") + adapterlist = params.shortread_qc_adapterlist ? file(params.shortread_qc_adapterlist) : [] + + if ( params.shortread_qc_adapterlist ) { + if ( params.shortread_qc_tool == 'adapterremoval' && !(adapterlist.extension == 'txt') ) error "[nf-core/taxprofiler] ERROR: AdapterRemoval2 adapter list requires a `.txt` format and extension. Check input: --shortread_qc_adapterlist ${params.shortread_qc_adapterlist}" + if ( params.shortread_qc_tool == 'fastp' && !adapterlist.extension.matches(".*(fa|fasta|fna|fas)") ) error "[nf-core/taxprofiler] ERROR: fastp adapter list requires a `.fasta` format and extension (or fa, fas, fna). 
Check input: --shortread_qc_adapterlist ${params.shortread_qc_adapterlist}" + } /* SUBWORKFLOW: Read in samplesheet, validate and stage input files @@ -120,17 +129,19 @@ workflow TAXPROFILER { */ ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ) - FASTQC ( - ch_input_for_fastqc - ) - - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - + if ( params.preprocessing_qc_tool == 'falco' ) { + FALCO ( ch_input_for_fastqc ) + ch_versions = ch_versions.mix(FALCO.out.versions.first()) + } else { + FASTQC ( ch_input_for_fastqc ) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + } /* SUBWORKFLOW: PERFORM PREPROCESSING */ + if ( params.perform_shortread_qc ) { - ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads + ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq, adapterlist ).reads ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) } else { ch_shortreads_preprocessed = INPUT_CHECK.out.fastq @@ -254,7 +265,13 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + + if ( params.preprocessing_qc_tool == 'falco' ) { + ch_multiqc_files = ch_multiqc_files.mix(FALCO.out.txt.collect{it[1]}.ifEmpty([])) + } else { + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + } + if (params.perform_shortread_qc) { ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) @@ -272,6 +289,10 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_HOSTREMOVAL.out.mqc.collect{it[1]}.ifEmpty([])) } + if 
(params.perform_longread_hostremoval) { + ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_HOSTREMOVAL.out.mqc.collect{it[1]}.ifEmpty([])) + } + ch_multiqc_files = ch_multiqc_files.mix( PROFILING.out.mqc.collect{it[1]}.ifEmpty([]) ) if ( params.run_profile_standardisation ) {