1
0
Fork 0
mirror of https://github.com/MillironX/taxprofiler.git synced 2024-12-22 08:58:16 +00:00

Merge branch 'dev' into update_taxpasta_version

This commit is contained in:
Sofia Stamouli 2023-03-13 09:38:10 +01:00 committed by GitHub
commit 0007c99d1c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
30 changed files with 1046 additions and 766 deletions

View file

@ -15,7 +15,6 @@ jobs:
steps:
- name: Launch workflow via tower
uses: nf-core/tower-action@v3
# TODO nf-core: You can customise AWS full pipeline tests as required
# Add full size test data (but still relatively small datasets for few samples)
# on the `test_full.config` test runs with only one set of parameters
with:

View file

@ -29,19 +29,10 @@ jobs:
- "latest-everything"
parameters:
- "--preprocessing_qc_tool falco"
- "--perform_longread_qc false"
- "--perform_shortread_qc false"
- "--shortread_qc_tool fastp"
- "--shortread_qc_tool fastp --shortread_qc_mergepairs --shortread_qc_includeunmerged"
- "--shortread_qc_tool fastp --shortread_qc_mergepairs"
- "--shortread_qc_tool adapterremoval"
- "--shortread_qc_tool adapterremoval --shortread_qc_mergepairs --shortread_qc_includeunmerged"
- "--shortread_qc_tool adapterremoval --shortread_qc_mergepairs"
- "--shortread_complexityfilter_tool bbduk"
- "--shortread_complexityfilter_tool prinseqplusplus"
- "--perform_runmerging"
- "--perform_runmerging --shortread_qc_mergepairs"
- "--shortread_complexityfilter false --perform_shortread_hostremoval"
steps:
- name: Check out pipeline code

View file

@ -3,10 +3,18 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## v1.0dev - [date]
## v1.0.0 - Dodgy Dachshund [date]
Initial release of nf-core/taxprofiler, created with the [nf-core](https://nf-co.re/) template.
- Add read quality control (sequencing QC, adapter removal and merging)
- Add read complexity filtering
- Add host-reads removal step
- Add run merging
- Add taxonomic classification
- Add taxon table standardisation
- Add post-classification visualisation
### `Added`
### `Fixed`

View file

@ -16,6 +16,10 @@
> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
- [falco](https://doi.org/10.12688/f1000research.21142.2)
> de Sena Brandine G and Smith AD. Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Research 2021, 8:1874
- [fastp](https://doi.org/10.1093/bioinformatics/bty560)
> Chen, Shifu, Yanqing Zhou, Yaru Chen, and Jia Gu. 2018. Fastp: An Ultra-Fast All-in-One FASTQ Preprocessor. Bioinformatics 34 (17): i884-90. 10.1093/bioinformatics/bty560.
@ -26,12 +30,30 @@
- [Porechop](https://github.com/rrwick/Porechop)
- [FILTLONG](https://github.com/rrwick/Filtlong)
- [BBTools](http://sourceforge.net/projects/bbmap/)
- [PRINSEQ++](https://doi.org/10.7287/peerj.preprints.27553v1)
> Cantu, Vito Adrian, Jeffrey Sadural, and Robert Edwards. 2019. PRINSEQ++, a Multi-Threaded Tool for Fast and Efficient Quality Control and Preprocessing of Sequencing Datasets. e27553v1. PeerJ Preprints. doi: 10.7287/peerj.preprints.27553v1.
- [Bowtie2](https://doi.org/10.1038/nmeth.1923)
> Langmead, B., & Salzberg, S. L. (2012). Fast gapped-read alignment with Bowtie 2. Nature Methods, 9(4), 357–359. doi: 10.1038/nmeth.1923
- [minimap2](https://doi.org/10.1093/bioinformatics/bty191)
> Li, H. (2018). Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics, 34(18), 3094–3100. doi: 10.1093/bioinformatics/bty191
- [SAMTools](https://doi.org/10.1093/gigascience/giab008)
> Danecek, P., Bonfield, J. K., Liddle, J., Marshall, J., Ohan, V., Pollard, M. O., Whitwham, A., Keane, T., McCarthy, S. A., Davies, R. M., & Li, H. (2021). Twelve years of SAMtools and BCFtools. GigaScience, 10(2). doi: 10.1093/gigascience/giab008
- [Bracken](https://doi.org/10.7717/peerj-cs.104)
> Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: Estimating species abundance in metagenomics data. PeerJ Computer Science, 3, e104. doi: 10.7717/peerj-cs.104
- [Kraken2](https://doi.org/10.1186/s13059-019-1891-0)
> Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0.
@ -40,13 +62,9 @@
> Breitwieser, Florian P., Daniel N. Baker, and Steven L. Salzberg. 2018. KrakenUniq: confident and fast metagenomics classification using unique k-mer counts. Genome Biology 19 (1): 198. doi: 10.1186/s13059-018-1568-0
- [Bracken](https://doi.org/10.7717/peerj-cs.104)
- [MetaPhlAn3](https://doi.org/10.7554/eLife.65088)
> Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: Estimating species abundance in metagenomics data. PeerJ Computer Science, 3, e104. doi: 10.7717/peerj-cs.104
- [Krona](https://doi.org/10.1186/1471-2105-12-385)
> Ondov, Brian D., Nicholas H. Bergman, and Adam M. Phillippy. 2011. Interactive metagenomic visualization in a Web browser. BMC Bioinformatics 12 (1): 385. doi: 10.1186/1471-2105-12-385.
> Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088. doi: 10.7554/eLife.65088
- [MALT](https://doi.org/10.1038/s41559-017-0446-6)
@ -56,23 +74,25 @@
> Huson, Daniel H., Sina Beier, Isabell Flade, Anna Górska, Mohamed El-Hadidi, Suparna Mitra, Hans-Joachim Ruscheweyh, and Rewati Tappu. 2016. “MEGAN Community Edition - Interactive Exploration and Analysis of Large-Scale Microbiome Sequencing Data.” PLoS Computational Biology 12 (6): e1004957. doi: 10.1371/journal.pcbi.1004957.
- [MetaPhlAn3](https://doi.org/10.7554/eLife.65088)
- [DIAMOND](https://doi.org/10.1038/nmeth.3176)
> Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088. doi: 10.7554/eLife.65088
> Buchfink, Benjamin, Chao Xie, and Daniel H. Huson. 2015. “Fast and Sensitive Protein Alignment Using DIAMOND.” Nature Methods 12 (1): 59-60. doi: 10.1038/nmeth.3176.
- [Centrifuge](https://doi.org/10.1101/gr.210641.116)
> Kim, Daehwan, Li Song, Florian P. Breitwieser, and Steven L. Salzberg. 2016. “Centrifuge: Rapid and Sensitive Classification of Metagenomic Sequences.” Genome Research 26 (12): 1721-29. doi: 10.1101/gr.210641.116.
- [DIAMOND](https://doi.org/10.1038/nmeth.3176)
- [Kaiju](https://doi.org/10.1038/ncomms11257)
> Buchfink, Benjamin, Chao Xie, and Daniel H. Huson. 2015. “Fast and Sensitive Protein Alignment Using DIAMOND.” Nature Methods 12 (1): 59-60. doi: 10.1038/nmeth.3176.
> Menzel, P., Ng, K. L., & Krogh, A. (2016). Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nature Communications, 7, 11257. doi: 10.1038/ncomms11257
- [FILTLONG](https://github.com/rrwick/Filtlong)
- [mOTUs](https://doi.org/10.1186/s40168-022-01410-z)
- [falco](https://doi.org/10.12688/f1000research.21142.2)
> Ruscheweyh, H.-J., Milanese, A., Paoli, L., Karcher, N., Clayssen, Q., Keller, M. I., Wirbel, J., Bork, P., Mende, D. R., Zeller, G., & Sunagawa, S. (2022). Cultivation-independent genomes greatly expand taxonomic-profiling capabilities of mOTUs across various environments. Microbiome, 10(1), 212. doi: 10.1186/s40168-022-01410-z
> de Sena Brandine G and Smith AD. Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Research 2021, 8:1874
- [Krona](https://doi.org/10.1186/1471-2105-12-385)
> Ondov, Brian D., Nicholas H. Bergman, and Adam M. Phillippy. 2011. Interactive metagenomic visualization in a Web browser. BMC Bioinformatics 12 (1): 385. doi: 10.1186/1471-2105-12-385.
## Software packaging/containerisation tools

View file

@ -1,6 +1,6 @@
MIT License
Copyright (c) nf-core community
Copyright (c) James A. Fellows Yates, Sofia Stamouli, Moritz E. Beber, Lauri Mesilaakso, Thomas A. Christensen II, Jianhong Ou, Mahwash Jamy, Maxime Borry, Rafal Stepien, Tanja Normark
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View file

@ -12,20 +12,16 @@
## Introduction
> ⚠️ This pipeline is still under development! While the pipeline is usable, not all functionality will be available!
**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic classification and profiling of shotgun metagenomic data. It allows for in-parallel taxonomic identification of reads or taxonomic abundance estimation with multiple classification and profiling tools against multiple databases, and produces standardised output tables.
The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/taxprofiler/results).
The nf-core/taxprofiler CI test dataset uses sequencing data from [Maixer et al. (2021) Curr. Bio.](https://doi.org/10.1016/j.cub.2021.09.031). The AWS full test dataset uses sequencing data and reference genomes from [Meslier (2022) _Sci. Data_](https://doi.org/10.1038/s41597-022-01762-z)
The nf-core/taxprofiler CI test dataset uses sequencing data from [Maixner et al. (2021) Curr. Bio.](https://doi.org/10.1016/j.cub.2021.09.031). The AWS full test dataset uses sequencing data and reference genomes from [Meslier (2022) _Sci. Data_](https://doi.org/10.1038/s41597-022-01762-z)
## Pipeline summary
<!-- TODO nf-core: Fill in short bullet-pointed list of the default steps in the pipeline -->
![](docs/images/taxprofiler_tube.png)
1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) or [`falco`](https://github.com/smithlabcode/falco) as an alternative option)
@ -46,7 +42,7 @@ The nf-core/taxprofiler CI test dataset uses sequencing data from [Maixer et al.
- [KrakenUniq](https://github.com/fbreitwieser/krakenuniq)
5. Perform optional post-processing with:
- [bracken](https://ccb.jhu.edu/software/bracken/)
6. Standardises output tables
6. Standardises output tables ([`Taxpasta`](https://taxpasta.readthedocs.io))
7. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
8. Plotting Kraken2, Centrifuge, Kaiju and MALT results ([`Krona`](https://hpc.nih.gov/apps/kronatools.html))
@ -81,11 +77,18 @@ The nf-core/taxprofiler pipeline comes with documentation about the pipeline [us
## Credits
nf-core/taxprofiler was originally written by nf-core community.
nf-core/taxprofiler was originally written by [James A. Fellows Yates](https://github.com/jfy133), [Moritz Beber](https://github.com/Midnighter), and [Sofia Stamouli](https://github.com/sofsam).
We thank the following people for their extensive assistance in the development of this pipeline:
We thank the following people for their contributions to the development of this pipeline:
[James A. Fellows Yates](https://github.com/jfy133), [Moritz Beber](https://github.com/Midnighter), [Lauri Mesilaakso](https://github.com/ljmesi), [Sofia Stamouli](https://github.com/sofsam), [Maxime Borry](https://github.com/maxibor),[Thomas A. Christensen II](https://github.com/MillironX), [Jianhong Ou](https://github.com/jianhong), [Rafal Stepien](https://github.com/rafalstepien), [Mahwash Jamy](https://github.com/mjamy).
[Lauri Mesilaakso](https://github.com/ljmesi), [Tanja Normark](https://github.com/talnor), [Maxime Borry](https://github.com/maxibor), [Thomas A. Christensen II](https://github.com/MillironX), [Jianhong Ou](https://github.com/jianhong), [Rafal Stepien](https://github.com/rafalstepien), [Mahwash Jamy](https://github.com/mjamy), and the [nf-core/community](https://nf-co.re/community).
We also are grateful for the feedback and comments from:
- [Alex Hübner](https://github.com/alexhbnr)
- [LilyAnderssonLee](https://github.com/LilyAnderssonLee)
Credit and thanks also go to [Zandra Fagernäs](https://github.com/ZandraFagernas) for the logo.
## Contributions and Support
@ -98,8 +101,6 @@ For further information or help, don't hesitate to get in touch on the [Slack `#
<!-- TODO nf-core: Add citation for pipeline after first release. Uncomment lines below and update Zenodo doi and badge at the top of this file. -->
<!-- If you use nf-core/taxprofiler for your analysis, please cite it using the following doi: [10.5281/zenodo.XXXXXX](https://doi.org/10.5281/zenodo.XXXXXX) -->
<!-- TODO nf-core: Add bibliography of tools and data used in your pipeline -->
An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.
You can cite the `nf-core` publication as follows:

View file

@ -4,7 +4,7 @@
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="description" content="nf-core/taxprofiler: Taxonomic profiling of shotgun metagenomic data">
<meta name="description" content="nf-core/taxprofiler: Taxonomic classification and profiling of shotgun metagenomic data">
<title>nf-core/taxprofiler Pipeline Report</title>
</head>
<body>

View file

@ -39,33 +39,30 @@ sp:
diamond:
contents: "diamond v"
num_lines: 10
#extra_fn_clean_exts:
# - '_fastp'
# - '.pe.settings'
# - '.se.settings'
fastqc/data:
fn_re: ".*(fastqc|falco)_data.txt$"
fastqc/zip:
fn: "*_fastqc.zip"
top_modules:
- "fastqc":
name: "FastQC (pre-Trimming)"
name: "FastQC / Falco (pre-Trimming)"
path_filters:
- "*raw_*fastqc.zip"
- "*raw*"
path_filters_exclude:
- "*processed*"
extra: "If used in this run, Falco is a drop-in replacement for FastQC producing the same output, written by Guilherme de Sena Brandine and Andrew D. Smith."
- "fastqc":
name: "Falco (pre-Trimming)"
name: "FastQC / Falco (post-Trimming)"
path_filters:
- "*_raw_falco_*_report.html"
- "*processed*"
path_filters_exclude:
- "*raw*"
extra: "If used in this run, Falco is a drop-in replacement for FastQC producing the same output, written by Guilherme de Sena Brandine and Andrew D. Smith."
- "fastp"
- "adapterRemoval"
- "porechop":
extra: ": if you get the error message 'Error - was not able to plot data.' this means that porechop did not detect any adapters and therefore no statistics generated."
- "fastqc":
name: "FastQC (post-Trimming)"
path_filters:
- "*_processed_*fastqc.zip"
- "fastqc":
name: "Falco (post-Trimming)"
path_filters:
- "*_processed_falco_*_report.html"
- "bbduk"
- "prinseqplusplus"
- "filtlong"
@ -105,19 +102,20 @@ top_modules:
#It is not possible to set placement for custom kraken and centrifuge columns.
table_columns_placement:
FastQC (pre-Trimming):
FastQC / Falco (pre-Trimming):
total_sequences: 100
avg_sequence_length: 110
median_sequence_length: 120
percent_duplicates: 130
percent_gc: 140
percent_fails: 150
Falco (pre-Trimming):
FastQC / Falco (post-Trimming):
total_sequences: 200
avg_sequence_length: 210
percent_duplicates: 220
percent_gc: 230
percent_fails: 240
median_sequence_length: 220
percent_duplicates: 230
percent_gc: 240
percent_fails: 250
fastp:
pct_adapter: 300
pct_surviving: 310
@ -141,19 +139,6 @@ table_columns_placement:
Middle Split Percent: 460
Filtlong:
Target bases: 500
FastQC (post-Trimming):
total_sequences: 600
avg_sequence_length: 610
median_sequence_length: 620
percent_duplicates: 630
percent_gc: 640
percent_fails: 650
Falco (post-Trimming):
total_sequences: 700
avg_sequence_length: 710
percent_duplicates: 720
percent_gc: 730
percent_fails: 740
BBDuk:
Input reads: 800
Total Removed bases percent: 810
@ -205,25 +190,18 @@ table_columns_placement:
"Number of ext-mOTUs": 1880
table_columns_visible:
FastQC (pre-Trimming):
FastQC / Falco (pre-Trimming):
total_sequences: True
avg_sequence_length: True
percent_duplicates: True
percent_gc: True
percent_fails: False
Falco (pre-Trimming):
FastQC / Falco (post-Trimming):
total_sequences: True
avg_sequence_length: True
percent_duplicates: True
percent_gc: True
percent_duplicates: False
percent_gc: False
percent_fails: False
fastp:
pct_adapter: True
pct_surviving: True
pct_duplication: False
after_filtering_gc_content: False
after_filtering_q30_rate: False
after_filtering_q30_bases: False
porechop:
Input reads: False
Start Trimmed:
@ -232,6 +210,13 @@ table_columns_visible:
End Trimmed Percent: True
Middle Split: False
Middle Split Percent: True
fastp:
pct_adapter: True
pct_surviving: True
pct_duplication: False
after_filtering_gc_content: False
after_filtering_q30_rate: False
after_filtering_q30_bases: False
Filtlong:
Target bases: True
Adapter Removal:
@ -239,18 +224,6 @@ table_columns_visible:
percent_aligned: True
percent_collapsed: True
percent_discarded: False
FastQC (post-Trimming):
total_sequences: True
avg_sequence_length: True
percent_duplicates: False
percent_gc: False
percent_fails: False
Falco (post-Trimming):
total_sequences: True
avg_sequence_length: True
percent_duplicates: False
percent_gc: False
percent_fails: False
BBDuk:
Input reads: False
Total Removed bases Percent: False
@ -278,25 +251,13 @@ table_columns_visible:
motus: False
table_columns_name:
FastQC (pre-Trimming):
FastQC / Falco (pre-Trimming):
total_sequences: "Nr. Input Reads"
avg_sequence_length: "Length Input Reads"
percent_gc: "% GC Input Reads"
percent_duplicates: "% Dups Input Reads"
percent_fails: "% Failed Input Reads"
Falco (pre-Trimming):
total_sequences: "Nr. Input Reads"
avg_sequence_length: "Length Input Reads"
percent_gc: "% GC Input Reads"
percent_duplicates: "% Dups Input Reads"
percent_fails: "% Failed Input Reads"
FastQC (post-Trimming):
total_sequences: "Nr. Processed Reads"
avg_sequence_length: "Length Processed Reads"
percent_gc: "% GC Processed Reads"
percent_duplicates: "% Dups Processed Reads"
percent_fails: "% Failed Processed Reads"
Falco (post-Trimming):
FastQC / Falco (post-Trimming):
total_sequences: "Nr. Processed Reads"
avg_sequence_length: "Length Processed Reads"
percent_gc: "% GC Processed Reads"
@ -314,7 +275,8 @@ extra_fn_clean_exts:
- ".bbduk"
- ".unmapped"
- "_filtered"
- "_processed"
- type: remove
pattern: "_falco"
section_comments:
general_stats: "By default, all read count columns are displayed as millions (M) of reads."

View file

@ -10,7 +10,6 @@
process {
// TODO nf-core: Check the defaults for all processes
cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
@ -24,7 +23,6 @@ process {
// These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
// If possible, it would be nice to keep the same label naming convention when
// adding in your local modules too.
// TODO nf-core: Customise requirements for specific processes.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
withLabel:process_single {
cpus = { check_max( 1 , 'cpus' ) }
@ -62,6 +60,19 @@ process {
withName:CUSTOM_DUMPSOFTWAREVERSIONS {
cache = false
}
withName: BRACKEN_BRACKEN {
errorStrategy = 'ignore'
}
withName: CENTRIFUGE_KREPORT {
errorStrategy = {task.exitStatus == 255 ? 'ignore' : 'retry'}
}
withName: KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE {
errorStrategy = { task.exitStatus in [255,1] ? 'ignore' : 'retry' }
}
withName: MEGAN_RMA2INFO_TSV {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }

View file

@ -196,8 +196,8 @@ process {
publishDir = [
path: { "${params.outdir}/bowtie2/build" },
mode: params.publish_dir_mode,
enabled: params.save_hostremoval_index,
pattern: 'bowtie2'
pattern: 'bowtie2',
enabled: params.save_hostremoval_index
]
}
@ -213,14 +213,14 @@ process {
[
path: { "${params.outdir}/bowtie2/align" },
mode: params.publish_dir_mode,
enabled: params.save_hostremoval_bam,
pattern: '*.bam'
pattern: '*.bam',
enabled: params.save_hostremoval_bam
],
[
path: { "${params.outdir}/bowtie2/align" },
mode: params.publish_dir_mode,
enabled: params.save_hostremoval_unmapped,
pattern: '*.fastq.gz'
pattern: '*.fastq.gz',
enabled: params.save_hostremoval_unmapped
]
]
}
@ -230,8 +230,8 @@ process {
publishDir = [
path: { "${params.outdir}/minimap2/index" },
mode: params.publish_dir_mode,
enabled: params.save_hostremoval_index,
pattern: '*.mmi'
pattern: '*.mmi',
enabled: params.save_hostremoval_index
]
}
@ -240,8 +240,8 @@ process {
publishDir = [
path: { "${params.outdir}/minimap2/align" },
mode: params.publish_dir_mode,
enabled: params.save_hostremoval_bam,
pattern: '*.bam'
pattern: '*.bam',
enabled: params.save_hostremoval_bam
]
}
@ -255,8 +255,8 @@ process {
publishDir = [
path: { "${params.outdir}/samtools/bam2fq" },
mode: params.publish_dir_mode,
enabled: params.save_hostremoval_unmapped,
pattern: '*.fq.gz'
pattern: '*.fq.gz',
enabled: params.save_hostremoval_unmapped
]
}
@ -354,7 +354,6 @@ process {
}
withName: BRACKEN_BRACKEN {
errorStrategy = 'ignore'
ext.args = { "${meta.db_params}" }
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.bracken" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.bracken" }
publishDir = [
@ -446,17 +445,16 @@ process {
}
withName: CENTRIFUGE_CENTRIFUGE {
ext.args = { "${meta.db_params}" }
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.centrifuge" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.centrifuge" }
publishDir = [
path: { "${params.outdir}/centrifuge/${meta.db_name}/" },
mode: params.publish_dir_mode,
pattern: '*.{txt,sam,gz}'
]
ext.args = { "${meta.db_params}" }
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.centrifuge" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.centrifuge" }
}
withName: CENTRIFUGE_KREPORT {
errorStrategy = {task.exitStatus == 255 ? 'ignore' : 'retry'}
ext.args = { "${meta.db_params}" }
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.centrifuge" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.centrifuge" }
publishDir = [
@ -467,7 +465,6 @@ process {
}
withName: KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE {
errorStrategy = { task.exitStatus in [255,1] ? 'ignore' : 'retry' }
ext.prefix = { "centrifuge_${meta.id}_combined_reports" }
publishDir = [
path: { "${params.outdir}/centrifuge/" },
@ -477,16 +474,16 @@ process {
}
withName: KAIJU_KAIJU {
ext.args = { "${meta.db_params}" }
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.kaiju" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.kaiju" }
publishDir = [
path: { "${params.outdir}/kaiju/${meta.db_name}/" },
mode: params.publish_dir_mode,
pattern: '*.tsv'
]
ext.args = { "${meta.db_params}" }
}
withName: '.*PROFILING:KAIJU_KAIJU2TABLE' {
withName: 'KAIJU_KAIJU2TABLE_SINGLE' {
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.kaijutable" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.kaijutable" }
publishDir = [
path: { "${params.outdir}/kaiju/${meta.db_name}/" },
@ -495,7 +492,7 @@ process {
]
}
withName: '.*STANDARDISATION_PROFILES:KAIJU_KAIJU2TABLE' {
withName: 'KAIJU_KAIJU2TABLE_COMBINED' {
ext.prefix = { "kaiju_${meta.id}_combined_reports" }
publishDir = [
path: { "${params.outdir}/kaiju/" },

View file

@ -45,10 +45,11 @@ params {
run_motus = false
run_krona = true
krona_taxonomy_directory = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab'
malt_save_reads = true
kraken2_save_reads = true
centrifuge_save_reads = true
diamond_save_reads = true
malt_save_reads = false
kraken2_save_reads = false
centrifuge_save_reads = false
diamond_save_reads = false
run_profile_standardisation = true
}
process {

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 690 KiB

After

Width:  |  Height:  |  Size: 714 KiB

File diff suppressed because it is too large Load diff

Before

Width:  |  Height:  |  Size: 269 KiB

After

Width:  |  Height:  |  Size: 289 KiB

View file

@ -6,8 +6,6 @@ This document describes the output produced by the pipeline. Most of the plots a
The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
<!-- TODO nf-core: Write this documentation describing your workflow's output -->
## Pipeline overview
The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
@ -18,12 +16,12 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [AdapterRemoval](#adapterremoval) - Adapter trimming for Illumina data
- [Porechop](#porechop) - Adapter removal for Oxford Nanopore data
- [BBDuk](#bbduk) - Quality trimming and filtering for Illumina data
- [PRINSEQ++](#prinseq++) - Quality trimming and filtering for Illunina data
- [PRINSEQ++](#prinseq) - Quality trimming and filtering for Illumina data
- [Filtlong](#filtlong) - Quality trimming and filtering for Nanopore data
- [Bowtie2](#bowtie2) - Host removal for Illumina reads
- [minimap2](#minimap2) - Host removal for Nanopore reads
- [SAMtools stats](#samtoolsstats) - Statistics from host removal
- [SAMtools bam2fq](#samtoolsfastq) - Converts unmapped BAM file to fastq format (minimap2 only)
- [SAMtools stats](#samtools-stats) - Statistics from host removal
- [SAMtools bam2fq](#samtools-bam2fq) - Converts unmapped BAM file to fastq format (minimap2 only)
- [Bracken](#bracken) - Taxonomic classifier using k-mers and abundance estimations
- [Kraken2](#kraken2) - Taxonomic classifier using exact k-mer matches
- [KrakenUniq](#krakenuniq) - Taxonomic classifier that combines the k-mer-based classification and the number of unique k-mers found in each species
@ -37,19 +35,25 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
### FastQC or falco
![](images/taxprofiler_tube.png)
### FastQC or Falco
<details markdown="1">
<summary>Output files</summary>
- `fastqc/`
- `*_fastqc.html`: FastQC report containing quality metrics.
- `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
- `{fastqc,falco}/`
- {raw,preprocessed}
- `*html`: FastQC or Falco report containing quality metrics in HTML format.
- `*.txt`: FastQC or Falco report containing quality metrics in TXT format.
- `*.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images (FastQC only).
</details>
[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
If preprocessing is turned on, nf-core/taxprofiler runs FastQC/Falco twice - once before and once after adapter removal/read merging - to allow evaluation of the performance of these preprocessing steps. Note that in the General Stats table, the columns of these two instances of FastQC/Falco are placed next to each other to make them easier to compare. However, the columns of the actual preprocessing steps (i.e., fastp, AdapterRemoval, and Porechop) will be displayed _after_ the two FastQC/Falco columns, even if they were run 'between' the two FastQC/Falco jobs in the pipeline itself.
> Falco produces identical output to FastQC but in the `falco/` directory.
![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png)
@ -58,8 +62,6 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png)
> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
### fastp
[fastp](https://github.com/OpenGene/fastp) is a FASTQ pre-processing tool for quality control, trimming of adapters, quality filtering and other features.
@ -188,9 +190,12 @@ It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) and/
<summary>Output files</summary>
- `bowtie2/`
- `<sample_id>.bam`: BAM file containing reads that aligned against the user-supplied reference genome as well as unmapped reads
- `<sample_id>.bowtie2.log`: log file about the mapped reads
- `<sample_id>.unmapped.fastq.gz`: the off-target reads from the mapping that is used in downstream steps.
- `build/`
- `*.bt2`: Bowtie2 indices of reference genome, only if `--save_hostremoval_index` supplied.
- `align/`
- `<sample_id>.bam`: BAM file containing reads that aligned against the user-supplied reference genome as well as unmapped reads
- `<sample_id>.bowtie2.log`: log file about the mapped reads
- `<sample_id>.unmapped.fastq.gz`: the off-target reads from the mapping that is used in downstream steps.
</details>
@ -212,7 +217,10 @@ It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) or o
<summary>Output files</summary>
- `minimap2`
- `<sample_id>.bam`: Alignment file in BAM format containing both mapped and unmapped reads.
- `build/`
- `*.mmi`: minimap2 indices of reference genome, only if `--save_hostremoval_index` supplied.
- `align/`
- `<sample_id>.bam`: Alignment file in BAM format containing both mapped and unmapped reads.
</details>
@ -245,13 +253,31 @@ This directory will be present and contain the unmapped reads from the `.fastq`
<details markdown="1">
<summary>Output files</summary>
- `samtoolsstats`
- `samtools/stats`
- `<sample_id>.stats`: File containing samtools stats output.
</details>
In most cases you do not need to check this file, as it is rendered in the MultiQC run report.
### Run Merging
nf-core/taxprofiler offers the option to merge FASTQ files of multiple sequencing runs or libraries that derive from the same sample, as specified in the input samplesheet.
This is the last preprocessing step, so if you have multiple runs or libraries (and run merging turned on), this will represent the final reads that will go into classification/profiling steps.
<details markdown="1">
<summary>Output files</summary>
- `run_merging/`
- `*.fastq.gz`: Concatenated FASTQ files on a per-sample basis
</details>
Note that you will only find samples that went through the run merging step in this directory. Samples that had a single run or library will not go through this step of the pipeline and thus will not be present in this directory.
⚠️ You must make sure to turn on the saving of the reads from the previous preprocessing step you may have turned on, if you have single-run or library reads in your pipeline run, and wish to save the final reads that go into classification/profiling!
### Bracken
[Bracken](https://ccb.jhu.edu/software/bracken/) (Bayesian Reestimation of Abundance with Kraken) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample. Bracken uses the taxonomy labels assigned by Kraken, a highly accurate metagenomics classification algorithm, to estimate the number of reads originating from each species present in a sample.

View file

@ -17,7 +17,7 @@ Both contain metadata and paths to the data of your input samples and databases.
When running nf-core/taxprofiler, every step and tool is 'opt in'. To run a given classifier or profiler you must make sure to supply both a database in your `<database>.csv` and supply `--run_<profiler>` flag to your command. Omitting either will result in the profiling tool not executing.
nf-core/profiler also includes optional pre-processing (adapter clipping, merge running etc.) or post-processing (visualisation) steps. These are also opt in with a `--perform_<step>` flag. In some cases, the pre- and post-processing steps may also require additional files. Please check the parameters tab of this documentation for more information.
nf-core/taxprofiler also includes optional pre-processing (adapter clipping, run merging etc.) or post-processing (visualisation) steps. These are also opt in with a `--perform_<step>` flag. In some cases, the pre- and post-processing steps may also require additional files. Please check the parameters tab of this documentation for more information.
Please see the rest of this page for information about how to prepare input samplesheets and databases and how to run Nextflow pipelines. See the [parameters](https://nf-co.re/taxprofiler/parameters) documentation for more information about specific options the pipeline also offers.
@ -89,7 +89,9 @@ The pipeline takes the paths and specific classification/profiling parameters of
> ⚠️ To allow user freedom, nf-core/taxprofiler does not check for the presence of mandatory, or the validity of, non-file database parameters for correct execution of the tool - excluding options offered via pipeline level parameters! Please validate your database parameters (cross-referencing [parameters](https://nf-co.re/taxprofiler/parameters), and the given tool documentation) before submitting the database sheet! For example, if you don't use the default read length - Bracken will require `-r <read_length>` in the `db_params` column.
An example database sheet can look as follows, where 5 tools are being used, and `malt` and `kraken2` will be used against two databases each. This is because specifying `bracken` implies first running `kraken2` on the same database.
An example database sheet can look as follows, where 7 tools are being used, and `malt` and `kraken2` will be used against two databases each.
`kraken2` will be run twice even though only having a single 'dedicated' database because specifying `bracken` implies first running `kraken2` on the `bracken` database, as required by `bracken`.
```console
tool,db_name,db_params,db_path
@ -199,7 +201,7 @@ You can optionally save the FASTQ output of the run merging with the `--save_com
> ⚠️ For nanopore data: we do not recommend performing any read preprocessing or complexity filtering if you are using ONT's Guppy toolkit for basecalling and post-processing.
#### Host Removal
#### Host-Read Removal
Removal of possible-host reads from FASTQ files prior to classification/profiling can be activated with `--perform_shortread_hostremoval` or `--perform_longread_hostremoval`.

View file

@ -12,11 +12,6 @@ class WorkflowTaxprofiler {
public static void initialise(params, log) {
genomeExistsError(params, log)
// TODO update as necessary
//if (!params.fasta) {
// log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file."
// System.exit(1)
//}
}
//

View file

@ -1,31 +0,0 @@
// Expose every staged read file under a `.fastq.gz` name by symlinking it to
// `<baseName>.fastq.gz`, and emit the symlinks alongside the unchanged meta map.
process ENSURE_FASTQ_EXTENSION {
    tag "$meta.id"
    label 'process_low'

    // The closing quote was missing here, leaving the string literal unterminated.
    conda "conda-forge::bash=5.0"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv2/biocontainers_v1.2.0_cv2.img' :
        'biocontainers/biocontainers:v1.2.0_cv2' }"

    input:
    // meta: sample metadata map (reads `meta.id` and `meta.single_end`)
    // reads: one file when `meta.single_end` is true, otherwise indexed as a pair
    tuple val(meta), path(reads)

    output:
    // Only the newly created `*.fastq.gz` links are captured
    tuple val(meta), path('*.fastq.gz'), emit: reads

    script:
    if (meta.single_end) {
        // Single-end: one link named after the input's baseName
        fastq = "${reads.baseName}.fastq.gz"
        """
        ln -s '${reads}' '${fastq}'
        """
    } else {
        // Paired-end: one link per mate, each named after its own baseName
        first = "${reads[0].baseName}.fastq.gz"
        second = "${reads[1].baseName}.fastq.gz"
        """
        ln -s '${reads[0]}' '${first}'
        ln -s '${reads[1]}' '${second}'
        """
    }
}

View file

@ -2,10 +2,10 @@ process KRAKEN2_STANDARD_REPORT {
tag "$meta.id"
label 'process_single'
conda "conda-forge::sed=4.8"
conda "conda-forge::sed=4.7"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv2/biocontainers_v1.2.0_cv2.img' :
'biocontainers/biocontainers:v1.2.0_cv2' }"
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
'ubuntu:20.04' }"
input:
tuple val(meta), path(report)

View file

@ -4,8 +4,8 @@ process KRONA_CLEANUP {
conda "conda-forge::sed=4.7"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' :
'biocontainers/biocontainers:v1.2.0_cv1' }"
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
'ubuntu:20.04' }"
input:
tuple val(meta), path(krona, stageAs: 'uncleaned.krona.txt')

View file

@ -1,5 +1,6 @@
process SAMPLESHEET_CHECK {
tag "$samplesheet"
label 'process_single'
conda "conda-forge::python=3.8.3"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
@ -13,6 +14,9 @@ process SAMPLESHEET_CHECK {
path '*.csv' , emit: csv
path "versions.yml", emit: versions
when:
task.ext.when == null || task.ext.when
script: // This script is bundled with the pipeline, in nf-core/taxprofiler/bin/
"""
check_samplesheet.py \\

View file

@ -9,7 +9,6 @@
// Global default params, used in configs
params {
// TODO nf-core: Specify your pipeline's command line flags
// Input options
input = null
@ -103,7 +102,7 @@ params {
shortread_hostremoval_index = null
longread_hostremoval_index = null
save_hostremoval_index = false
save_hostremoval_bam = false
save_hostremoval_bam = false
save_hostremoval_unmapped = false
@ -301,12 +300,12 @@ dag {
manifest {
name = 'nf-core/taxprofiler'
author = """nf-core community"""
author = """James A. Fellows Yates, Sofia Stamouli, Moritz E. Beber, Lauri Mesilaakso, Thomas A. Christensen II, Jianhong Ou, Mahwash Jamy, Maxime Borry, Rafal Stepien, Tanja Normark"""
homePage = 'https://github.com/nf-core/taxprofiler'
description = """Taxonomic profiling of shotgun metagenomic data"""
description = """Taxonomic classification and profiling of shotgun metagenomic data"""
mainScript = 'main.nf'
nextflowVersion = '!>=22.10.1'
version = '1.0dev'
version = '1.0.0'
doi = ''
}

View file

@ -67,7 +67,7 @@
"save_preprocessed_reads": {
"type": "boolean",
"fa_icon": "fas fa-save",
"description": "Save reads from adapter clipping/pair-merging, length filtering for both short and long reads",
"description": "Save reads from samples that went through the adapter clipping, pair-merging, and length filtering steps for both short and long reads",
"help_text": "This saves the FASTQ output from the following tools:\n\n- fastp\n- AdapterRemoval\n- Porechop\n- Filtlong\n\nThese reads will be a mixture of: adapter clipped, quality trimmed, pair-merged, and length filtered, depending on the parameters you set."
}
},
@ -116,7 +116,8 @@
"type": "string",
"default": "None",
"description": "Specify a list of all possible adapters to trim. Overrides --shortread_qc_adapter1/2. Formats: .txt (AdapterRemoval) or .fasta. (fastp).",
"help_text": "Allows to supply a file with a list of adapter (combinations) to remove from all files. \n\nOverrides the --shortread_qc_adapter1/--shortread_qc_adapter2 parameters . \n\nFor AdapterRemoval this consists of a two column table with a `.txt` extension: first column represents forward strand, second column for reverse strand. You must supply all possible combinations, one per line, and this list is applied to all files. See AdapterRemoval documentation for more information.\n\nFor fastp this consists of a standard FASTA format with a `.fasta`/`.fa`/`.fna`/`.fas` extension. The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. fastp trims the adapters present in the FASTA file one by one.\n\n> Modifies AdapterRemoval parameter: --adapter-list\n> Modifies fastp parameter: --adapter_fasta"
"help_text": "Allows to supply a file with a list of adapter (combinations) to remove from all files. \n\nOverrides the --shortread_qc_adapter1/--shortread_qc_adapter2 parameters . \n\nFor AdapterRemoval this consists of a two column table with a `.txt` extension: first column represents forward strand, second column for reverse strand. You must supply all possible combinations, one per line, and this list is applied to all files. See AdapterRemoval documentation for more information.\n\nFor fastp this consists of a standard FASTA format with a `.fasta`/`.fa`/`.fna`/`.fas` extension. The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. fastp trims the adapters present in the FASTA file one by one.\n\n> Modifies AdapterRemoval parameter: --adapter-list\n> Modifies fastp parameter: --adapter_fasta",
"fa_icon": "fas fa-th-list"
},
"shortread_qc_mergepairs": {
"type": "boolean",
@ -194,7 +195,7 @@
"save_complexityfiltered_reads": {
"type": "boolean",
"fa_icon": "fas fa-save",
"description": "Save complexity filtered short-reads",
"description": "Save reads from samples that went through the complexity filtering step",
"help_text": "Specify whether to save the final complexity filtered reads in your results directory (`--outdir`)."
}
},
@ -302,7 +303,7 @@
"save_hostremoval_unmapped": {
"type": "boolean",
"fa_icon": "fas fa-save",
"description": "Save unmapped reads in FASTQ format from host removal",
"description": "Save reads from samples that went through the host-removal step",
"help_text": "Save only the reads NOT mapped to the reference genome in FASTQ format (as exported from `samtools view` and `bam2fq`).\n\nThis can be useful if you wish to perform other analyses on the off-target reads from the host mapping, such as manual profiling or _de novo_ assembly."
}
},
@ -323,8 +324,8 @@
"save_runmerged_reads": {
"type": "boolean",
"fa_icon": "fas fa-save",
"description": "Save run-concatenated input FASTQ files for each sample",
"help_text": "Save the run- and library-concatenated reads of a given sample in FASTQ format."
"description": "Save reads from samples that went through the run-merging step",
"help_text": "Save the run- and library-concatenated reads of a given sample in FASTQ format.\n\n> \u26a0\ufe0f Only samples that went through the run-merging step of the pipeline will be stored in the resulting directory. \n\nIf you wish to save the files that go to the classification/profiling steps for samples that _did not_ go through run merging, you must supply the appropriate upstream `--save_<preprocessing_step>` flag.\n\n"
}
},
"fa_icon": "fas fa-clipboard-check"
@ -427,7 +428,7 @@
},
"run_bracken": {
"type": "boolean",
"description": "Post-process kraken2 reports with Bracken.",
"description": "Turn on Bracken (and the required Kraken2 prerequisite step).",
"fa_icon": "fas fa-toggle-on"
},
"run_malt": {
@ -513,34 +514,39 @@
"standardisation_taxpasta_format": {
"type": "string",
"default": "tsv",
"fa_icon": "fas fa-file",
"fa_icon": "fas fa-pastafarianism",
"description": "The desired output format.",
"enum": ["tsv", "csv", "arrow", "parquet", "biom"]
},
"taxpasta_taxonomy_dir": {
"type": "string",
"description": "The path to a directory containing taxdump files.",
"help_text": "This arguments provides the path to the directory containing taxdump files. At least nodes.dmp and names.dmp are required. A merged.dmp file is optional. \n\nModifies tool parameter(s):\n-taxpasta: `--taxpasta_taxonomy_dir`"
"help_text": "This arguments provides the path to the directory containing taxdump files. At least nodes.dmp and names.dmp are required. A merged.dmp file is optional. \n\nModifies tool parameter(s):\n-taxpasta: `--taxpasta_taxonomy_dir`",
"fa_icon": "fas fa-tree"
},
"taxpasta_add_name": {
"type": "boolean",
"description": "Add the taxon name to the output.",
"help_text": "The standard output format of taxpasta is a two-column table including the read counts and the integer taxonomic ID. The taxon name can be added as additional information to the output table.\n\nModifies tool parameter(s):\n- taxpasta: `--taxpasta_add_name`"
"help_text": "The standard output format of taxpasta is a two-column table including the read counts and the integer taxonomic ID. The taxon name can be added as additional information to the output table.\n\nModifies tool parameter(s):\n- taxpasta: `--taxpasta_add_name`",
"fa_icon": "fas fa-tag"
},
"taxpasta_add_rank": {
"type": "boolean",
"description": "Add the taxon rank to the output.",
"help_text": "The standard output format of taxpasta is a two-column table including the read counts and the integer taxonomic ID. The taxon rank can be added as additional information to the output table.\n\nModifies tool parameter(s):\n- taxpasta: `--taxpasta_add_rank`"
"help_text": "The standard output format of taxpasta is a two-column table including the read counts and the integer taxonomic ID. The taxon rank can be added as additional information to the output table.\n\nModifies tool parameter(s):\n- taxpasta: `--taxpasta_add_rank`",
"fa_icon": "fas fa-sort-amount-down-alt"
},
"taxpasta_add_lineage": {
"type": "boolean",
"description": "Add the taxon's entire lineage to the output.",
"help_text": "\nThe standard output format of taxpasta is a two-column table including the read counts and the integer taxonomic ID. The taxon's entire lineage with the taxon names separated by semi-colons can be added as additional information to the output table.\n\nModifies tool parameter(s):\n- taxpasta: `--taxpasta_add_lineage`\n"
"description": "Add the taxon's entire name lineage to the output.",
"help_text": "\nThe standard output format of taxpasta is a two-column table including the read counts and the integer taxonomic ID. The taxon's entire lineage with the taxon names separated by semi-colons can be added as additional information to the output table.\n\nModifies tool parameter(s):\n- taxpasta: `--taxpasta_add_lineage`\n",
"fa_icon": "fas fa-link"
},
"taxpasta_add_idlineage": {
"type": "boolean",
"description": "Add the taxon's entire lineage to the output.",
"help_text": "\nThe standard output format of taxpasta is a two-column table including the read counts and the integer taxonomic ID. The taxon's entire lineage with the taxon identifiers separated by semi-colons can be added as additional information to the output table.\n\nModifies tool parameter(s):\n- taxpasta: `--taxpasta_add_idlineage`\n"
"description": "Add the taxon's entire ID lineage to the output.",
"help_text": "\nThe standard output format of taxpasta is a two-column table including the read counts and the integer taxonomic ID. The taxon's entire lineage with the taxon identifiers separated by semi-colons can be added as additional information to the output table.\n\nModifies tool parameter(s):\n- taxpasta: `--taxpasta_add_idlineage`\n",
"fa_icon": "fas fa-link"
}
},
"fa_icon": "fas fa-chart-line"

View file

@ -28,22 +28,20 @@ workflow DB_CHECK {
// Normal checks for within-row validity, so can be moved to separate functions
parsed_samplesheet = Channel.fromPath(dbsheet)
.splitCsv ( header:true, sep:',' )
.map {
validate_db_rows(it)
create_db_channels(it)
.map { row ->
validate_db_rows(row)
return [ row.subMap(['tool', 'db_name', 'db_params']), file(row.db_path) ]
}
ch_dbs_for_untar = parsed_samplesheet
.branch {
untar: it[1].toString().endsWith(".tar.gz")
.branch { db_meta, db ->
untar: db.name.endsWith(".tar.gz")
skip: true
}
// Filter the channel to untar only those databases for tools that are selected to be run by the user.
ch_input_untar = ch_dbs_for_untar.untar
.filter {
params["run_${it[0]['tool']}"]
}
.filter { db_meta, db -> params["run_${db_meta.tool}"] }
UNTAR (ch_input_untar)
ch_versions = ch_versions.mix(UNTAR.out.versions.first())
@ -54,41 +52,27 @@ workflow DB_CHECK {
versions = ch_versions // channel: [ versions.yml ]
}
def validate_db_rows(LinkedHashMap row){
def validate_db_rows(LinkedHashMap row) {
// check minimum number of columns
if (row.size() < 4) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database input sheet - malformed row (e.g. missing column). See documentation for more information. Error in: ${row}"
// check minimum number of columns
if (row.size() < 4) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database input sheet - malformed row (e.g. missing column). See documentation for more information. Error in: ${row}"
// all columns there
def expected_headers = ['tool', 'db_name', 'db_params', 'db_path']
if ( !row.keySet().containsAll(expected_headers) ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database input sheet - malformed column names. Please check input TSV. Column names should be: ${expected_keys.join(", ")}"
// all columns there
def expected_headers = ['tool', 'db_name', 'db_params', 'db_path']
if ( !row.keySet().containsAll(expected_headers) ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database input sheet - malformed column names. Please check input TSV. Column names should be: ${expected_headers.join(", ")}"
// valid tools specified// TIFNISIH LIST
def expected_tools = [ "bracken", "centrifuge", "diamond", "kaiju", "kraken2", "krakenuniq", "malt", "metaphlan3", "motus" ]
if ( !expected_tools.contains(row.tool) ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid tool name. Please see documentation for all supported profilers. Error in: ${row}"
// valid tools specified
def expected_tools = [ "bracken", "centrifuge", "diamond", "kaiju", "kraken2", "krakenuniq", "malt", "metaphlan3", "motus" ]
if ( !expected_tools.contains(row.tool) ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid tool name. Please see documentation for all supported profilers. Error in: ${row}"
// detect quotes in params
if ( row.db_params.contains('"') ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
if ( row.db_params.contains("'") ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
// detect quotes in params
if ( row.db_params.contains('"') ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
if ( row.db_params.contains("'") ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
// check if any form of bracken params, that it must have `;`
if ( row.tool == 'bracken' && row.db_params && !row.db_params.contains(";") ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. Bracken requires a semi-colon if passing parameter. Error in: ${row}"
// check if any form of bracken params, that it must have `;`
if ( row.tool == 'bracken' && row.db_params && !row.db_params.contains(";") ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. Bracken requires a semi-colon if passing parameter. Error in: ${row}"
// ensure that the database directory exists
if (!file(row.db_path, type: 'dir').exists()) exit 1, "ERROR: Please check input samplesheet -> database path could not be found!\n${row.db_path}"
}
// Turn one database-sheet row into a `[ meta, db_path ]` pair, where `meta`
// carries the row's `tool`, `db_name` and `db_params` values.
// Aborts with exit code 1 when `row.db_path` does not exist.
def create_db_channels(LinkedHashMap row) {
    // Guard clause: stop before building anything if the database is missing
    if (!file(row.db_path, type: 'dir').exists()) {
        exit 1, "ERROR: Please check input samplesheet -> database path could not be found!\n${row.db_path}"
    }

    def db_meta = [
        tool     : row.tool,
        db_name  : row.db_name,
        db_params: row.db_params
    ]

    return [ db_meta, file(row.db_path) ]
}

View file

@ -12,9 +12,9 @@ workflow INPUT_CHECK {
parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet )
.csv
.splitCsv ( header:true, sep:',' )
.branch {
fasta: it['fasta'] != ''
nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE'
.branch { row ->
fasta: row.fasta != ''
nanopore: row.instrument_platform == 'OXFORD_NANOPORE'
fastq: true
}
@ -37,49 +37,42 @@ workflow INPUT_CHECK {
// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
def create_fastq_channel(LinkedHashMap row) {
// create meta map
def meta = [:]
meta.id = row.sample
meta.run_accession = row.run_accession
meta.instrument_platform = row.instrument_platform
meta.single_end = row.single_end.toBoolean()
meta.is_fasta = false
def meta = row.subMap(['sample', 'run_accession', 'instrument_platform'])
meta.id = meta.sample
meta.single_end = row.single_end.toBoolean()
meta.is_fasta = false
// add path(s) of the fastq file(s) to the meta map
def fastq_meta = []
if (!file(row.fastq_1).exists()) {
exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}"
}
if (meta.single_end) {
fastq_meta = [ meta, [ file(row.fastq_1) ] ]
return [ meta, [ file(row.fastq_1) ] ]
} else {
if (meta.instrument_platform == 'OXFORD_NANOPORE') {
if (row.fastq_2 != '') {
exit 1, "ERROR: Please check input samplesheet -> For Oxford Nanopore reads Read 2 FastQ should be empty!\n${row.fastq_2}"
}
fastq_meta = [ meta, [ file(row.fastq_1) ] ]
return [ meta, [ file(row.fastq_1) ] ]
} else {
if (!file(row.fastq_2).exists()) {
exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
}
fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
return [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
}
}
return fastq_meta
}// Function to get list of [ meta, fasta ]
def create_fasta_channel(LinkedHashMap row) {
def meta = [:]
meta.id = row.sample
meta.run_accession = row.run_accession
meta.instrument_platform = row.instrument_platform
meta.single_end = true
meta.is_fasta = true
}
// Function to get list of [ meta, fasta ]
def create_fasta_channel(LinkedHashMap row) {
def meta = row.subMap(['sample', 'run_accession', 'instrument_platform'])
meta.id = meta.sample
meta.single_end = true
meta.is_fasta = true
def array = []
if (!file(row.fasta).exists()) {
exit 1, "ERROR: Please check input samplesheet -> FastA file does not exist!\n${row.fasta}"
}
array = [ meta, [ file(row.fasta) ] ]
return array
return [ meta, [ file(row.fasta) ] ]
}

View file

@ -46,7 +46,7 @@ workflow LONGREAD_HOSTREMOVAL {
ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions.first() )
bam_bai = MINIMAP2_ALIGN.out.bam
.join(SAMTOOLS_INDEX.out.bai, remainder: true)
.join(SAMTOOLS_INDEX.out.bai)
SAMTOOLS_STATS ( bam_bai, reference )
ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions.first())

View file

@ -20,33 +20,23 @@ workflow LONGREAD_PREPROCESSING {
PORECHOP_PORECHOP ( reads )
ch_processed_reads = PORECHOP_PORECHOP.out.reads
.map {
meta, reads ->
def meta_new = meta.clone()
meta_new['single_end'] = 1
[ meta_new, reads ]
}
.map { meta, reads -> [ meta + [single_end: 1], reads ] }
ch_versions = ch_versions.mix(PORECHOP_PORECHOP.out.versions.first())
ch_multiqc_files = ch_multiqc_files.mix( PORECHOP_PORECHOP.out.log )
} else if ( params.longread_qc_skipadaptertrim && !params.longread_qc_skipqualityfilter) {
ch_processed_reads = FILTLONG ( reads.map{ meta, reads -> [meta, [], reads ]} )
ch_processed_reads = FILTLONG ( reads.map { meta, reads -> [meta, [], reads ] } )
ch_versions = ch_versions.mix(FILTLONG.out.versions.first())
ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log )
} else {
PORECHOP_PORECHOP ( reads )
ch_clipped_reads = PORECHOP_PORECHOP.out.reads
.map {
meta, reads ->
def meta_new = meta.clone()
meta_new['single_end'] = 1
[ meta_new, reads ]
}
.map { meta, reads -> [ meta + [single_end: 1], reads ] }
ch_processed_reads = FILTLONG ( ch_clipped_reads.map{ meta, reads -> [meta, [], reads ]} ).reads
ch_processed_reads = FILTLONG ( ch_clipped_reads.map { meta, reads -> [ meta, [], reads ] } ).reads
ch_versions = ch_versions.mix(PORECHOP_PORECHOP.out.versions.first())
ch_versions = ch_versions.mix(FILTLONG.out.versions.first())

View file

@ -2,19 +2,19 @@
// Run profiling
//
include { MALT_RUN } from '../../modules/nf-core/malt/run/main'
include { MEGAN_RMA2INFO as MEGAN_RMA2INFO_TSV } from '../../modules/nf-core/megan/rma2info/main'
include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/kraken2/kraken2/main'
include { KRAKEN2_STANDARD_REPORT } from '../../modules/local/kraken2_standard_report'
include { BRACKEN_BRACKEN } from '../../modules/nf-core/bracken/bracken/main'
include { CENTRIFUGE_CENTRIFUGE } from '../../modules/nf-core/centrifuge/centrifuge/main'
include { CENTRIFUGE_KREPORT } from '../../modules/nf-core/centrifuge/kreport/main'
include { METAPHLAN3_METAPHLAN3 } from '../../modules/nf-core/metaphlan3/metaphlan3/main'
include { KAIJU_KAIJU } from '../../modules/nf-core/kaiju/kaiju/main'
include { KAIJU_KAIJU2TABLE } from '../../modules/nf-core/kaiju/kaiju2table/main'
include { DIAMOND_BLASTX } from '../../modules/nf-core/diamond/blastx/main'
include { MOTUS_PROFILE } from '../../modules/nf-core/motus/profile/main'
include { KRAKENUNIQ_PRELOADEDKRAKENUNIQ } from '../../modules/nf-core/krakenuniq/preloadedkrakenuniq/main'
include { MALT_RUN } from '../../modules/nf-core/malt/run/main'
include { MEGAN_RMA2INFO as MEGAN_RMA2INFO_TSV } from '../../modules/nf-core/megan/rma2info/main'
include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/kraken2/kraken2/main'
include { KRAKEN2_STANDARD_REPORT } from '../../modules/local/kraken2_standard_report'
include { BRACKEN_BRACKEN } from '../../modules/nf-core/bracken/bracken/main'
include { CENTRIFUGE_CENTRIFUGE } from '../../modules/nf-core/centrifuge/centrifuge/main'
include { CENTRIFUGE_KREPORT } from '../../modules/nf-core/centrifuge/kreport/main'
include { METAPHLAN3_METAPHLAN3 } from '../../modules/nf-core/metaphlan3/metaphlan3/main'
include { KAIJU_KAIJU } from '../../modules/nf-core/kaiju/kaiju/main'
include { KAIJU_KAIJU2TABLE as KAIJU_KAIJU2TABLE_SINGLE } from '../../modules/nf-core/kaiju/kaiju2table/main'
include { DIAMOND_BLASTX } from '../../modules/nf-core/diamond/blastx/main'
include { MOTUS_PROFILE } from '../../modules/nf-core/motus/profile/main'
include { KRAKENUNIQ_PRELOADEDKRAKENUNIQ } from '../../modules/nf-core/krakenuniq/preloadedkrakenuniq/main'
workflow PROFILING {
take:
@ -35,10 +35,7 @@ workflow PROFILING {
ch_input_for_profiling = reads
.map {
meta, reads ->
def meta_new = meta.clone()
pairtype = meta_new['single_end'] ? '_se' : '_pe'
meta_new['id'] = meta_new['id'] + pairtype
[meta_new, reads]
[meta + [id: "${meta.id}${meta.single_end ? '_se' : '_pe'}"], reads]
}
.combine(databases)
.branch {
@ -68,34 +65,34 @@ workflow PROFILING {
// MALT: We groupTuple to have all samples in one channel for MALT as database
// loading takes a long time, so we only want to run it once per database
ch_input_for_malt = ch_input_for_profiling.malt
.map {
meta, reads, db_meta, db ->
.map {
meta, reads, db_meta, db ->
// Reset entire input meta for MALT to just database name,
// as we don't run run on a per-sample basis due to huge datbaases
// so all samples are in one run and so sample-specific metadata
// unnecessary. Set as database name to prevent `null` job ID and prefix.
def temp_meta = [ id: meta['db_name'] ]
// Reset entire input meta for MALT to just database name,
// as we don't run on a per-sample basis due to huge databases
// so all samples are in one run and so sample-specific metadata is
// unnecessary. Set as database name to prevent `null` job ID and prefix.
def temp_meta = [ id: meta['db_name'] ]
// Extend database parameters to specify whether to save alignments or not
def new_db_meta = db_meta.clone()
def sam_format = params.malt_save_reads ? ' --alignments ./ -za false' : ""
new_db_meta['db_params'] = db_meta['db_params'] + sam_format
// Extend database parameters to specify whether to save alignments or not
def new_db_meta = db_meta.clone()
def sam_format = params.malt_save_reads ? ' --alignments ./ -za false' : ""
new_db_meta['db_params'] = db_meta['db_params'] + sam_format
// Combine reduced sample metadata with updated database parameters metadata,
// make sure id is db_name for publishing purposes.
def new_meta = temp_meta + new_db_meta
new_meta['id'] = new_meta['db_name']
// Combine reduced sample metadata with updated database parameters metadata,
// make sure id is db_name for publishing purposes.
def new_meta = temp_meta + new_db_meta
new_meta['id'] = new_meta['db_name']
[ new_meta, reads, db ]
[ new_meta, reads, db ]
}
.groupTuple(by: [0,2])
.multiMap {
it ->
reads: [ it[0], it[1].flatten() ]
db: it[2]
}
}
.groupTuple(by: [0,2])
.multiMap {
meta, reads, db ->
reads: [ meta, reads.flatten() ]
db: db
}
MALT_RUN ( ch_input_for_malt.reads, ch_input_for_malt.db )
@ -120,12 +117,11 @@ workflow PROFILING {
}
if ( params.run_kraken2 ) {
if ( params.run_kraken2 || params.run_bracken ) {
// Have to pick first element of db_params if using bracken,
// as db sheet for bracken must have ; sep list to
// distinguish between kraken and bracken parameters
ch_input_for_kraken2 = ch_input_for_profiling.kraken2
.dump(tag: "ch_input_for_kraken2_b4")
.map {
meta, reads, db_meta, db ->
def db_meta_new = db_meta.clone()
@ -272,10 +268,10 @@ workflow PROFILING {
ch_versions = ch_versions.mix( KAIJU_KAIJU.out.versions.first() )
ch_raw_classifications = ch_raw_classifications.mix( KAIJU_KAIJU.out.results )
KAIJU_KAIJU2TABLE ( KAIJU_KAIJU.out.results, ch_input_for_kaiju.db, params.kaiju_taxon_rank)
ch_versions = ch_versions.mix( KAIJU_KAIJU2TABLE.out.versions )
ch_multiqc_files = ch_multiqc_files.mix( KAIJU_KAIJU2TABLE.out.summary )
ch_raw_profiles = ch_raw_profiles.mix( KAIJU_KAIJU2TABLE.out.summary )
KAIJU_KAIJU2TABLE_SINGLE ( KAIJU_KAIJU.out.results, ch_input_for_kaiju.db, params.kaiju_taxon_rank)
ch_versions = ch_versions.mix( KAIJU_KAIJU2TABLE_SINGLE.out.versions )
ch_multiqc_files = ch_multiqc_files.mix( KAIJU_KAIJU2TABLE_SINGLE.out.summary )
ch_raw_profiles = ch_raw_profiles.mix( KAIJU_KAIJU2TABLE_SINGLE.out.summary )
}
if ( params.run_diamond ) {
@ -329,7 +325,7 @@ workflow PROFILING {
reads: [ single_meta + db_meta, reads.flatten() ]
db: db
}
// Hardcode to _always_ produce the report file (which is our basic otput, and goes into)
// Hardcode to _always_ produce the report file (which is our basic output, and goes into)
KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.krakenuniq_ram_chunk_size, params.krakenuniq_save_reads, true, params.krakenuniq_save_readclassifications )
ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report )
ch_versions = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() )

View file

@ -3,7 +3,7 @@
//
include { BRACKEN_COMBINEBRACKENOUTPUTS } from '../../modules/nf-core/bracken/combinebrackenoutputs/main'
include { KAIJU_KAIJU2TABLE } from '../../modules/nf-core/kaiju/kaiju2table/main'
include { KAIJU_KAIJU2TABLE as KAIJU_KAIJU2TABLE_COMBINED } from '../../modules/nf-core/kaiju/kaiju2table/main'
include { KRAKENTOOLS_COMBINEKREPORTS as KRAKENTOOLS_COMBINEKREPORTS_KRAKEN } from '../../modules/nf-core/krakentools/combinekreports/main'
include { KRAKENTOOLS_COMBINEKREPORTS as KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE } from '../../modules/nf-core/krakentools/combinekreports/main'
include { METAPHLAN3_MERGEMETAPHLANTABLES } from '../../modules/nf-core/metaphlan3/mergemetaphlantables/main'
@ -103,9 +103,9 @@ workflow STANDARDISATION_PROFILES {
[[id:it[0]], it[1]]
}
KAIJU_KAIJU2TABLE ( ch_profiles_for_kaiju, ch_input_databases.kaiju.map{it[1]}, params.kaiju_taxon_rank)
ch_multiqc_files = ch_multiqc_files.mix( KAIJU_KAIJU2TABLE.out.summary )
ch_versions = ch_versions.mix( KAIJU_KAIJU2TABLE.out.versions )
KAIJU_KAIJU2TABLE_COMBINED ( ch_profiles_for_kaiju, ch_input_databases.kaiju.map{it[1]}, params.kaiju_taxon_rank)
ch_multiqc_files = ch_multiqc_files.mix( KAIJU_KAIJU2TABLE_COMBINED.out.summary )
ch_versions = ch_versions.mix( KAIJU_KAIJU2TABLE_COMBINED.out.versions )
// Kraken2
@ -151,7 +151,7 @@ workflow STANDARDISATION_PROFILES {
MOTUS_MERGE ( ch_profiles_for_motus, ch_input_databases.motus.map{it[1]}, motu_version )
ch_versions = ch_versions.mix( MOTUS_MERGE.out.versions )
emit:
taxpasta = TAXPASTA_MERGE.out.merged_profiles
versions = ch_versions

View file

@ -9,13 +9,13 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
// Validate input parameters
WorkflowTaxprofiler.initialise(params, log)
// TODO nf-core: Add all file path parameters for the pipeline to the list below
// Check input path parameters to see if they exist
def checkPathParamList = [ params.input, params.genome, params.databases,
params.outdir, params.longread_hostremoval_index,
params.hostremoval_reference, params.shortread_hostremoval_index,
params.multiqc_config, params.shortread_qc_adapterlist,
params.krona_taxonomy_directory,
params.taxpasta_taxonomy_dir,
params.multiqc_logo, params.multiqc_methods_description
]
for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
@ -301,7 +301,6 @@ workflow TAXPROFILER {
ch_multiqc_files = ch_multiqc_files.mix( STANDARDISATION_PROFILES.out.mqc.collect{it[1]}.ifEmpty([]) )
}
// TODO create multiQC module for metaphlan
MULTIQC (
ch_multiqc_files.collect(),
ch_multiqc_config.toList(),