Merge branch 'dev' into nf-core-template-merge-2.8

dev
James A. Fellows Yates 1 year ago committed by GitHub
commit bca6e645a8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -15,9 +15,6 @@ jobs:
steps:
- name: Launch workflow via tower
uses: seqeralabs/action-tower-launch@v1
# TODO nf-core: You can customise AWS full pipeline tests as required
# Add full size test data (but still relatively small datasets for few samples)
# on the `test_full.config` test runs with only one set of parameters
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}

@ -22,10 +22,18 @@ jobs:
if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/taxprofiler') }}"
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
NXF_VER:
- "22.10.1"
- "latest-everything"
parameters:
- "--preprocessing_qc_tool falco"
- "--shortread_qc_tool fastp"
- "--shortread_qc_tool adapterremoval"
- "--shortread_complexityfilter_tool bbduk"
- "--shortread_complexityfilter_tool prinseqplusplus"
steps:
- name: Check out pipeline code
uses: actions/checkout@v3
@ -35,9 +43,122 @@ jobs:
with:
version: "${{ matrix.NXF_VER }}"
- name: Show current locale
run: locale
- name: Set UTF-8 enabled locale
run: |
sudo locale-gen en_US.UTF-8
sudo update-locale LANG=en_US.UTF-8
- name: Run pipeline with test data
# TODO nf-core: You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
# Remember that you can parallelise this by using strategy.matrix
uses: Wandalen/wretry.action@v1.0.11
with:
command: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results ${{ matrix.parameters }}
attempt_limit: 3
motus:
name: Test mOTUs with workflow parameters
if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/taxprofiler') }}
runs-on: ubuntu-latest
strategy:
matrix:
NXF_VER:
- "22.10.1"
- "latest-everything"
steps:
- name: Check out pipeline code
uses: actions/checkout@v2
- name: Install Nextflow
uses: nf-core/setup-nextflow@v1
with:
version: "${{ matrix.NXF_VER }}"
- name: Show current locale
run: locale
- name: Set UTF-8 enabled locale
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
sudo locale-gen en_US.UTF-8
sudo update-locale LANG=en_US.UTF-8
- name: Prepare the database
run: |
wget https://raw.githubusercontent.com/motu-tool/mOTUs/master/motus/downloadDB.py
python downloadDB.py > download_db_log.txt
echo 'tool,db_name,db_params,db_path' > 'database_motus.csv'
echo 'motus,db_mOTU,,db_mOTU' >> 'database_motus.csv'
- name: Run pipeline with test data
uses: Wandalen/wretry.action@v1.0.11
with:
command: nextflow run ${GITHUB_WORKSPACE} -profile test_motus,docker --outdir ./results --databases ./database_motus.csv
attempt_limit: 3
krakenuniq:
name: Test KrakenUniq with workflow parameters
if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/taxprofiler') }}
runs-on: ubuntu-latest
strategy:
matrix:
NXF_VER:
- "22.10.1"
- "latest-everything"
steps:
- name: Check out pipeline code
uses: actions/checkout@v2
- name: Install Nextflow
uses: nf-core/setup-nextflow@v1
with:
version: "${{ matrix.NXF_VER }}"
- name: Show current locale
run: locale
- name: Set UTF-8 enabled locale
run: |
sudo locale-gen en_US.UTF-8
sudo update-locale LANG=en_US.UTF-8
- name: Run pipeline with test data
uses: Wandalen/wretry.action@v1.0.11
with:
command: nextflow run ${GITHUB_WORKSPACE} -profile test_krakenuniq,docker --outdir ./results
attempt_limit: 3
malt:
name: Test MALT with workflow parameters
if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/taxprofiler') }}
runs-on: ubuntu-latest
strategy:
matrix:
NXF_VER:
- "22.10.1"
- "latest-everything"
steps:
- name: Check out pipeline code
uses: actions/checkout@v2
- name: Install Nextflow
uses: nf-core/setup-nextflow@v1
with:
version: "${{ matrix.NXF_VER }}"
- name: Show current locale
run: locale
- name: Set UTF-8 enabled locale
run: |
sudo locale-gen en_US.UTF-8
sudo update-locale LANG=en_US.UTF-8
- name: Run pipeline with test data
uses: Wandalen/wretry.action@v1.0.11
with:
command: nextflow run ${GITHUB_WORKSPACE} -profile test_nothing,docker --run_malt --input 'https://github.com/nf-core/test-datasets/raw/taxprofiler/samplesheet_shortreadsonly.csv' --outdir ./results
attempt_limit: 3

@ -10,3 +10,4 @@ testing/
testing*
*.pyc
bin/
tests/

@ -5,8 +5,37 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## v1.1.0dev - [date]
### `Added`
- [#276](https://github.com/nf-core/taxprofiler/pull/276) Implemented batching in the KrakenUniq samples processing. (added by @Midnighter)
- [#272](https://github.com/nf-core/taxprofiler/pull/272) - Add saving of final 'analysis-ready-reads' to dedicated directory. (❤️ to @alexhbnr for reporting, added by @jfy133)
### `Fixed`
- [#271](https://github.com/nf-core/taxprofiler/pull/271/files) Improved standardised table generation documentation nd mOTUs manual database download tutorial (♥ to @prototaxites for reporting, fix by @jfy133)
- [#269](https://github.com/nf-core/taxprofiler/pull/269/files) Reduced output files in AWS full test output due to very large files
- [#270](https://github.com/nf-core/taxprofiler/pull/270/files) Fixed warning for host removal index parameter, and improved index checks (♥ to @prototaxites for reporting, fix by @jfy133)
- [#274](https://github.com/nf-core/taxprofiler/pull/274/files) Substituted the samtools/bam2fq module with samtools/fastq module (fix by @sofstam)
- [#275](https://github.com/nf-core/taxprofiler/pull/275/files) Replaced function used for error reporting to more Nextflow friendly method (fix by @jfy133)
- [#285](https://github.com/nf-core/taxprofiler/pull/285/files) Fixed overly large log files in Kraken2 output (♥ to @prototaxites for reporting, fix by @Midnighter & @jfy133)
- [#286](https://github.com/nf-core/taxprofiler/pull/286/files) Runtime optimisation of MultiQC step via improved log file processing (fix by @Midnighter & @jfy133)
### `Dependencies`
### `Deprecated`
## v1.0.0 - Dodgy Dachshund [2023-03-13]
Initial release of nf-core/taxprofiler, created with the [nf-core](https://nf-co.re/) template.
- Add read quality control (sequencing QC, adapter removal and merging)
- Add read complexity filtering
- Add host-reads removal step
- Add run merging
- Add taxonomic classification
- Add taxon table standardisation
- Add post-classification visualisation
### `Added`
### `Fixed`

@ -13,8 +13,87 @@
- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
- [falco](https://doi.org/10.12688/f1000research.21142.2)
> de Sena Brandine G and Smith AD. Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Research 2021, 8:1874
- [fastp](https://doi.org/10.1093/bioinformatics/bty560)
> Chen, Shifu, Yanqing Zhou, Yaru Chen, and Jia Gu. 2018. Fastp: An Ultra-Fast All-in-One FASTQ Preprocessor. Bioinformatics 34 (17): i884-90. 10.1093/bioinformatics/bty560.
- [AdapterRemoval2](https://doi.org/10.1186/s13104-016-1900-2)
> Schubert, Mikkel, Stinus Lindgreen, and Ludovic Orlando. 2016. AdapterRemoval v2: Rapid Adapter Trimming, Identification, and Read Merging. BMC Research Notes 9 (February): 88. doi:10.1186/s13104-016-1900-2.
- [Porechop](https://github.com/rrwick/Porechop)
- [FILTLONG](https://github.com/rrwick/Filtlong)
- [BBTools](http://sourceforge.net/projects/bbmap/)
- [PRINSEQ++](https://doi.org/10.7287/peerj.preprints.27553v1)
> Cantu, Vito Adrian, Jeffrey Sadural, and Robert Edwards. 2019. PRINSEQ++, a Multi-Threaded Tool for Fast and Efficient Quality Control and Preprocessing of Sequencing Datasets. e27553v1. PeerJ Preprints. doi: 10.7287/peerj.preprints.27553v1.
- [Bowtie2](https://doi.org/10.1038/nmeth.1923)
> Langmead, B., & Salzberg, S. L. (2012). Fast gapped-read alignment with Bowtie 2. Nature Methods, 9(4), 357359. doi: 10.1038/nmeth.1923
- [minimap2](https://doi.org/10.1093/bioinformatics/bty191)
> Li, H. (2018). Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics , 34(18), 30943100. doi: 10.1093/bioinformatics/bty191
- [SAMTools](https://doi.org/10.1093/gigascience/giab008)
> Danecek, P., Bonfield, J. K., Liddle, J., Marshall, J., Ohan, V., Pollard, M. O., Whitwham, A., Keane, T., McCarthy, S. A., Davies, R. M., & Li, H. (2021). Twelve years of SAMtools and BCFtools. GigaScience, 10(2). doi: 10.1093/gigascience/giab008
- [Bracken](https://doi.org/10.7717/peerj-cs.104)
> Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: Estimating species abundance in metagenomics data. PeerJ Computer Science, 3, e104. doi: 10.7717/peerj-cs.104
- [Kraken2](https://doi.org/10.1186/s13059-019-1891-0)
> Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0.
- [KrakenUniq](https://doi.org/10.1186/s13059-018-1568-0)
> Breitwieser, Florian P., Daniel N. Baker, and Steven L. Salzberg. 2018. KrakenUniq: confident and fast metagenomics classification using unique k-mer counts. Genome Biology 19 (1): 198. doi: 10.1186/s13059-018-1568-0
- [MetaPhlAn3](https://doi.org/10.7554/eLife.65088)
> Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088. doi: 10.7554/eLife.65088
- [MALT](https://doi.org/10.1038/s41559-017-0446-6)
> Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico. Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6.
- [MEGAN](https://doi.org/10.1371/journal.pcbi.1004957)
> Huson, Daniel H., Sina Beier, Isabell Flade, Anna Górska, Mohamed El-Hadidi, Suparna Mitra, Hans-Joachim Ruscheweyh, and Rewati Tappu. 2016. “MEGAN Community Edition - Interactive Exploration and Analysis of Large-Scale Microbiome Sequencing Data.” PLoS Computational Biology 12 (6): e1004957. doi: 10.1371/journal.pcbi.1004957.
- [DIAMOND](https://doi.org/10.1038/nmeth.3176)
> Buchfink, Benjamin, Chao Xie, and Daniel H. Huson. 2015. “Fast and Sensitive Protein Alignment Using DIAMOND.” Nature Methods 12 (1): 59-60. doi: 10.1038/nmeth.3176.
- [Centrifuge](https://doi.org/10.1101/gr.210641.116)
> Kim, Daehwan, Li Song, Florian P. Breitwieser, and Steven L. Salzberg. 2016. “Centrifuge: Rapid and Sensitive Classification of Metagenomic Sequences.” Genome Research 26 (12): 1721-29. doi: 10.1101/gr.210641.116.
- [Kaiju](https://doi.org/10.1038/ncomms11257)
> Menzel, P., Ng, K. L., & Krogh, A. (2016). Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nature Communications, 7, 11257. doi: 10.1038/ncomms11257
- [mOTUs](https://doi.org/10.1186/s40168-022-01410-z)
> Ruscheweyh, H.-J., Milanese, A., Paoli, L., Karcher, N., Clayssen, Q., Keller, M. I., Wirbel, J., Bork, P., Mende, D. R., Zeller, G., & Sunagawa, S. (2022). Cultivation-independent genomes greatly expand taxonomic-profiling capabilities of mOTUs across various environments. Microbiome, 10(1), 212. doi: 10.1186/s40168-022-01410-z
- [Krona](https://doi.org/10.1186/1471-2105-12-385)
> Ondov, Brian D., Nicholas H. Bergman, and Adam M. Phillippy. 2011. Interactive metagenomic visualization in a Web browser. BMC Bioinformatics 12 (1): 385. doi: 10.1186/1471-2105-12-385.
## Software packaging/containerisation tools
- [Anaconda](https://anaconda.com)
@ -33,3 +112,13 @@
- [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/)
> Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675.
## Data
- [Maixner (2021)](https://doi.org/10.1016/j.cub.2021.09.031) (CI Test Data)
> Maixner, Frank, Mohamed S. Sarhan, Kun D. Huang, Adrian Tett, Alexander Schoenafinger, Stefania Zingale, Aitor Blanco-Míguez, et al. 2021. “Hallstatt Miners Consumed Blue Cheese and Beer during the Iron Age and Retained a Non-Westernized Gut Microbiome until the Baroque Period.” Current Biology: CB 31 (23): 514962.e6. doi: 10.1016/j.cub.2021.09.031.
- [Meslier (2022)](https://doi.org/10.1038/s41597-022-01762-z) (AWS Full Test data)
> Meslier, Victoria, Benoit Quinquis, Kévin Da Silva, Florian Plaza Oñate, Nicolas Pons, Hugo Roume, Mircea Podar, and Mathieu Almeida. 2022. “Benchmarking Second and Third-Generation Sequencing Platforms for Microbial Metagenomics.” Scientific Data 9 (1): 694. doi: 10.1038/s41597-022-01762-z.

@ -1,4 +1,4 @@
# ![nf-core/taxprofiler](docs/images/nf-core-taxprofiler_logo_light.png#gh-light-mode-only) ![nf-core/taxprofiler](docs/images/nf-core-taxprofiler_logo_dark.png#gh-dark-mode-only)
# ![nf-core/taxprofiler](docs/images/nf-core-taxprofiler_logo_custom_light.png#gh-light-mode-only) ![nf-core/taxprofiler](docs/images/nf-core-taxprofiler_logo_custom_dark.png#gh-dark-mode-only)
[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/taxprofiler/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)
@ -12,20 +12,33 @@
## Introduction
**nf-core/taxprofiler** is a bioinformatics pipeline that ...
<!-- TODO nf-core:
Complete this sentence with a 2-3 sentence summary of what types of data the pipeline ingests, a brief overview of the
major pipeline sections and the types of output it produces. You're giving an overview to someone new
to nf-core here, in 15-20 seconds. For an example, see https://github.com/nf-core/rnaseq/blob/master/README.md#introduction
-->
<!-- TODO nf-core: Include a figure that guides the user through the major workflow steps. Many nf-core
workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples. -->
<!-- TODO nf-core: Fill in short bullet-pointed list of the default steps in the pipeline -->
1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic classification and profiling of shotgun and long-read metagenomic data. It allows for in-parallel taxonomic identification of reads or taxonomic abundance estimation with multiple classification and profiling tools against multiple databases, produces standardised output tables.
## Pipeline summary
![](docs/images/taxprofiler_tube.png)
1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) or [`falco`](https://github.com/smithlabcode/falco) as an alternative option)
2. Performs optional read pre-processing
- Adapter clipping and merging (short-read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long-read: [porechop](https://github.com/rrwick/Porechop))
- Low complexity and quality filtering (short-read: [bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus); long-read: [Filtlong](https://github.com/rrwick/Filtlong))
- Host-read removal (short-read: [BowTie2](http://bowtie-bio.sourceforge.net/bowtie2/); long-read: [Minimap2](https://github.com/lh3/minimap2))
- Run merging
3. Supports statistics for host-read removal ([Samtools](http://www.htslib.org/))
4. Performs taxonomic classification and/or profiling using one or more of:
- [Kraken2](https://ccb.jhu.edu/software/kraken2/)
- [MetaPhlAn3](https://huttenhower.sph.harvard.edu/metaphlan/)
- [MALT](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/malt/)
- [DIAMOND](https://github.com/bbuchfink/diamond)
- [Centrifuge](https://ccb.jhu.edu/software/centrifuge/)
- [Kaiju](https://kaiju.binf.ku.dk/)
- [mOTUs](https://motu-tool.org/)
- [KrakenUniq](https://github.com/fbreitwieser/krakenuniq)
5. Perform optional post-processing with:
- [bracken](https://ccb.jhu.edu/software/bracken/)
6. Standardises output tables ([`Taxpasta`](https://taxpasta.readthedocs.io))
7. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
8. Plotting Kraken2, Centrifuge, Kaiju and MALT results ([`Krona`](https://hpc.nih.gov/apps/kronatools.html))
## Usage
@ -34,35 +47,44 @@
> to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline)
> with `-profile test` before running the workflow on actual data.
<!-- TODO nf-core: Describe the minimum required steps to execute the pipeline, e.g. how to prepare samplesheets.
Explain what rows and columns represent. For instance (please edit as appropriate):
First, prepare a samplesheet with your input data that looks as follows:
`samplesheet.csv`:
```csv
sample,fastq_1,fastq_2
CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
2612,run1,ILLUMINA,2612_run1_R1.fq.gz,,
2612,run2,ILLUMINA,2612_run2_R1.fq.gz,,
2612,run3,ILLUMINA,2612_run3_R1.fq.gz,2612_run3_R2.fq.gz,
```
Each row represents a fastq file (single-end) or a pair of fastq files (paired end).
Each row represents a fastq file (single-end), a pair of fastq files (paired end), or a fasta (with long reads).
-->
Additionally, you will need a database sheet that looks as follows:
Now, you can run the pipeline using:
`databases.csv`:
<!-- TODO nf-core: update the following command to include all required parameters for a minimal example -->
```
tool,db_name,db_params,db_path
kraken2,db2,--quick,/<path>/<to>/kraken2/testdb-kraken2.tar.gz
metaphlan3,db1,,/<path>/<to>/metaphlan3/metaphlan_database/
```
That includes directories or `.tar.gz` archives containing databases for the tools you wish to run the pipeline against.
Now, you can run the pipeline using:
```bash
nextflow run nf-core/taxprofiler \
-profile <docker/singularity/.../institute> \
--input samplesheet.csv \
--outdir <OUTDIR>
--databases databases.csv \
--outdir <OUTDIR> \
--run_kraken2 --run_metaphlan3
```
> **Warning:**
> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those
> Please provide pipeline parameters via the CLI (as above) or Nextflow `-params-file` option. Custom config files including those
> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;
> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files).
@ -70,7 +92,7 @@ For more details, please refer to the [usage documentation](https://nf-co.re/tax
## Pipeline output
To see the the results of a test run with a full size dataset refer to the [results](https://nf-co.re/taxprofiler/results) tab on the nf-core website pipeline page.
To see the results of a test run with a full size dataset refer to the [results](https://nf-co.re/taxprofiler/results) tab on the nf-core website pipeline page.
For more details about the output files and reports, please refer to the
[output documentation](https://nf-co.re/taxprofiler/output).
@ -78,9 +100,34 @@ For more details about the output files and reports, please refer to the
nf-core/taxprofiler was originally written by James A. Fellows Yates, Sofia Stamouli, Moritz E. Beber, and the nf-core/taxprofiler team.
We thank the following people for their extensive assistance in the development of this pipeline:
### Team
<!-- TODO nf-core: If applicable, make list of people who have also contributed -->
- [James A. Fellows Yates](https://github.com/jfy133)
- [Sofia Stamouli](https://github.com/sofstam)
- [Moritz E. Beber](https://github.com/Midnighter)
We thank the following people for their contributions to the development of this pipeline:
- [Lauri Mesilaakso](https://github.com/ljmesi)
- [Tanja Normark](https://github.com/talnor)
- [Maxime Borry](https://github.com/maxibor)
- [Thomas A. Christensen II](https://github.com/MillironX)
- [Jianhong Ou](https://github.com/jianhong)
- [Rafal Stepien](https://github.com/rafalstepien)
- [Mahwash Jamy](https://github.com/mjamy)
### Acknowledgments
We also are grateful for the feedback and comments from:
- The general [nf-core/community](https://nf-co.re/community)
And specifically to
- [Alex Hübner](https://github.com/alexhbnr)
- [Lily Andersson Lee](https://github.com/LilyAnderssonLee)
❤️ also goes to [Zandra Fagernäs](https://github.com/ZandraFagernas) for the logo.
## Contributions and Support
@ -90,10 +137,7 @@ For further information or help, don't hesitate to get in touch on the [Slack `#
## Citations
<!-- TODO nf-core: Add citation for pipeline after first release. Uncomment lines below and update Zenodo doi and badge at the top of this file. -->
<!-- If you use nf-core/taxprofiler for your analysis, please cite it using the following doi: [10.5281/zenodo.XXXXXX](https://doi.org/10.5281/zenodo.XXXXXX) -->
<!-- TODO nf-core: Add bibliography of tools and data used in your pipeline -->
If you use nf-core/taxprofiler for your analysis, please cite it using the following doi: [10.5281/zenodo.7728364](https://doi.org/10.5281/zenodo.7728364)
An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.

@ -11,3 +11,271 @@ report_section_order:
order: -1002
export_plots: true
custom_logo: "nf-core-taxprofiler_logo_custom_light.png"
custom_logo_url: https://nf-co.re/taxprofiler
custom_logo_title: "nf-core/taxprofiler"
run_modules:
- fastqc
- adapterRemoval
- fastp
- bbduk
- prinseqplusplus
- porechop
- filtlong
- bowtie2
- minimap2
- samtools
- kraken
- kaiju
- metaphlan
- diamond
- malt
- motus
- custom_content
sp:
diamond:
fn_re: ".*.diamond.log$"
fastqc/data:
fn_re: ".*(fastqc|falco)_data.txt$"
fastqc/zip:
fn: "*_fastqc.zip"
top_modules:
- "fastqc":
name: "FastQC / Falco (pre-Trimming)"
path_filters:
- "*raw*"
path_filters_exclude:
- "*processed*"
extra: "If used in this run, Falco is a drop-in replacement for FastQC producing the same output, written by Guilherme de Sena Brandine and Andrew D. Smith."
- "fastqc":
name: "FastQC / Falco (post-Trimming)"
path_filters:
- "*processed*"
path_filters_exclude:
- "*raw*"
extra: "If used in this run, Falco is a drop-in replacement for FastQC producing the same output, written by Guilherme de Sena Brandine and Andrew D. Smith."
- "fastp"
- "adapterRemoval"
- "porechop":
extra: ": if you get the error message 'Error - was not able to plot data.' this means that porechop did not detect any adapters and therefore no statistics generated."
- "bbduk"
- "prinseqplusplus"
- "filtlong"
- "bowtie2":
name: "bowtie2"
- "samtools":
name: "Samtools Stats"
- "kraken":
name: "Kraken"
path_filters:
- "*.kraken2.kraken2.report.txt"
- "kraken":
name: "Bracken"
anchor: "bracken"
target: "Bracken"
doi: "10.7717/peerj-cs.104"
info: "Estimates species abundances in metagenomics samples by probabilistically re-distributing reads in the taxonomic tree."
extra: ": plot title will say Kraken2 due to the first step of bracken producing the same output format as Kraken. Abundance information is currently not supported in MultiQC."
path_filters:
- "*.bracken.kraken2.report.txt"
- "kraken":
name: "Centrifuge"
anchor: "centrifuge"
target: "Centrifuge"
doi: "10.1101/gr.210641.116"
info: "is a very rapid and memory-efficient system for the classification of DNA sequences from microbial samples. The system uses a novel indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index. Note: Figure title"
extra: ": plot title will say Kraken2 due to Centrifuge producing the same output format as Kraken. If activated, see the actual Kraken2 results in the section above."
path_filters:
- "*.centrifuge.txt"
- "malt":
name: "MALT"
- "diamond"
- "kaiju":
name: "Kaiju"
- "motus"
#It is not possible to set placement for custom kraken and centrifuge columns.
table_columns_placement:
FastQC / Falco (pre-Trimming):
total_sequences: 100
avg_sequence_length: 110
median_sequence_length: 120
percent_duplicates: 130
percent_gc: 140
percent_fails: 150
FastQC / Falco (post-Trimming):
total_sequences: 200
avg_sequence_length: 210
median_sequence_length: 220
percent_duplicates: 230
percent_gc: 240
percent_fails: 250
fastp:
pct_adapter: 300
pct_surviving: 310
pct_duplication: 320
after_filtering_gc_content: 330
after_filtering_q30_rate: 340
after_filtering_q30_bases: 350
filtering_result_passed_filter_reads: 360
Adapter Removal:
aligned_total: 360
percent_aligned: 370
percent_collapsed: 380
percent_discarded: 390
Porechop:
Input Reads: 400
Start Trimmed: 410
Start Trimmed Percent: 420
End Trimmed: 430
End Trimmed Percent: 440
Middle Split: 450
Middle Split Percent: 460
Filtlong:
Target bases: 500
BBDuk:
Input reads: 800
Total Removed bases percent: 810
Total Removed bases: 820
Total Removed reads percent: 830
Total Removed reads: 840
PRINSEQ++:
prinseqplusplus_total: 900
bowtie2:
overall_alignment_rate: 1000
Samtools Stats:
raw_total_sequences: 1100
reads_mapped: 1110
reads_mapped_percent: 1120
reads_properly_paired_percent: 1130
non-primary_alignments: 1140
reads_MQ0_percent: 1150
error_rate: 1160
Bracken:
"% Unclassified": 1200
"% Top 5": 1210
Centrifuge:
"% Unclassified": 1300
"% Top 5": 1310
DIAMOND:
queries_aligned: 1400
Kaiju:
assigned: 1500
"% Assigned": 1510
"% Unclassified": 1520
Kraken:
"% Unclassified": 1600
"% Top 5": 1610
MALT:
"Num. of queries": 1700
Total reads: 1710
Mappability: 1720
Assig. Taxonomy: 1730
Taxonomic assignment success: 1740
motus:
Total number of reads: 1800
Number of reads after filtering: 1810
Total number of inserts: 1820
Unique mappers: 1830
Multiple mappers: 1840
Ignored multiple mapper without unique hit: 1850
"Number of ref-mOTUs": 1860
"Number of meta-mOTUs": 1870
"Number of ext-mOTUs": 1880
table_columns_visible:
FastQC / Falco (pre-Trimming):
total_sequences: True
avg_sequence_length: True
percent_duplicates: True
percent_gc: True
percent_fails: False
FastQC / Falco (post-Trimming):
total_sequences: True
avg_sequence_length: True
percent_duplicates: False
percent_gc: False
percent_fails: False
porechop:
Input reads: False
Start Trimmed:
Start Trimmed Percent: True
End Trimmed: False
End Trimmed Percent: True
Middle Split: False
Middle Split Percent: True
fastp:
pct_adapter: True
pct_surviving: True
pct_duplication: False
after_filtering_gc_content: False
after_filtering_q30_rate: False
after_filtering_q30_bases: False
Filtlong:
Target bases: True
Adapter Removal:
aligned_total: True
percent_aligned: True
percent_collapsed: True
percent_discarded: False
BBDuk:
Input reads: False
Total Removed bases Percent: False
Total Removed bases: False
Total Removed reads percent: True
Total Removed reads: False
"PRINSEQ++":
prinseqplusplus_total: True
bowtie2:
overall_alignment_rate: True
Samtools Stats:
raw_total_sequences: True
reads_mapped: True
reads_mapped_percent: True
reads_properly_paired_percent: False
non-primary_alignments: False
reads_MQ0_percent: False
error_rate: False
Kraken: False
Bracken: False
Centrifuge: False
DIAMOND: False
Kaiju: False
MALT: False
motus: False
table_columns_name:
FastQC / Falco (pre-Trimming):
total_sequences: "Nr. Input Reads"
avg_sequence_length: "Length Input Reads"
percent_gc: "% GC Input Reads"
percent_duplicates: "% Dups Input Reads"
percent_fails: "% Failed Input Reads"
FastQC / Falco (post-Trimming):
total_sequences: "Nr. Processed Reads"
avg_sequence_length: "Length Processed Reads"
percent_gc: "% GC Processed Reads"
percent_duplicates: "% Dups Processed Reads"
percent_fails: "% Failed Processed Reads"
Samtools Stats:
raw_total_sequences: "Nr. Reads Into Mapping"
reads_mapped: "Nr. Mapped Reads"
reads_mapped_percent: "% Mapped Reads"
extra_fn_clean_exts:
- "kraken2.report.txt"
- ".txt"
- ".settings"
- ".bbduk"
- ".unmapped"
- "_filtered"
- type: remove
pattern: "_falco"
section_comments:
general_stats: "By default, all read count columns are displayed as millions (M) of reads."

@ -1,3 +1,6 @@
sample,fastq_1,fastq_2
SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz
SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
2611,ERR5766174,ILLUMINA,,,/<path>/<to>/fasta/ERX5474930_ERR5766174_1.fa.gz
2612,ERR5766176,ILLUMINA,/<path>/<to>/fastq/ERX5474932_ERR5766176_1.fastq.gz,/<path>/<to>/fastq/ERX5474932_ERR5766176_2.fastq.gz,
2612,ERR5766180,ILLUMINA,/<path>/<to>/fastq/ERX5474936_ERR5766180_1.fastq.gz,,
2613,ERR5766181,ILLUMINA,/<path>/<to>/fastq/ERX5474937_ERR5766181_1.fastq.gz,/<path>/<to>/fastq/ERX5474937_ERR5766181_2.fastq.gz,
ERR3201952,ERR3201952,OXFORD_NANOPORE,/<path>/<to>/fastq/ERR3201952.fastq.gz,,

1 sample run_accession instrument_platform fastq_1 fastq_2 fasta
2 SAMPLE_PAIRED_END 2611 ERR5766174 ILLUMINA /path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz /path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz /<path>/<to>/fasta/ERX5474930_ERR5766174_1.fa.gz
3 SAMPLE_SINGLE_END 2612 ERR5766176 ILLUMINA /path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz /<path>/<to>/fastq/ERX5474932_ERR5766176_1.fastq.gz /<path>/<to>/fastq/ERX5474932_ERR5766176_2.fastq.gz
4 2612 ERR5766180 ILLUMINA /<path>/<to>/fastq/ERX5474936_ERR5766180_1.fastq.gz
5 2613 ERR5766181 ILLUMINA /<path>/<to>/fastq/ERX5474937_ERR5766181_1.fastq.gz /<path>/<to>/fastq/ERX5474937_ERR5766181_2.fastq.gz
6 ERR3201952 ERR3201952 OXFORD_NANOPORE /<path>/<to>/fastq/ERR3201952.fastq.gz

@ -1,258 +1,228 @@
#!/usr/bin/env python
"""Provide a command line tool to validate and transform tabular samplesheets."""
import argparse
import csv
import logging
from distutils import extension
import os
import sys
from collections import Counter
from pathlib import Path
logger = logging.getLogger()
class RowChecker:
"""
Define a service that can validate and transform each given row.
Attributes:
modified (list): A list of dicts, where each dict corresponds to a previously
validated and transformed row. The order of rows is maintained.
"""
VALID_FORMATS = (
".fq.gz",
".fastq.gz",
)
def __init__(
self,
sample_col="sample",
first_col="fastq_1",
second_col="fastq_2",
single_col="single_end",
**kwargs,
):
"""
Initialize the row checker with the expected column names.
Args:
sample_col (str): The name of the column that contains the sample name
(default "sample").
first_col (str): The name of the column that contains the first (or only)
FASTQ file path (default "fastq_1").
second_col (str): The name of the column that contains the second (if any)
FASTQ file path (default "fastq_2").
single_col (str): The name of the new column that will be inserted and
records whether the sample contains single- or paired-end sequencing
reads (default "single_end").
"""
super().__init__(**kwargs)
self._sample_col = sample_col
self._first_col = first_col
self._second_col = second_col
self._single_col = single_col
self._seen = set()
self.modified = []
def validate_and_transform(self, row):
"""
Perform all validations on the given row and insert the read pairing status.
Args:
row (dict): A mapping from column headers (keys) to elements of that row
(values).
"""
self._validate_sample(row)
self._validate_first(row)
self._validate_second(row)
self._validate_pair(row)
self._seen.add((row[self._sample_col], row[self._first_col]))
self.modified.append(row)
def _validate_sample(self, row):
"""Assert that the sample name exists and convert spaces to underscores."""
if len(row[self._sample_col]) <= 0:
raise AssertionError("Sample input is required.")
# Sanitize samples slightly.
row[self._sample_col] = row[self._sample_col].replace(" ", "_")
def _validate_first(self, row):
"""Assert that the first FASTQ entry is non-empty and has the right format."""
if len(row[self._first_col]) <= 0:
raise AssertionError("At least the first FASTQ file is required.")
self._validate_fastq_format(row[self._first_col])
def _validate_second(self, row):
"""Assert that the second FASTQ entry has the right format if it exists."""
if len(row[self._second_col]) > 0:
self._validate_fastq_format(row[self._second_col])
def _validate_pair(self, row):
"""Assert that read pairs have the same file extension. Report pair status."""
if row[self._first_col] and row[self._second_col]:
row[self._single_col] = False
first_col_suffix = Path(row[self._first_col]).suffixes[-2:]
second_col_suffix = Path(row[self._second_col]).suffixes[-2:]
if first_col_suffix != second_col_suffix:
raise AssertionError("FASTQ pairs must have the same file extensions.")
else:
row[self._single_col] = True
def _validate_fastq_format(self, filename):
"""Assert that a given filename has one of the expected FASTQ extensions."""
if not any(filename.endswith(extension) for extension in self.VALID_FORMATS):
raise AssertionError(
f"The FASTQ file has an unrecognized extension: {filename}\n"
f"It should be one of: {', '.join(self.VALID_FORMATS)}"
)
def validate_unique_samples(self):
"""
Assert that the combination of sample name and FASTQ filename is unique.
In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the
number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment.
"""
if len(self._seen) != len(self.modified):
raise AssertionError("The pair of sample name and FASTQ must be unique.")
seen = Counter()
for row in self.modified:
sample = row[self._sample_col]
seen[sample] += 1
row[self._sample_col] = f"{sample}_T{seen[sample]}"
def read_head(handle, num_lines=10):
"""Read the specified number of lines from the current position in the file."""
lines = []
for idx, line in enumerate(handle):
if idx == num_lines:
break
lines.append(line)
return "".join(lines)
import errno
import argparse
def sniff_format(handle):
"""
Detect the tabular format.
def parse_args(args=None):
Description = "Reformat nf-core/taxprofiler samplesheet file and check its contents."
Args:
handle (text file): A handle to a `text file`_ object. The read position is
expected to be at the beginning (index 0).
Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"
Returns:
csv.Dialect: The detected tabular format.
parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument("FILE_IN", help="Input samplesheet file.")
parser.add_argument("FILE_OUT", help="Output file.")
return parser.parse_args(args)
.. _text file:
https://docs.python.org/3/glossary.html#term-text-file
"""
peek = read_head(handle)
handle.seek(0)
sniffer = csv.Sniffer()
dialect = sniffer.sniff(peek)
return dialect
def make_dir(path):
if len(path) > 0:
try:
os.makedirs(path)
except OSError as exception:
if exception.errno != errno.EEXIST:
raise exception
def print_error(error, context="Line", context_str=""):
error_str = "ERROR: Please check samplesheet -> {}".format(error)
if context != "" and context_str != "":
error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
error, context.strip(), context_str.strip()
)
print(error_str)
sys.exit(1)
def check_samplesheet(file_in, file_out):
"""
Check that the tabular samplesheet has the structure expected by nf-core pipelines.
Validate the general shape of the table, expected columns, and each row. Also add
an additional column which records whether one or two FASTQ reads were found.
Args:
file_in (pathlib.Path): The given tabular samplesheet. The format can be either
CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``.
file_out (pathlib.Path): Where the validated and transformed samplesheet should
be created; always in CSV format.
Example:
This function checks that the samplesheet follows the following structure,
see also the `viral recon samplesheet`_::
sample,fastq_1,fastq_2
SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
.. _viral recon samplesheet:
https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
This function checks that the samplesheet follows the following structure:
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz
2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,
2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,,
2613,ERR5766181,ILLUMINA,ERX5474937_ERR5766181_1.fastq.gz,ERX5474937_ERR5766181_2.fastq.gz,
"""
required_columns = {"sample", "fastq_1", "fastq_2"}
# See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
with file_in.open(newline="") as in_handle:
reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
# Validate the existence of the expected header columns.
if not required_columns.issubset(reader.fieldnames):
req_cols = ", ".join(required_columns)
logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.")
sys.exit(1)
# Validate each row.
checker = RowChecker()
for i, row in enumerate(reader):
try:
checker.validate_and_transform(row)
except AssertionError as error:
logger.critical(f"{str(error)} On line {i + 2}.")
sys.exit(1)
checker.validate_unique_samples()
header = list(reader.fieldnames)
header.insert(1, "single_end")
# See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
with file_out.open(mode="w", newline="") as out_handle:
writer = csv.DictWriter(out_handle, header, delimiter=",")
writer.writeheader()
for row in checker.modified:
writer.writerow(row)
def parse_args(argv=None):
"""Define and immediately parse command line arguments."""
parser = argparse.ArgumentParser(
description="Validate and transform a tabular samplesheet.",
epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv",
)
parser.add_argument(
"file_in",
metavar="FILE_IN",
type=Path,
help="Tabular input samplesheet in CSV or TSV format.",
FQ_EXTENSIONS = (".fq.gz", ".fastq.gz")
FA_EXTENSIONS = (
".fa.gz",
".fasta.gz",
".fna.gz",
".fas.gz",
)
parser.add_argument(
"file_out",
metavar="FILE_OUT",
type=Path,
help="Transformed output samplesheet in CSV format.",
)
parser.add_argument(
"-l",
"--log-level",
help="The desired log level (default WARNING).",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"),
default="WARNING",
)
return parser.parse_args(argv)
INSTRUMENT_PLATFORMS = [
"ABI_SOLID",
"BGISEQ",
"CAPILLARY",
"COMPLETE_GENOMICS",
"DNBSEQ",
"HELICOS",
"ILLUMINA",
"ION_TORRENT",
"LS454",
"OXFORD_NANOPORE",
"PACBIO_SMRT",
]
sample_mapping_dict = {}
with open(file_in, "r") as fin:
## Check header
MIN_COLS = 4
HEADER = [
"sample",
"run_accession",
"instrument_platform",
"fastq_1",
"fastq_2",
"fasta",
]
header = [x.strip('"') for x in fin.readline().strip().split(",")]
## Check for missing mandatory columns
missing_columns = list(set(HEADER) - set(header))
if len(missing_columns) > 0:
print(
"ERROR: Missing required column header -> {}. Note some columns can otherwise be empty. See pipeline documentation (https://nf-co.re/taxprofiler/usage).".format(
",".join(missing_columns)
)
)
sys.exit(1)
def main(argv=None):
"""Coordinate argument parsing and program execution."""
args = parse_args(argv)
logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s")
if not args.file_in.is_file():
logger.error(f"The given input file {args.file_in} was not found!")
sys.exit(2)
args.file_out.parent.mkdir(parents=True, exist_ok=True)
check_samplesheet(args.file_in, args.file_out)
## Find locations of mandatory columns
header_locs = {}
for i in HEADER:
header_locs[i] = header.index(i)
## Check sample entries
for line in fin:
## Pull out only relevant columns for downstream checking
line_parsed = [x.strip().strip('"') for x in line.strip().split(",")]
# Check valid number of columns per row
if len(line_parsed) < len(HEADER):
print_error(
"Invalid number of columns (minimum = {})!".format(len(HEADER)),
"Line",
line,
)
num_cols = len([x for x in line_parsed if x])
if num_cols < MIN_COLS:
print_error(
"Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
"Line",
line,
)
lspl = [line_parsed[i] for i in header_locs.values()]
## Check sample name entries
(
sample,
run_accession,
instrument_platform,
fastq_1,
fastq_2,
fasta,
) = lspl[: len(HEADER)]
sample = sample.replace(" ", "_")
if not sample:
print_error("Sample entry has not been specified!", "Line", line)
## Check FastQ file extension
for fastq in [fastq_1, fastq_2]:
if fastq:
if fastq.find(" ") != -1:
print_error("FastQ file contains spaces!", "Line", line)
if not fastq.endswith(FQ_EXTENSIONS):
print_error(
f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !",
"Line",
line,
)
if fasta:
if fasta.find(" ") != -1:
print_error("FastA file contains spaces!", "Line", line)
if not fasta.endswith(FA_EXTENSIONS):
print_error(
f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!",
"Line",
line,
)
sample_info = []
# Check run_accession
if not run_accession:
print_error("Run accession has not been specified!", "Line", line)
else:
sample_info.append(run_accession)
# Check instrument_platform
if not instrument_platform:
print_error("Instrument platform has not been specified!", "Line", line)
else:
if instrument_platform not in INSTRUMENT_PLATFORMS:
print_error(
f"Instrument platform {instrument_platform} is not supported! "
f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}",
"Line",
line,
)
sample_info.append(instrument_platform)
## Auto-detect paired-end/single-end
if sample and fastq_1 and fastq_2: ## Paired-end short reads
sample_info.extend(["0", fastq_1, fastq_2, fasta])
elif sample and fastq_1 and not fastq_2: ## Single-end short/long fastq reads
sample_info.extend(["1", fastq_1, fastq_2, fasta])
elif sample and fasta and not fastq_1 and not fastq_2: ## Single-end long reads
sample_info.extend(["1", fastq_1, fastq_2, fasta])
elif fasta and (fastq_1 or fastq_2):
print_error(
"FastQ and FastA files cannot be specified together in the same library!",
"Line",
line,
)
else:
print_error("Invalid combination of columns provided!", "Line", line)
## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, single_end, fastq_1, fastq_2 , fasta ] }
if sample not in sample_mapping_dict:
sample_mapping_dict[sample] = [sample_info]
else:
if sample_info in sample_mapping_dict[sample]:
print_error("Samplesheet contains duplicate rows!", "Line", line)
else:
sample_mapping_dict[sample].append(sample_info)
## Write validated samplesheet with appropriate columns
HEADER_OUT = [
"sample",
"run_accession",
"instrument_platform",
"single_end",
"fastq_1",
"fastq_2",
"fasta",
]
if len(sample_mapping_dict) > 0:
out_dir = os.path.dirname(file_out)
make_dir(out_dir)
with open(file_out, "w") as fout:
fout.write(",".join(HEADER_OUT) + "\n")
for sample in sorted(sample_mapping_dict.keys()):
for idx, val in enumerate(sample_mapping_dict[sample]):
fout.write(f"{sample},{','.join(val)}\n")
else:
print_error("No entries to process!", "Samplesheet: {}".format(file_in))
def main(args=None):
args = parse_args(args)
check_samplesheet(args.FILE_IN, args.FILE_OUT)
if __name__ == "__main__":

@ -10,7 +10,6 @@
process {
// TODO nf-core: Check the defaults for all processes
cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
@ -24,11 +23,10 @@ process {
// These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
// If possible, it would be nice to keep the same label naming convention when
// adding in your local modules too.
// TODO nf-core: Customise requirements for specific processes.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
withLabel:process_single {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
memory = { check_max( 1.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
withLabel:process_low {
@ -62,4 +60,27 @@ process {
withName:CUSTOM_DUMPSOFTWAREVERSIONS {
cache = false
}
withName: BRACKEN_BRACKEN {
errorStrategy = 'ignore'
}
withName: CENTRIFUGE_KREPORT {
errorStrategy = {task.exitStatus == 255 ? 'ignore' : 'retry'}
}
withName: KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE {
errorStrategy = { task.exitStatus in [255,1] ? 'ignore' : 'retry' }
}
withName: MEGAN_RMA2INFO_TSV {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
withName: MEGAN_RMA2INFO_KRONA {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
}

@ -12,22 +12,647 @@
process {
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
withName: SAMPLESHEET_CHECK {
withName: FASTQC {
ext.args = '--quiet'
ext.prefix = { "${meta.id}_${meta.run_accession}_raw" }
publishDir = [
path: { "${params.outdir}/pipeline_info" },
path: { "${params.outdir}/fastqc/raw" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
pattern: '*.{html,zip}'
]
}
withName: FASTQC {
withName: FASTQC_PROCESSED {
ext.args = '--quiet'
ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
publishDir = [
path: { "${params.outdir}/fastqc/processed" },
mode: params.publish_dir_mode,
pattern: '*.{html,zip}'
]
}
withName: FALCO {
ext.prefix = { "${meta.id}_${meta.run_accession}_raw_falco" }
publishDir = [
path: { "${params.outdir}/falco/raw" },
mode: params.publish_dir_mode,
pattern: '*.{html,txt,zip}'
]
}
withName: FALCO_PROCESSED {
ext.prefix = { "${meta.id}_${meta.run_accession}_processed_falco" }
publishDir = [
path: { "${params.outdir}/falco/processed" },
mode: params.publish_dir_mode,
pattern: '*.{html,txt,zip}'
]
}
withName: FASTP_SINGLE {
ext.args = [
// trimming options
params.shortread_qc_skipadaptertrim ? "--disable_adapter_trimming" : "",
params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter1 ? "--adapter_sequence ${params.shortread_qc_adapter1}" : "",
// filtering options
"--length_required ${params.shortread_qc_minlength}",
(params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp') ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
].join(' ').trim()
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
[
path: { "${params.outdir}/fastp" },
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.save_preprocessed_reads
],
[
path: { "${params.outdir}/fastp" },
mode: params.publish_dir_mode,
pattern: '*.{log,html,json}'
],
[
path: { "${params.outdir}/analysis_ready_fastqs" },
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.save_analysis_ready_fastqs,
// Don't know why `!` doesn't work here, but `== false` makes it work...
saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null }
]
]
}
withName: FASTP_PAIRED {
ext.args = [
// collapsing options - option to retain singletons
params.shortread_qc_includeunmerged ? '--include_unmerged' : '',
// trimming options
params.shortread_qc_skipadaptertrim ? "--disable_adapter_trimming" : "",
params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter1 ? "--adapter_sequence ${params.shortread_qc_adapter1}" : "",
params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter2 ? "--adapter_sequence_r2 ${params.shortread_qc_adapter2}" : "--detect_adapter_for_pe",
// filtering options
"--length_required ${params.shortread_qc_minlength}",
params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
].join(' ').trim()
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
[
path: { "${params.outdir}/fastp" },
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.save_preprocessed_reads
],
[
path: { "${params.outdir}/fastp" },
mode: params.publish_dir_mode,
pattern: '*.{log,html,json}'
],
[
path: { "${params.outdir}/analysis_ready_fastqs" },
mode: params.publish_dir_mode,
pattern: params.shortread_qc_mergepairs ? '*merged.fastq.gz' : '*.fastp.fastq.gz',
enabled: params.save_analysis_ready_fastqs,
saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null }
]
]
}
withName: ADAPTERREMOVAL_SINGLE {
ext.args = [
// trimming options
params.shortread_qc_skipadaptertrim ? "--adapter1 ''" : params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter1 ? "--adapter1 ${params.shortread_qc_adapter1}" : "",
// filtering options
"--minlength ${params.shortread_qc_minlength}"
].join(' ').trim()
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
[
path: { "${params.outdir}/adapterremoval" },
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.save_preprocessed_reads
],
[
path: { "${params.outdir}/adapterremoval" },
mode: params.publish_dir_mode,
pattern: '*.settings'
],
[
path: { "${params.outdir}/analysis_ready_fastqs" },
mode: params.publish_dir_mode,
pattern: '*truncated.fastq.gz',
enabled: params.save_analysis_ready_fastqs,
saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null }
]
]
}
withName: ADAPTERREMOVAL_PAIRED {
ext.args = [
// collapsing options
params.shortread_qc_mergepairs ? "--collapse" : "",
// trimming options
params.shortread_qc_skipadaptertrim ? "--adapter1 ''" : params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter1 ? "--adapter1 ${params.shortread_qc_adapter1}" : "", // adding adapter list happens at module input channel level
params.shortread_qc_skipadaptertrim ? "--adapter2 ''" : params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter2 ? "--adapter2 ${params.shortread_qc_adapter2}" : "",
// filtering options
"--minlength ${params.shortread_qc_minlength}"
].join(' ').trim()
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
[
path: { "${params.outdir}/adapterremoval" },
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.save_preprocessed_reads
],
[
path: { "${params.outdir}/adapterremoval" },
mode: params.publish_dir_mode,
pattern: '*.settings'
],
[
path: { "${params.outdir}/analysis_ready_fastqs" },
mode: params.publish_dir_mode,
pattern: '*{truncated.fastq,singleton.truncated}.gz',
enabled: params.save_analysis_ready_fastqs,
saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && !params.shortread_qc_mergepairs && params.save_analysis_ready_fastqs ? it : null}
]
]
}
// AdapterRemoval separate output merging
withName: CAT_FASTQ {
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
[
path: { "${params.outdir}/analysis_ready_fastqs" },
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.save_analysis_ready_fastqs,
saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null }
]
]
}
withName: PORECHOP_PORECHOP {
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
[
path: { "${params.outdir}/porechop" },
mode: params.publish_dir_mode,
pattern: '*_porechopped.fastq.gz',
enabled: params.save_preprocessed_reads
],
[
path: { "${params.outdir}/porechop" },
mode: params.publish_dir_mode,
pattern: '*.log'
],
[
path: { "${params.outdir}/analysis_ready_fastqs" },
mode: params.publish_dir_mode,
pattern: '*_porechopped.fastq.gz',
enabled: params.save_analysis_ready_fastqs,
saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_longread_hostremoval && params.longread_qc_skipqualityfilter && !params.longread_qc_skipadaptertrim && params.perform_longread_qc && params.save_analysis_ready_fastqs ? it : null }
]
]
}
withName: FILTLONG {
ext.args = [
"--min_length ${params.longread_qc_qualityfilter_minlength}",
"--keep_percent ${params.longread_qc_qualityfilter_keeppercent}",
"--target_bases ${params.longread_qc_qualityfilter_targetbases}"
]
.join(' ').trim()
ext.prefix = { "${meta.id}_${meta.run_accession}_filtered" }
publishDir = [
[
path: { "${params.outdir}/filtlong" },
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.save_preprocessed_reads
],
[
path: { "${params.outdir}/filtlong" },
mode: params.publish_dir_mode,
pattern: '*.log'
],
[
path: { "${params.outdir}/analysis_ready_fastqs" },
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.save_analysis_ready_fastqs,
saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_longread_hostremoval && !params.longread_qc_skipqualityfilter && params.perform_longread_qc && params.save_analysis_ready_fastqs ? it : null }
]
]
}
withName: BBMAP_BBDUK {
ext.args = [
"entropy=${params.shortread_complexityfilter_entropy}",
"entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}",
params.shortread_complexityfilter_bbduk_mask ? "entropymask=t" : "entropymask=f"
].join(' ').trim()
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
[
path: { "${params.outdir}/bbduk/" },
mode: params.publish_dir_mode,
pattern: '*.{fastq.gz}',
enabled: params.save_complexityfiltered_reads
],
[
path: { "${params.outdir}/bbduk/" },
mode: params.publish_dir_mode,
pattern: '*.log'
],
[
path: { "${params.outdir}/analysis_ready_fastqs" },
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.save_analysis_ready_fastqs,
saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && params.shortread_complexityfilter_tool && params.save_analysis_ready_fastqs ? it : null }
]
]
}
withName: PRINSEQPLUSPLUS {
ext.args = [
params.shortread_complexityfilter_prinseqplusplus_mode == 'dust' ? "-lc_dust=${params.shortread_complexityfilter_prinseqplusplus_dustscore}" : "-lc_entropy=${params.shortread_complexityfilter_entropy}",
"-trim_qual_left=0 -trim_qual_left=0 -trim_qual_window=0 -trim_qual_step=0",
].join(' ').trim()
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
[
path: { "${params.outdir}/prinseqplusplus/" },
mode: params.publish_dir_mode,
pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz}',
enabled: params.save_complexityfiltered_reads
],
[
path: { "${params.outdir}/prinseqplusplus/" },
mode: params.publish_dir_mode,
pattern: '*.log'
],
[
path: { "${params.outdir}/analysis_ready_fastqs" },
mode: params.publish_dir_mode,
pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz}',
enabled: params.save_analysis_ready_fastqs,
saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && params.shortread_complexityfilter_tool && params.save_analysis_ready_fastqs ? it : null }
]
]
}
withName: BOWTIE2_BUILD {
publishDir = [
[
path: { "${params.outdir}/bowtie2/build" },
mode: params.publish_dir_mode,
pattern: 'bowtie2',
enabled: params.save_hostremoval_index
]
]
}
// Saving unmapped reads as FQ comes via input channel!
withName: BOWTIE2_ALIGN {
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
[
path: { "${params.outdir}/bowtie2/align" },
mode: params.publish_dir_mode,
pattern: '*.log'
],
[
path: { "${params.outdir}/bowtie2/align" },
mode: params.publish_dir_mode,
pattern: '*.bam',
enabled: params.save_hostremoval_bam
],
[
path: { "${params.outdir}/bowtie2/align" },
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.save_hostremoval_unmapped
],
[
path: { "${params.outdir}/analysis_ready_fastqs" },
mode: params.publish_dir_mode,
enabled: params.perform_shortread_hostremoval,
pattern: '*.fastq.gz',
enabled: params.save_analysis_ready_fastqs,
saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && params.perform_shortread_hostremoval && params.save_analysis_ready_fastqs ? it : null }
]
]
}
withName: MINIMAP2_INDEX {
ext.args = '-x map-ont'
publishDir = [
path: { "${params.outdir}/minimap2/index" },
mode: params.publish_dir_mode,
pattern: '*.mmi',
enabled: params.save_hostremoval_index
]
}
withName: MINIMAP2_ALIGN {
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
path: { "${params.outdir}/minimap2/align" },
mode: params.publish_dir_mode,
pattern: '*.bam',
enabled: params.save_hostremoval_bam
]
}
withName: SAMTOOLS_VIEW {
ext.args = '-f 4'
ext.prefix = { "${meta.id}_${meta.run_accession}.unmapped" }
}
withName: SAMTOOLS_FASTQ {
ext.prefix = { "${meta.id}_${meta.run_accession}.unmapped" }
publishDir = [
[
path: { "${params.outdir}/samtools/fastq" },
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.save_hostremoval_unmapped
],
[
path: { "${params.outdir}/analysis_ready_fastqs" },
mode: params.publish_dir_mode,
pattern: '*.fq.gz',
enabled: params.save_analysis_ready_fastqs,
saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun) ) && params.perform_longread_hostremoval && params.save_analysis_ready_fastqs ? it : null }
]
]
}
withName: SAMTOOLS_STATS {
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
path: { "${params.outdir}/samtools/stats" },
mode: params.publish_dir_mode,
pattern: '*stats'
]
}
withName: MERGE_RUNS {
ext.prefix = { "${meta.id}" }
publishDir = [
[
path: { "${params.outdir}/run_merging/" },
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.save_runmerged_reads
],
[
path: { "${params.outdir}/analysis_ready_fastqs" },
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.perform_runmerging && params.save_analysis_ready_fastqs
]
]
}
withName: MALT_RUN {
ext.args = { "${meta.db_params} -m ${params.malt_mode}" }
// one run with multiple samples, so fix ID to just db name to ensure clean log name
ext.prefix = { "${meta.db_name}" }
publishDir = [
path: { "${params.outdir}/malt/${meta.db_name}/" },
mode: params.publish_dir_mode,
pattern: '*.{rma6,log,sam}'
]
}
withName: 'MEGAN_RMA2INFO_TSV' {
ext.args = "-c2c Taxonomy"
ext.prefix = { "${meta.id}" }
publishDir = [
path: { "${params.outdir}/malt/${meta.db_name}/" },
mode: params.publish_dir_mode,
pattern: '*.{txt.gz,megan}'
]
}
withName: KRAKEN2_KRAKEN2 {
ext.args = params.kraken2_save_minimizers ? { "${meta.db_params} --report-minimizer-data" } : { "${meta.db_params}" }
ext.prefix = params.perform_runmerging ? { meta.tool == "bracken" ? "${meta.id}_${meta.db_name}.bracken" : "${meta.id}_${meta.db_name}.kraken2" } : { meta.tool == "bracken" ? "${meta.id}_${meta.run_accession}_${meta.db_name}.bracken" : "${meta.id}_${meta.run_accession}_${meta.db_name}.kraken2" }
publishDir = [
path: { "${params.outdir}/kraken2/${meta.db_name}/" },
mode: params.publish_dir_mode,
pattern: '*.{txt,fastq.gz}'
]
}
withName: BRACKEN_BRACKEN {
ext.args = { "${meta.db_params}" }
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.bracken" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.bracken" }
publishDir = [
path: { "${params.outdir}/bracken/${meta.db_name}/" },
mode: params.publish_dir_mode,
pattern: '*.tsv'
]
}
withName: BRACKEN_COMBINEBRACKENOUTPUTS {
ext.prefix = { "bracken_${meta.id}_combined_reports" }
publishDir = [
path: { "${params.outdir}/bracken/" },
mode: params.publish_dir_mode,
pattern: '*.txt'
]
}
withName: KRAKENTOOLS_COMBINEKREPORTS_KRAKEN {
ext.prefix = { "kraken2_${meta.id}_combined_reports" }
publishDir = [
path: { "${params.outdir}/kraken2/" },
mode: params.publish_dir_mode,
pattern: '*.txt'
]
}
withName: KRAKENUNIQ_PRELOADEDKRAKENUNIQ {
ext.args = { "${meta.db_params}" }
// one run with multiple samples, so fix ID to just db name to ensure clean log name
ext.prefix = { "${meta.db_name}.krakenuniq" }
publishDir = [
path: { "${params.outdir}/krakenuniq/${meta.db_name}/" },
mode: params.publish_dir_mode,
pattern: '*.{txt,fastq.gz}'
]
}
withName: KRONA_CLEANUP {
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}" }
publishDir = [
path: { "${params.outdir}/krona/" },
mode: params.publish_dir_mode,
pattern: '*.{html}'
]
}
withName: KRONA_KTIMPORTTEXT {
ext.prefix = { "${meta.tool}_${meta.id}" }
publishDir = [
path: { "${params.outdir}/krona/" },
mode: params.publish_dir_mode,
pattern: '*.{html}'
]
}
withName: 'MEGAN_RMA2INFO_KRONA' {
ext.args = { "--read2class Taxonomy" }
ext.prefix = { "${meta.id}_${meta.db_name}" }
}
withName: KRONA_KTIMPORTTAXONOMY {
ext.args = "-i"
ext.prefix = { "${meta.tool}_${meta.id}" }
publishDir = [
path: { "${params.outdir}/krona/" },
mode: params.publish_dir_mode,
pattern: '*.{html}'
]
}
withName: METAPHLAN3_METAPHLAN3 {
ext.args = { "${meta.db_params}" }
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.metaphlan3" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.metaphlan3" }
publishDir = [
path: { "${params.outdir}/metaphlan3/${meta.db_name}/" },
mode: params.publish_dir_mode,
pattern: '*.{biom,txt}'
]
}
withName: METAPHLAN3_MERGEMETAPHLANTABLES {
ext.prefix = { "metaphlan3_${meta.id}_combined_reports" }
publishDir = [
path: { "${params.outdir}/metaphlan3/" },
mode: params.publish_dir_mode,
pattern: '*.{txt}'
]
}
withName: CENTRIFUGE_CENTRIFUGE {
ext.args = { "${meta.db_params}" }
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.centrifuge" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.centrifuge" }
publishDir = [
path: { "${params.outdir}/centrifuge/${meta.db_name}/" },
mode: params.publish_dir_mode,
pattern: '*.{txt,sam,gz}'
]
}
withName: CENTRIFUGE_KREPORT {
ext.args = { "${meta.db_params}" }
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.centrifuge" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.centrifuge" }
publishDir = [
path: { "${params.outdir}/centrifuge/${meta.db_name}/" },
mode: params.publish_dir_mode,
pattern: '*.{txt}'
]
}
withName: KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE {
ext.prefix = { "centrifuge_${meta.id}_combined_reports" }
publishDir = [
path: { "${params.outdir}/centrifuge/" },
mode: params.publish_dir_mode,
pattern: '*.{txt}'
]
}
withName: KAIJU_KAIJU {
ext.args = { "${meta.db_params}" }
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.kaiju" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.kaiju" }
publishDir = [
path: { "${params.outdir}/kaiju/${meta.db_name}/" },
mode: params.publish_dir_mode,
pattern: '*.tsv'
]
}
withName: 'KAIJU_KAIJU2TABLE_SINGLE' {
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.kaijutable" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.kaijutable" }
publishDir = [
path: { "${params.outdir}/kaiju/${meta.db_name}/" },
mode: params.publish_dir_mode,
pattern: '*.{txt}'
]
}
withName: 'KAIJU_KAIJU2TABLE_COMBINED' {
ext.prefix = { "kaiju_${meta.id}_combined_reports" }
publishDir = [
path: { "${params.outdir}/kaiju/" },
mode: params.publish_dir_mode,
pattern: '*.{txt}'
]
}
withName: KAIJU_KAIJU2KRONA {
ext.args = '-v -u'
}
withName: DIAMOND_BLASTX {
ext.args = { "${meta.db_params}" }
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.diamond" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.diamond" }
publishDir = [
path: { "${params.outdir}/diamond/${meta.db_name}/" },
mode: params.publish_dir_mode,
pattern: '*.{blast,xml,txt,daa,sam,tsv,paf,log}'
]
}
withName: MOTUS_PROFILE {
ext.args = {
[
params.motus_remove_ncbi_ids ? "" : "-p",
params.motus_use_relative_abundance ? "" : "-c",
params.motus_save_mgc_read_counts ? "-M ${task.ext.prefix}.mgc" : ""
].join(',').replaceAll(','," ")
}
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}" }
publishDir = [
path: { "${params.outdir}/motus/${meta.db_name}/" },
mode: params.publish_dir_mode
]
}
withName: MOTUS_MERGE {
ext.args = { params.standardisation_motus_generatebiom ? "-B" : "" }
ext.prefix = { "motus_${meta.id}_combined_reports" }
publishDir = [
path: { "${params.outdir}/motus/" },
mode: params.publish_dir_mode
]
}
withName: TAXPASTA_MERGE {
ext.args = {
[
"-p ${meta.tool} -o ${meta.tool}_${meta.id}.${params.standardisation_taxpasta_format}",
params.taxpasta_taxonomy_dir ? "--taxonomy ${params.taxpasta_taxonomy_dir}" : "",
params.taxpasta_add_name ? "--add-name" : "",
params.taxpasta_add_rank ? "--add-rank" : "",
params.taxpasta_add_lineage ? "--add-lineage" : "",
params.taxpasta_add_idlineage ? "--add-id-lineage" : ""
].join(' ').trim()
}
publishDir = [
path: { "${params.outdir}/taxpasta/" },
mode: params.publish_dir_mode,
pattern: '*.{tsv,csv,arrow,parquet,biom}'
]
}
withName: CUSTOM_DUMPSOFTWAREVERSIONS {
@ -38,4 +663,11 @@ process {
]
}
withName: MULTIQC {
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
}

@ -20,10 +20,42 @@ params {
max_time = '6.h'
// Input data
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv'
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
perform_shortread_qc = true
perform_longread_qc = true
shortread_qc_mergepairs = true
perform_shortread_complexityfilter = true
perform_shortread_hostremoval = true
perform_longread_hostremoval = true
perform_runmerging = true
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
run_kaiju = true
run_kraken2 = true
run_bracken = true
run_malt = false
run_metaphlan3 = true
run_centrifuge = true
run_diamond = true
run_krakenuniq = true
run_motus = false
run_krona = true
krona_taxonomy_directory = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab'
malt_save_reads = true
kraken2_save_reads = true
centrifuge_save_reads = true
run_profile_standardisation = true
}
// Genome references
genome = 'R64-1-1'
process {
withName: MALT_RUN {
maxForks = 1
ext.args = { "-m ${params.malt_mode} -J-Xmx12G" }
}
withName: MEGAN_RMA2INFO_TSV {
maxForks = 1
}
withName: MEGAN_RMA2INFO_KRONA {
maxForks = 1
}
}

@ -1,12 +1,10 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running full-size tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a full size pipeline test.
Use as follows:
nextflow run nf-core/taxprofiler -profile test_full,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/
@ -17,10 +15,57 @@ params {
config_profile_description = 'Full test dataset to check pipeline function'
// Input data for full size test
// TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA)
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv'
input = 'https://github.com/nf-core/test-datasets/raw/taxprofiler/samplesheet_full.csv'
databases = 'https://github.com/nf-core/test-datasets/raw/taxprofiler/database_full.csv'
// Genome references
genome = 'R64-1-1'
hostremoval_reference = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/819/615/GCA_000819615.1_ViralProj14015/GCA_000819615.1_ViralProj14015_genomic.fna.gz'
save_preprocessed_reads = false
perform_shortread_qc = true
shortread_qc_mergepairs = true
perform_shortread_complexityfilter = false
save_complexityfiltered_reads = false
perform_longread_qc = true
perform_shortread_hostremoval = true
perform_longread_hostremoval = true
save_hostremoval_index = false
save_hostremoval_bam = false
save_hostremoval_unmapped = false
perform_runmerging = true
save_runmerged_reads = false
run_centrifuge = true
centrifuge_save_reads = false
run_diamond = true
run_kaiju = true
run_kraken2 = true
kraken2_save_reads = false
kraken2_save_readclassification = false
kraken2_save_minimizers = false
run_krakenuniq = true
krakenuniq_save_reads = false
krakenuniq_save_readclassifications = false
run_bracken = true
run_malt = true
malt_save_reads = false
malt_generate_megansummary = true
run_metaphlan3 = true
run_motus = true
motus_save_mgc_read_counts = true
run_profile_standardisation = true
run_krona = true
}
cleanup = true

@ -0,0 +1,65 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/taxprofiler -profile test,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/
//
// Separate test as KrakenUniq database can sometimes be too big for GHA
//
params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test to check KrakenUniq function'
// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'
// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_krakenuniq.csv'
perform_shortread_qc = true
perform_longread_qc = true
shortread_qc_mergepairs = true
perform_shortread_complexityfilter = true
perform_shortread_hostremoval = true
perform_longread_hostremoval = true
perform_runmerging = true
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
run_kaiju = false
run_kraken2 = false
run_bracken = false
run_malt = false
run_metaphlan3 = false
run_centrifuge = false
run_diamond = false
run_krakenuniq = true
run_motus = false
run_krona = true
krona_taxonomy_directory = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab'
malt_save_reads = false
kraken2_save_reads = false
centrifuge_save_reads = false
diamond_save_reads = false
run_profile_standardisation = true
}
process {
withName: MALT_RUN {
maxForks = 1
}
withName: MEGAN_RMA2INFO_TSV {
maxForks = 1
}
withName: MEGAN_RMA2INFO_KRONA {
maxForks = 1
}
}

@ -0,0 +1,49 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/taxprofiler -profile test,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/
//
// Separate test as mOTUs database download can be flaky
//
params {
config_profile_name = 'mOTUs Test profile'
config_profile_description = 'Minimal test to check mOTUs function'
// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'
// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
databases = 'database_motus.csv'
perform_shortread_qc = false
perform_longread_qc = false
perform_shortread_complexityfilter = false
perform_shortread_hostremoval = false
perform_longread_hostremoval = false
perform_runmerging = false
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
run_kaiju = false
run_kraken2 = false
run_bracken = false
run_malt = false
run_metaphlan3 = false
run_centrifuge = false
run_diamond = false
run_krakenuniq = false
run_motus = true
motus_save_mgc_read_counts = false
motus_remove_ncbi_ids = false
motus_use_relative_abundance = false
run_profile_standardisation = true
}

@ -0,0 +1,49 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/taxprofiler -profile test,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/
params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset skipping all preprocessing to check pipeline function'
// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'
// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
perform_shortread_qc = false
perform_longread_qc = false
perform_shortread_complexityfilter = false
perform_shortread_hostremoval = false
perform_longread_hostremoval = false
perform_runmerging = false
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
run_kaiju = true
run_kraken2 = true
run_bracken = true
run_malt = true
run_metaphlan3 = true
run_centrifuge = true
run_diamond = true
run_krakenuniq = true
run_motus = false
run_krona = true
}
process {
withName: MALT_RUN {
maxForks = 1
ext.args = { "-m ${params.malt_mode} -J-Xmx12G" }
}
}

@ -0,0 +1,48 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/taxprofiler -profile test,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/
params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset without performing any profiling to check pipeline function'
// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'
// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
perform_shortread_qc = true
perform_longread_qc = true
shortread_qc_mergepairs = true
perform_shortread_complexityfilter = true
perform_shortread_hostremoval = true
perform_longread_hostremoval = true
perform_runmerging = true
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
run_kaiju = false
run_kraken2 = false
run_bracken = false
run_malt = false
run_metaphlan3 = false
run_centrifuge = false
run_diamond = false
run_krakenuniq = false
run_motus = false
}
process {
withName: MALT_RUN {
maxForks = 1
}
}

@ -0,0 +1,48 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/taxprofiler -profile test,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/
params {
config_profile_name = 'Test profile'
config_profile_description = "Minimal test dataset without performing any preprocessing nor profiling to check pipeline function. Useful when you only wish to test a single profiler without having to 'opt-out' of all the others"
// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'
// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
perform_shortread_qc = false
perform_longread_qc = false
perform_shortread_complexityfilter = false
perform_shortread_hostremoval = false
perform_longread_hostremoval = false
perform_runmerging = false
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
run_kaiju = false
run_kraken2 = false
run_bracken = false
run_malt = false
run_metaphlan3 = false
run_centrifuge = false
run_diamond = false
run_krakenuniq = false
run_motus = false
}
process {
withName: MALT_RUN {
maxForks = 1
ext.args = { "-m ${params.malt_mode} -J-Xmx12G" }
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 131 KiB

@ -0,0 +1,444 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
width="160mm"
height="120mm"
viewBox="0 0 160 120"
version="1.1"
id="svg5581"
inkscape:version="1.2 (1:1.2+202205241504+da316b6974)"
sodipodi:docname="nf-core-taxprofiler_icon.svg"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg">
<sodipodi:namedview
id="namedview5583"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:showpageshadow="2"
inkscape:pageopacity="0.0"
inkscape:pagecheckerboard="0"
inkscape:deskcolor="#d1d1d1"
inkscape:document-units="mm"
showgrid="true"
inkscape:zoom="0.41291035"
inkscape:cx="541.27972"
inkscape:cy="445.61731"
inkscape:window-width="1920"
inkscape:window-height="1043"
inkscape:window-x="0"
inkscape:window-y="0"
inkscape:window-maximized="1"
inkscape:current-layer="layer1">
<inkscape:grid
type="xygrid"
id="grid6096"
originx="-7.7520637"
originy="-7.8991986" />
</sodipodi:namedview>
<defs
id="defs5578">
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient30057"
x1="10.213824"
y1="221.42242"
x2="218.95003"
y2="221.42242"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.6691607,0.01747377,-0.01751914,0.67089835,5.6293239,-100.28017)" />
<linearGradient
x1="0"
y1="0"
x2="1"
y2="0"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(37.87862,-29.594021,-29.594021,-37.87862,275.46292,136.24821)"
spreadMethod="pad"
id="linearGradient21662">
<stop
style="stop-opacity:1;stop-color:#000000"
offset="0"
id="stop21652" />
<stop
style="stop-opacity:1;stop-color:#0c542a"
offset="0.214724"
id="stop21654" />
<stop
style="stop-opacity:1;stop-color:#25af64"
offset="0.84662598"
id="stop21656" />
<stop
style="stop-opacity:1;stop-color:#25af64"
offset="0.846626"
id="stop21658" />
<stop
style="stop-opacity:1;stop-color:#25af64"
offset="1"
id="stop21660" />
</linearGradient>
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient2708"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174"
gradientUnits="userSpaceOnUse" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6027"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6029"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6031"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6033"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6035"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6037"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6039"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6041"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6043"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6045"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6047"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6049"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6051"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6053"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6055"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6057"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6059"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6061"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6063"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6065"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6067"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6069"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
</defs>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(-7.7520638,-7.8991986)">
<g
id="g6123"
transform="translate(7.3175494,9.6409068)">
<path
id="path30012-5"
style="fill:#ffffff;fill-opacity:1;stroke:none;stroke-width:0.424868;stroke-linecap:round"
d="M 9.5900834,8.8480637 C 7.5556439,12.028833 7.7408039,16.123943 7.7687039,19.769833 c 0.83412,6.65305 4.1623501,13.08043 9.6588001,17.05639 2.95411,2.45491 6.166,4.56934 9.30414,6.77667 -1.90794,4.17676 -2.98236,9.29825 -0.29786,13.40918 1.67713,3.72229 4.84583,6.89989 5.31985,11.06218 -0.73221,2.86643 -3.54676,6.3663 -0.97712,9.12477 2.12262,2.77099 4.06395,5.64907 5.35322,8.82231 2.02674,3.35439 6.5976,4.09759 10.20366,3.73748 2.47738,-0.18007 4.72202,-2.67715 3.28025,-5.08004 -1.18247,-2.55998 -4.28744,-2.18411 -6.38635,-3.25125 -1.36801,-2.44734 -3.00653,-5.99123 -0.26124,-8.17889 2.5926,-2.70872 4.69882,-5.81991 6.94515,-8.77398 2.71899,-1.24434 5.98816,-0.0654 8.84595,0.2859 10.88656,2.48435 21.31368,6.88721 32.36055,8.64773 3.13924,0.41138 6.76175,-0.45373 9.579516,0.92166 1.40942,4.43747 2.34224,9.45414 5.76225,12.84528 2.6934,1.93573 7.02228,2.12481 9.53599,-0.13787 1.60636,-2.93172 -1.52253,-5.89536 -4.20194,-6.62585 -0.40071,-2.8223 -0.37969,-5.87179 0.26854,-8.58529 3.77785,-0.24182 7.24094,1.16954 10.61732,2.45295 1.78619,3.3965 5.12259,5.96321 8.91186,6.64761 1.60415,-1.40492 2.56073,1.72858 3.8909,2.50862 2.79709,2.93023 6.8997,5.0038 10.99366,4.75587 3.26301,-0.67749 3.79546,-4.59452 2.47471,-7.17014 -0.80988,-4.2018 -3.22071,-8.59389 -1.2047,-12.82352 1.17357,-5.20598 -1.35199,-10.96325 -5.77146,-13.90702 -4.33733,-3.64436 -10.39077,-3.64653 -14.92742,-6.93614 -10.03223,-6.15469 -21.64077,-9.58338 -33.280326,-11.28456 -12.56653,-0.98847 -25.20371,-0.69003 -37.78225,-1.58411 -7.09295,-0.25502 -14.55087,-1.15633 -21.23347,1.84537 -3.9067,-0.36049 -7.22928,-2.98747 -10.60062,-4.8325 -3.63865,-2.26998 -7.6488,-4.94361 -8.46777,-9.49866 -1.63814,-4.07488 -1.20485,-8.6292 -2.57509,-12.7242393 -0.092,-1.43702 -3.1468606,-2.01015 -3.5173206,-0.42766 z" />
<path
style="fill:url(#linearGradient30057);fill-opacity:1;stroke:none;stroke-width:0.424868;stroke-linecap:round"
d="M 10.931314,9.9653233 C 9.7827934,11.308473 9.8484234,13.338583 9.5801634,15.027613 c -0.34115,2.76171 -0.29206,5.58762 0.6731896,8.22888 1.139301,3.87108 2.982591,7.65359 6.139101,10.27078 3.79198,3.54184 8.2401,6.33641 12.43029,9.30118 -0.91535,2.99306 -2.65543,5.87431 -2.30804,9.13471 0.18061,2.677 1.94571,4.91009 3.11856,7.23201 1.50937,2.8256 3.84729,5.54985 3.82514,8.91032 -0.0787,2.49314 -2.48083,4.5082 -1.96562,7.03872 1.67419,2.68138 3.992,4.96942 5.15932,7.96769 0.65556,1.60629 1.4695,3.40539 3.31812,3.87174 2.38126,0.82152 5.41091,1.64045 7.63103,0.0125 0.82635,-1.48261 -1.06902,-2.72955 -2.36736,-2.87342 -1.71199,-0.17589 -3.69466,-0.59568 -4.39228,-2.41244 -0.96281,-2.26721 -2.29358,-4.88743 -1.3576,-7.36922 2.00069,-3.29907 5.15025,-5.72284 7.05316,-9.10416 0.97281,-1.66962 2.43831,-3.35242 4.55042,-3.26353 6.66383,-0.13609 13.02922,2.25844 19.35504,4.0482 6.38637,1.7963 12.67764,4.02096 19.20445,5.25889 3.77887,0.58276 7.72718,-0.18602 11.419706,0.93529 1.63461,0.75428 1.35512,2.9334 2.05535,4.34159 1.1597,3.06423 1.81242,6.5515 4.16823,8.9707 1.76834,1.42588 4.39281,1.36118 6.4399,0.63293 1.20689,-0.70879 -0.0705,-2.295 -0.82735,-2.84018 -0.85353,-0.97431 -2.49109,-0.81793 -3.19991,-1.88904 -0.61703,-3.21399 -0.48223,-6.57875 0.0181,-9.80404 0.2319,-1.63989 2.04068,-2.10159 3.44204,-1.83986 3.30625,0.10127 6.50718,1.46919 9.55674,2.3424 -0.71254,-1.81326 -1.19119,-3.75193 -0.97348,-5.72815 0.0204,-1.69846 0.15688,-3.58482 1.5917,-4.73058 1.83011,-1.86627 3.60232,-3.85815 5.70315,-5.41088 0.57552,0.16274 -0.36083,0.85452 -0.4987,1.09386 -1.90807,2.09201 -4.24787,3.84191 -5.83093,6.20217 -0.46736,1.92903 -0.41493,3.99677 -0.24575,5.96645 0.84833,2.57726 2.1706,5.12487 4.1346,7.01585 1.36224,0.87656 2.75795,2.00764 4.37729,2.2713 1.6213,-0.74843 1.95011,-2.85006 2.55124,-4.35962 1.10913,-3.99969 1.90782,-8.08534 2.71945,-12.15319 0.1702,-1.75995 0.75519,-4.04477 -0.9398,-5.26766 -0.66443,-0.31636 -0.15084,-1.05324 0.36147,-0.52827 2.02638,1.51086 1.30614,4.35901 1.06011,6.49585 -0.85333,4.26646 -1.66477,8.56551 -2.96213,12.72302 -0.11797,0.91577 -1.34029,1.9982 -0.53389,2.72401 2.58751,2.57362 4.81242,5.92113 8.5085,6.96547 1.59827,0.50471 3.80045,1.66945 5.18808,0.14586 1.17766,-1.4017 -0.0976,-3.1877 -0.28432,-4.72092 -0.75396,-3.29433 -2.15674,-6.56045 -1.96438,-9.99106 0.37367,-2.10533 1.65495,-4.11536 1.02167,-6.32262 -0.34992,-2.84568 -1.45335,-5.69952 -3.77909,-7.50012 -2.77154,-2.67603 -6.40479,-4.14933 -10.09936,-5.01092 -3.81561,-1.24593 -6.9919,-3.83431 -10.61749,-5.50364 -6.72466,-3.69962 -14.2084,-5.65169 -21.615834,-7.44474 -8.303242,-1.93229 -16.888842,-1.47941 -25.343602,-1.84797 -8.46313,-0.1352 -16.90366,-0.80082 -25.3617,-1.05107 -4.38737,-0.20312 -8.92121,-0.0523 -12.99753,1.77349 -2.83378,1.235 -5.55113,-0.83445 -8.06653,-1.90121 -3.95435,-2.11512 -8.20653,-4.63235 -11.45036,-7.81607 -1.44093,-1.52832 -1.98381,-3.13498 -2.61394,-4.93556 -0.98461,-2.81345 -1.62386,-5.97312 -1.9462,-9.06999 -0.24149,-1.39646 -0.0764,-3.01443 -0.81326,-4.2426397 -0.0219,-0.007 -0.045,-0.0107 -0.0678,-0.007 z"
id="path30012"
sodipodi:nodetypes="cccccccccccccccccccccccccccccccccccccccccccccccccccccccccscccc"
inkscape:export-filename="/Users/whx424/Pictures/Illustrations/taxprofiler_v11.png"
inkscape:export-xdpi="159"
inkscape:export-ydpi="159" />
<path
id="path1218-5-4"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 36.798094,98.985353 a 7.0058785,6.6371479 0 0 1 -7.00588,6.637147 7.0058785,6.6371479 0 0 1 -7.00589,-6.637147 7.0058785,6.6371479 0 0 1 7.00589,-6.63713 7.0058785,6.6371479 0 0 1 7.00588,6.63713 z" />
<path
id="path1218-8-0"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 48.458974,96.857863 a 7.0058789,6.6371479 0 0 1 -7.00588,6.637147 7.0058789,6.6371479 0 0 1 -7.00588,-6.637147 7.0058789,6.6371479 0 0 1 7.00588,-6.63715 7.0058789,6.6371479 0 0 1 7.00588,6.63715 z" />
<path
id="path1218-4-46"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 59.481564,99.639333 a 7.0058789,6.6371479 0 0 1 -7.00588,6.637157 7.0058789,6.6371479 0 0 1 -7.00588,-6.637157 7.0058789,6.6371479 0 0 1 7.00588,-6.63715 7.0058789,6.6371479 0 0 1 7.00588,6.63715 z" />
<path
id="path1218-7-4"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 71.168234,96.889183 a 7.0058785,6.6371479 0 0 1 -7.00589,6.637157 7.0058785,6.6371479 0 0 1 -7.00588,-6.637157 7.0058785,6.6371479 0 0 1 7.00588,-6.63715 7.0058785,6.6371479 0 0 1 7.00589,6.63715 z" />
<path
id="path1218-1-6"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 82.714904,99.255233 a 7.0058789,6.6371479 0 0 1 -7.00587,6.637147 7.0058789,6.6371479 0 0 1 -7.00588,-6.637147 7.0058789,6.6371479 0 0 1 7.00588,-6.63714 7.0058789,6.6371479 0 0 1 7.00587,6.63714 z" />
<path
id="path1218-73-4"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 93.813184,96.584163 a 7.0058789,6.6371479 0 0 1 -7.00588,6.637147 7.0058789,6.6371479 0 0 1 -7.00587,-6.637147 7.0058789,6.6371479 0 0 1 7.00587,-6.63715 7.0058789,6.6371479 0 0 1 7.00588,6.63715 z" />
<path
id="path1218-5-9-3"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 104.85924,98.601273 a 7.0058789,6.6371479 0 0 1 -7.005886,6.637147 7.0058789,6.6371479 0 0 1 -7.00588,-6.637147 7.0058789,6.6371479 0 0 1 7.00588,-6.63716 7.0058789,6.6371479 0 0 1 7.005886,6.63716 z" />
<path
id="path1218-8-7-8"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 116.52015,96.473773 a 7.0058785,6.6371479 0 0 1 -7.00589,6.637127 7.0058785,6.6371479 0 0 1 -7.00588,-6.637127 7.0058785,6.6371479 0 0 1 7.00588,-6.63714 7.0058785,6.6371479 0 0 1 7.00589,6.63714 z" />
<path
id="path1218-4-7-4"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 127.54274,99.255233 a 7.0058789,6.6371479 0 0 1 -7.00589,6.637147 7.0058789,6.6371479 0 0 1 -7.00588,-6.637147 7.0058789,6.6371479 0 0 1 7.00588,-6.63714 7.0058789,6.6371479 0 0 1 7.00589,6.63714 z" />
<path
id="path1218-7-9-00"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 139.22941,96.505103 a 7.0058785,6.6371479 0 0 1 -7.00587,6.637147 7.0058785,6.6371479 0 0 1 -7.00588,-6.637147 7.0058785,6.6371479 0 0 1 7.00588,-6.63716 7.0058785,6.6371479 0 0 1 7.00587,6.63716 z" />
<path
id="path1218-1-5-9"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 150.77607,98.871153 a 7.0058785,6.6371479 0 0 1 -7.00588,6.637137 7.0058785,6.6371479 0 0 1 -7.00588,-6.637137 7.0058785,6.6371479 0 0 1 7.00588,-6.63715 7.0058785,6.6371479 0 0 1 7.00588,6.63715 z" />
<g
id="g2682"
style="fill:url(#linearGradient2708);fill-opacity:1;stroke:url(#linearGradient2708)"
inkscape:export-filename="/Users/whx424/Pictures/Illustrations/taxprofiler_v11.png"
inkscape:export-xdpi="159"
inkscape:export-ydpi="159"
transform="matrix(3.7156967,0,0,3.7156967,-30.828156,-200.55501)">
<ellipse
style="fill:url(#linearGradient6027);fill-opacity:1;stroke:url(#linearGradient6029);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-5"
cx="16.378149"
cy="80.642143"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6031);fill-opacity:1;stroke:url(#linearGradient6033);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-8"
cx="19.516428"
cy="80.069572"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6035);fill-opacity:1;stroke:url(#linearGradient6037);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-4"
cx="22.48292"
cy="80.818146"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6039);fill-opacity:1;stroke:url(#linearGradient6041);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-7"
cx="25.628136"
cy="80.078003"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6043);fill-opacity:1;stroke:url(#linearGradient6045);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-1"
cx="28.735676"
cy="80.714775"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6047);fill-opacity:1;stroke:url(#linearGradient6049);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-73"
cx="31.72254"
cy="79.995911"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6051);fill-opacity:1;stroke:url(#linearGradient6053);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-5-9"
cx="34.695347"
cy="80.538773"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6055);fill-opacity:1;stroke:url(#linearGradient6057);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-8-7"
cx="37.83363"
cy="79.966202"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6059);fill-opacity:1;stroke:url(#linearGradient6061);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-4-7"
cx="40.800121"
cy="80.714775"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6063);fill-opacity:1;stroke:url(#linearGradient6065);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-7-9"
cx="43.945339"
cy="79.974632"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6067);fill-opacity:1;stroke:url(#linearGradient6069);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-1-5"
cx="47.052876"
cy="80.611404"
rx="1.4964142"
ry="1.4176555" />
</g>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 22 KiB

@ -0,0 +1,445 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
width="180mm"
height="180mm"
viewBox="0 0 180 180"
version="1.1"
id="svg5581"
inkscape:version="1.2 (1:1.2+202206011327+fc4e4096c5)"
sodipodi:docname="nf-core-taxprofiler_icon_border.svg"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg">
<sodipodi:namedview
id="namedview5583"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:showpageshadow="2"
inkscape:pageopacity="0.0"
inkscape:pagecheckerboard="0"
inkscape:deskcolor="#d1d1d1"
inkscape:document-units="mm"
showgrid="true"
inkscape:zoom="0.41291035"
inkscape:cx="449.25006"
inkscape:cy="446.82823"
inkscape:window-width="1920"
inkscape:window-height="1016"
inkscape:window-x="784"
inkscape:window-y="1107"
inkscape:window-maximized="1"
inkscape:current-layer="layer1">
<inkscape:grid
type="xygrid"
id="grid6096"
originx="-7.7520638"
originy="-7.8991984"
dotted="true" />
</sodipodi:namedview>
<defs
id="defs5578">
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient30057"
x1="10.213824"
y1="221.42242"
x2="218.95003"
y2="221.42242"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.6691607,0.01747377,-0.01751914,0.67089835,5.6293239,-100.28017)" />
<linearGradient
x1="0"
y1="0"
x2="1"
y2="0"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(37.87862,-29.594021,-29.594021,-37.87862,275.46292,136.24821)"
spreadMethod="pad"
id="linearGradient21662">
<stop
style="stop-opacity:1;stop-color:#000000"
offset="0"
id="stop21652" />
<stop
style="stop-opacity:1;stop-color:#0c542a"
offset="0.214724"
id="stop21654" />
<stop
style="stop-opacity:1;stop-color:#25af64"
offset="0.84662598"
id="stop21656" />
<stop
style="stop-opacity:1;stop-color:#25af64"
offset="0.846626"
id="stop21658" />
<stop
style="stop-opacity:1;stop-color:#25af64"
offset="1"
id="stop21660" />
</linearGradient>
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient2708"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174"
gradientUnits="userSpaceOnUse" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6027"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6029"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6031"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6033"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6035"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6037"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6039"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6041"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6043"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6045"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6047"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6049"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6051"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6053"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6055"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6057"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6059"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6061"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6063"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6065"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6067"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient21662"
id="linearGradient6069"
gradientUnits="userSpaceOnUse"
x1="14.381735"
y1="80.392174"
x2="49.04929"
y2="80.392174" />
</defs>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(-7.7520638,-7.8991986)">
<g
id="g6123"
transform="translate(17.317549,39.640907)">
<path
id="path30012-5"
style="fill:#ffffff;fill-opacity:1;stroke:none;stroke-width:0.424868;stroke-linecap:round"
d="M 9.5900834,8.8480637 C 7.5556439,12.028833 7.7408039,16.123943 7.7687039,19.769833 c 0.83412,6.65305 4.1623501,13.08043 9.6588001,17.05639 2.95411,2.45491 6.166,4.56934 9.30414,6.77667 -1.90794,4.17676 -2.98236,9.29825 -0.29786,13.40918 1.67713,3.72229 4.84583,6.89989 5.31985,11.06218 -0.73221,2.86643 -3.54676,6.3663 -0.97712,9.12477 2.12262,2.77099 4.06395,5.64907 5.35322,8.82231 2.02674,3.35439 6.5976,4.09759 10.20366,3.73748 2.47738,-0.18007 4.72202,-2.67715 3.28025,-5.08004 -1.18247,-2.55998 -4.28744,-2.18411 -6.38635,-3.25125 -1.36801,-2.44734 -3.00653,-5.99123 -0.26124,-8.17889 2.5926,-2.70872 4.69882,-5.81991 6.94515,-8.77398 2.71899,-1.24434 5.98816,-0.0654 8.84595,0.2859 10.88656,2.48435 21.31368,6.88721 32.36055,8.64773 3.13924,0.41138 6.76175,-0.45373 9.579516,0.92166 1.40942,4.43747 2.34224,9.45414 5.76225,12.84528 2.6934,1.93573 7.02228,2.12481 9.53599,-0.13787 1.60636,-2.93172 -1.52253,-5.89536 -4.20194,-6.62585 -0.40071,-2.8223 -0.37969,-5.87179 0.26854,-8.58529 3.77785,-0.24182 7.24094,1.16954 10.61732,2.45295 1.78619,3.3965 5.12259,5.96321 8.91186,6.64761 1.60415,-1.40492 2.56073,1.72858 3.8909,2.50862 2.79709,2.93023 6.8997,5.0038 10.99366,4.75587 3.26301,-0.67749 3.79546,-4.59452 2.47471,-7.17014 -0.80988,-4.2018 -3.22071,-8.59389 -1.2047,-12.82352 1.17357,-5.20598 -1.35199,-10.96325 -5.77146,-13.90702 -4.33733,-3.64436 -10.39077,-3.64653 -14.92742,-6.93614 -10.03223,-6.15469 -21.64077,-9.58338 -33.280326,-11.28456 -12.56653,-0.98847 -25.20371,-0.69003 -37.78225,-1.58411 -7.09295,-0.25502 -14.55087,-1.15633 -21.23347,1.84537 -3.9067,-0.36049 -7.22928,-2.98747 -10.60062,-4.8325 -3.63865,-2.26998 -7.6488,-4.94361 -8.46777,-9.49866 -1.63814,-4.07488 -1.20485,-8.6292 -2.57509,-12.7242393 -0.092,-1.43702 -3.1468606,-2.01015 -3.5173206,-0.42766 z" />
<path
style="fill:url(#linearGradient30057);fill-opacity:1;stroke:none;stroke-width:0.424868;stroke-linecap:round"
d="M 10.931314,9.9653233 C 9.7827934,11.308473 9.8484234,13.338583 9.5801634,15.027613 c -0.34115,2.76171 -0.29206,5.58762 0.6731896,8.22888 1.139301,3.87108 2.982591,7.65359 6.139101,10.27078 3.79198,3.54184 8.2401,6.33641 12.43029,9.30118 -0.91535,2.99306 -2.65543,5.87431 -2.30804,9.13471 0.18061,2.677 1.94571,4.91009 3.11856,7.23201 1.50937,2.8256 3.84729,5.54985 3.82514,8.91032 -0.0787,2.49314 -2.48083,4.5082 -1.96562,7.03872 1.67419,2.68138 3.992,4.96942 5.15932,7.96769 0.65556,1.60629 1.4695,3.40539 3.31812,3.87174 2.38126,0.82152 5.41091,1.64045 7.63103,0.0125 0.82635,-1.48261 -1.06902,-2.72955 -2.36736,-2.87342 -1.71199,-0.17589 -3.69466,-0.59568 -4.39228,-2.41244 -0.96281,-2.26721 -2.29358,-4.88743 -1.3576,-7.36922 2.00069,-3.29907 5.15025,-5.72284 7.05316,-9.10416 0.97281,-1.66962 2.43831,-3.35242 4.55042,-3.26353 6.66383,-0.13609 13.02922,2.25844 19.35504,4.0482 6.38637,1.7963 12.67764,4.02096 19.20445,5.25889 3.77887,0.58276 7.72718,-0.18602 11.419706,0.93529 1.63461,0.75428 1.35512,2.9334 2.05535,4.34159 1.1597,3.06423 1.81242,6.5515 4.16823,8.9707 1.76834,1.42588 4.39281,1.36118 6.4399,0.63293 1.20689,-0.70879 -0.0705,-2.295 -0.82735,-2.84018 -0.85353,-0.97431 -2.49109,-0.81793 -3.19991,-1.88904 -0.61703,-3.21399 -0.48223,-6.57875 0.0181,-9.80404 0.2319,-1.63989 2.04068,-2.10159 3.44204,-1.83986 3.30625,0.10127 6.50718,1.46919 9.55674,2.3424 -0.71254,-1.81326 -1.19119,-3.75193 -0.97348,-5.72815 0.0204,-1.69846 0.15688,-3.58482 1.5917,-4.73058 1.83011,-1.86627 3.60232,-3.85815 5.70315,-5.41088 0.57552,0.16274 -0.36083,0.85452 -0.4987,1.09386 -1.90807,2.09201 -4.24787,3.84191 -5.83093,6.20217 -0.46736,1.92903 -0.41493,3.99677 -0.24575,5.96645 0.84833,2.57726 2.1706,5.12487 4.1346,7.01585 1.36224,0.87656 2.75795,2.00764 4.37729,2.2713 1.6213,-0.74843 1.95011,-2.85006 2.55124,-4.35962 1.10913,-3.99969 1.90782,-8.08534 2.71945,-12.15319 0.1702,-1.75995 0.75519,-4.04477 -0.9398,-5.26766 -0.66443,-0.31636 -0.15084,-1.05324 0.36147,-0.52827 2.02638,1.51086 1.30614,4.35901 1.06011,6.49585 -0.85333,4.26646 -1.66477,8.56551 -2.96213,12.72302 -0.11797,0.91577 -1.34029,1.9982 -0.53389,2.72401 2.58751,2.57362 4.81242,5.92113 8.5085,6.96547 1.59827,0.50471 3.80045,1.66945 5.18808,0.14586 1.17766,-1.4017 -0.0976,-3.1877 -0.28432,-4.72092 -0.75396,-3.29433 -2.15674,-6.56045 -1.96438,-9.99106 0.37367,-2.10533 1.65495,-4.11536 1.02167,-6.32262 -0.34992,-2.84568 -1.45335,-5.69952 -3.77909,-7.50012 -2.77154,-2.67603 -6.40479,-4.14933 -10.09936,-5.01092 -3.81561,-1.24593 -6.9919,-3.83431 -10.61749,-5.50364 -6.72466,-3.69962 -14.2084,-5.65169 -21.615834,-7.44474 -8.303242,-1.93229 -16.888842,-1.47941 -25.343602,-1.84797 -8.46313,-0.1352 -16.90366,-0.80082 -25.3617,-1.05107 -4.38737,-0.20312 -8.92121,-0.0523 -12.99753,1.77349 -2.83378,1.235 -5.55113,-0.83445 -8.06653,-1.90121 -3.95435,-2.11512 -8.20653,-4.63235 -11.45036,-7.81607 -1.44093,-1.52832 -1.98381,-3.13498 -2.61394,-4.93556 -0.98461,-2.81345 -1.62386,-5.97312 -1.9462,-9.06999 -0.24149,-1.39646 -0.0764,-3.01443 -0.81326,-4.2426397 -0.0219,-0.007 -0.045,-0.0107 -0.0678,-0.007 z"
id="path30012"
sodipodi:nodetypes="cccccccccccccccccccccccccccccccccccccccccccccccccccccccccscccc"
inkscape:export-filename="/Users/whx424/Pictures/Illustrations/taxprofiler_v11.png"
inkscape:export-xdpi="159"
inkscape:export-ydpi="159" />
<path
id="path1218-5-4"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 36.798094,98.985353 a 7.0058785,6.6371479 0 0 1 -7.00588,6.637147 7.0058785,6.6371479 0 0 1 -7.00589,-6.637147 7.0058785,6.6371479 0 0 1 7.00589,-6.63713 7.0058785,6.6371479 0 0 1 7.00588,6.63713 z" />
<path
id="path1218-8-0"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 48.458974,96.857863 a 7.0058789,6.6371479 0 0 1 -7.00588,6.637147 7.0058789,6.6371479 0 0 1 -7.00588,-6.637147 7.0058789,6.6371479 0 0 1 7.00588,-6.63715 7.0058789,6.6371479 0 0 1 7.00588,6.63715 z" />
<path
id="path1218-4-46"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 59.481564,99.639333 a 7.0058789,6.6371479 0 0 1 -7.00588,6.637157 7.0058789,6.6371479 0 0 1 -7.00588,-6.637157 7.0058789,6.6371479 0 0 1 7.00588,-6.63715 7.0058789,6.6371479 0 0 1 7.00588,6.63715 z" />
<path
id="path1218-7-4"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 71.168234,96.889183 a 7.0058785,6.6371479 0 0 1 -7.00589,6.637157 7.0058785,6.6371479 0 0 1 -7.00588,-6.637157 7.0058785,6.6371479 0 0 1 7.00588,-6.63715 7.0058785,6.6371479 0 0 1 7.00589,6.63715 z" />
<path
id="path1218-1-6"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 82.714904,99.255233 a 7.0058789,6.6371479 0 0 1 -7.00587,6.637147 7.0058789,6.6371479 0 0 1 -7.00588,-6.637147 7.0058789,6.6371479 0 0 1 7.00588,-6.63714 7.0058789,6.6371479 0 0 1 7.00587,6.63714 z" />
<path
id="path1218-73-4"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 93.813184,96.584163 a 7.0058789,6.6371479 0 0 1 -7.00588,6.637147 7.0058789,6.6371479 0 0 1 -7.00587,-6.637147 7.0058789,6.6371479 0 0 1 7.00587,-6.63715 7.0058789,6.6371479 0 0 1 7.00588,6.63715 z" />
<path
id="path1218-5-9-3"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 104.85924,98.601273 a 7.0058789,6.6371479 0 0 1 -7.005886,6.637147 7.0058789,6.6371479 0 0 1 -7.00588,-6.637147 7.0058789,6.6371479 0 0 1 7.00588,-6.63716 7.0058789,6.6371479 0 0 1 7.005886,6.63716 z" />
<path
id="path1218-8-7-8"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 116.52015,96.473773 a 7.0058785,6.6371479 0 0 1 -7.00589,6.637127 7.0058785,6.6371479 0 0 1 -7.00588,-6.637127 7.0058785,6.6371479 0 0 1 7.00588,-6.63714 7.0058785,6.6371479 0 0 1 7.00589,6.63714 z" />
<path
id="path1218-4-7-4"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 127.54274,99.255233 a 7.0058789,6.6371479 0 0 1 -7.00589,6.637147 7.0058789,6.6371479 0 0 1 -7.00588,-6.637147 7.0058789,6.6371479 0 0 1 7.00588,-6.63714 7.0058789,6.6371479 0 0 1 7.00589,6.63714 z" />
<path
id="path1218-7-9-00"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 139.22941,96.505103 a 7.0058785,6.6371479 0 0 1 -7.00587,6.637147 7.0058785,6.6371479 0 0 1 -7.00588,-6.637147 7.0058785,6.6371479 0 0 1 7.00588,-6.63716 7.0058785,6.6371479 0 0 1 7.00587,6.63716 z" />
<path
id="path1218-1-5-9"
style="fill:#ffffff;fill-opacity:1;stroke:#ffffff;stroke-width:4.68179;stroke-linecap:round"
d="m 150.77607,98.871153 a 7.0058785,6.6371479 0 0 1 -7.00588,6.637137 7.0058785,6.6371479 0 0 1 -7.00588,-6.637137 7.0058785,6.6371479 0 0 1 7.00588,-6.63715 7.0058785,6.6371479 0 0 1 7.00588,6.63715 z" />
<g
id="g2682"
style="fill:url(#linearGradient2708);fill-opacity:1;stroke:url(#linearGradient2708)"
inkscape:export-filename="/Users/whx424/Pictures/Illustrations/taxprofiler_v11.png"
inkscape:export-xdpi="159"
inkscape:export-ydpi="159"
transform="matrix(3.7156967,0,0,3.7156967,-30.828156,-200.55501)">
<ellipse
style="fill:url(#linearGradient6027);fill-opacity:1;stroke:url(#linearGradient6029);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-5"
cx="16.378149"
cy="80.642143"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6031);fill-opacity:1;stroke:url(#linearGradient6033);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-8"
cx="19.516428"
cy="80.069572"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6035);fill-opacity:1;stroke:url(#linearGradient6037);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-4"
cx="22.48292"
cy="80.818146"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6039);fill-opacity:1;stroke:url(#linearGradient6041);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-7"
cx="25.628136"
cy="80.078003"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6043);fill-opacity:1;stroke:url(#linearGradient6045);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-1"
cx="28.735676"
cy="80.714775"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6047);fill-opacity:1;stroke:url(#linearGradient6049);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-73"
cx="31.72254"
cy="79.995911"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6051);fill-opacity:1;stroke:url(#linearGradient6053);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-5-9"
cx="34.695347"
cy="80.538773"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6055);fill-opacity:1;stroke:url(#linearGradient6057);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-8-7"
cx="37.83363"
cy="79.966202"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6059);fill-opacity:1;stroke:url(#linearGradient6061);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-4-7"
cx="40.800121"
cy="80.714775"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6063);fill-opacity:1;stroke:url(#linearGradient6065);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-7-9"
cx="43.945339"
cy="79.974632"
rx="1.4964142"
ry="1.4176555" />
<ellipse
style="fill:url(#linearGradient6067);fill-opacity:1;stroke:url(#linearGradient6069);stroke-width:1;stroke-linecap:round;stroke-miterlimit:4;stroke-dasharray:none"
id="path1218-1-5"
cx="47.052876"
cy="80.611404"
rx="1.4964142"
ry="1.4176555" />
</g>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 389 KiB

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 87 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 433 KiB

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 87 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 139 KiB

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 124 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 714 KiB

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 289 KiB

@ -6,36 +6,509 @@ This document describes the output produced by the pipeline. Most of the plots a
The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
<!-- TODO nf-core: Write this documentation describing your workflow's output -->
## Pipeline overview
The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
- [FastQC](#fastqc) - Raw read QC
- [falco](#fastqc) - Alternative to FastQC for raw read QC
- [fastp](#fastp) - Adapter trimming for Illumina data
- [AdapterRemoval](#adapterremoval) - Adapter trimming for Illumina data
- [Porechop](#porechop) - Adapter removal for Oxford Nanopore data
- [BBDuk](#bbduk) - Quality trimming and filtering for Illumina data
- [PRINSEQ++](#prinseq) - Quality trimming and filtering for Illunina data
- [Filtlong](#filtlong) - Quality trimming and filtering for Nanopore data
- [Bowtie2](#bowtie2) - Host removal for Illumina reads
- [minimap2](#minimap2) - Host removal for Nanopore reads
- [SAMtools stats](#samtools-stats) - Statistics from host removal
- [SAMtools fastq](#samtools-fastq) - Converts unmapped BAM file to fastq format (minimap2 only)
- [Analysis Ready Reads](#analysis-read-reads) - Optional results directory containing the final processed reads used as input for classification/profiling.
- [Bracken](#bracken) - Taxonomic classifier using k-mers and abundance estimations
- [Kraken2](#kraken2) - Taxonomic classifier using exact k-mer matches
- [KrakenUniq](#krakenuniq) - Taxonomic classifier that combines the k-mer-based classification and the number of unique k-mers found in each species
- [Centrifuge](#centrifuge) - Taxonomic classifier that uses a novel indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index.
- [Kaiju](#kaiju) - Taxonomic classifier that finds maximum (in-)exact matches on the protein-level.
- [Diamond](#diamond) - Sequence aligner for protein and translated DNA searches.
- [MALT](#malt) - Sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics
- [MetaPhlAn3](#metaphlan3) - Genome-level marker gene based taxonomic classifier
- [mOTUs](#motus) - Tool for marker gene-based OTU (mOTU) profiling.
- [TAXPASTA](#taxpasta) - Tool to standardise taxonomic profiles as well as merge profiles across samples from the same database and classifier/profiler.
- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
### FastQC
![](images/taxprofiler_tube.png)
### FastQC or Falco
<details markdown="1">
<summary>Output files</summary>
- `fastqc/`
- `*_fastqc.html`: FastQC report containing quality metrics.
- `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
- `{fastqc,falco}/`
- {raw,preprocessed}
- `*html`: FastQC or Falco report containing quality metrics in HTML format.
- `*.txt`: FastQC or Falco report containing quality metrics in TXT format.
- `*.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images (FastQC only).
</details>
[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
If preprocessing is turned on, nf-core/taxprofiler runs FastQC/Falco twice -once before and once after adapter removal/read merging, to allow evaluation of the performance of these preprocessing steps. Note in the General Stats table, the columns of these two instances of FastQC/Falco are placed next to each other to make it easier to evaluate. However, the columns of the actual preprocessing steps (i.e, fastp, AdapterRemoval, and Porechop) will be displayed _after_ the two FastQC/Falco columns, even if they were run 'between' the two FastQC/Falco jobs in the pipeline itself.
> Falco produces identical output to FastQC but in the `falco/` directory.
![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png)
![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png)
![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png)
> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
### fastp
[fastp](https://github.com/OpenGene/fastp) is a FASTQ pre-processing tool for quality control, trimmming of adapters, quality filtering and other features.
It is used in nf-core/taxprofiler for adapter trimming of short-reads.
<details markdown="1">
<summary>Output files</summary>
- `fastp`
- `<sample_id>.fastp.fastq.gz`: File with the trimmed unmerged fastq reads.
- `<sample_id>.merged.fastq.gz`: File with the reads that were successfully merged.
- `<sample_id>.*{log,html,json}`: Log files in different formats.
</details>
By default nf-core/taxprofiler will only provide the `<sample_id>.fastp.fastq.gz` file if fastp is selected. The file `<sample_id>.merged.fastq.gz` will be available in the output folder if you provide the argument ` --shortread_qc_mergepairs` (optionally retaining un-merged pairs when in combination with `--shortread_qc_includeunmerged`).
You can change the default value for low complexity filtering by using the argument `--shortread_complexityfilter_fastp_threshold`.
### AdapterRemoval
[AdapterRemoval](https://adapterremoval.readthedocs.io/en/stable/) searches for and removes remnant adapter sequences from High-Throughput Sequencing (HTS) data and (optionally) trims low quality bases from the 3' end of reads following adapter removal. It is popular in the field of palaeogenomics. The output logs are stored in the results folder, and as a part of the MultiQC report.
<details markdown="1">
<summary>Output files</summary>
- `adapterremoval/`
- `<sample_id>.settings`: AdapterRemoval log file containing general adapter removal, read trimming and merging statistics
- `<sample_id>.collapsed.fastq.gz` - read-pairs that merged and did not undergo trimming (only when `--shortread_qc_mergepairs` supplied)
- `<sample_id>.collapsed.truncated.fastq.gz` - read-pairs that merged underwent quality trimming (only when `--shortread_qc_mergepairs` supplied)
- `<sample_id>.pair1.truncated.fastq.gz` - read 1 of pairs that underwent quality trimming
- `<sample_id>.pair2.truncated.fastq.gz` - read 2 of pairs that underwent quality trimming (and could not merge if `--shortread_qc_mergepairs` supplied)
- `<sample_id>.singleton.truncated.fastq.gz` - orphaned read pairs where one of the pair was discarded
- `<sample_id>.discard.fastq.gz` - reads that were discarded due to length or quality filtering
</details>
By default nf-core/taxprofiler will only provide the `.settings` file if AdapterRemoval is selected.
You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. If this is selected, you may receive different combinations of `.fastq` files for each sample depending on the input types - e.g. whether you have merged or not, or if you're supplying both single- and paired-end reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
> ⚠️ The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as complexity filtering, host removal, run merging etc..
### Porechop
[Porechop](https://github.com/rrwick/Porechop) is a tool for finding and removing adapters from Oxford Nanopore reads. Adapters on the ends of reads are trimmed and if a read has an adapter in its middle, it is considered a chimeric and it chopped into separate reads.
<details markdown="1">
<summary>Output files</summary>
- `porechop`
- `<sample_id>.log`: Log file containing trimming statistics
- `<sample_id>.fastq.gz`: Adapter-trimmed file
</details>
The output logs are saved in the output folder and are part of MultiQC report.You do not normally need to check these manually.
You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
> ⚠️ We do **not** recommend using Porechop if you are already trimming the adapters with ONT's basecaller Guppy.
### BBDuk
[BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) stands for Decontamination Using Kmers. BBDuk was developed to combine most common data-quality-related trimming, filtering, and masking operations into a single high-performance tool.
It is used in nf-core/taxprofiler for complexity filtering using different algorithms. This means that it will remove reads with low sequence diversity (e.g. mono- or dinucleotide repeats).
<details markdown="1">
<summary>Output files</summary>
- `bbduk/`
- `<sample_id>.bbduk.log`: log file containing filtering statistics
- `<sample_id>.fastq.gz`: resulting FASTQ file without low-complexity reads
</details>
By default nf-core/taxprofiler will only provide the `.log` file if BBDuk is selected as the complexity filtering tool. You will only find the complexity filtered reads in your results directory if you provide ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
> ⚠️ The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc..
### PRINSEQ++
[PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus) is a C++ implementation of the [prinseq-lite.pl](https://prinseq.sourceforge.net/) program. It can be used to filter, reformat or trim genomic and metagenomic sequence data.
It is used in nf-core/taxprofiler for complexity filtering using different algorithms. This means that it will remove reads with low sequence diversity (e.g. mono- or dinucleotide repeats).
<details markdown="1">
<summary>Output files</summary>
- `prinseqplusplus/`
- `<sample_id>.log`: log file containing number of reads. Row IDs correspond to: `min_len, max_len, min_gc, max_gc, min_qual_score, min_qual_mean, ns_max_n, noiupac, derep, lc_entropy, lc_dust, trim_tail_left, trim_tail_right, trim_qual_left, trim_qual_right, trim_left, trim_right`
- `<sample_id>_good_out.fastq.gz`: resulting FASTQ file without low-complexity reads
</details>
By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is selected as the complexity filtering tool. You will only find the complexity filtered `.fastq` files in your results directory if you supply ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
> ⚠️ The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc..
### Filtlong
[Filtlong](https://github.com/rrwick/Filtlong) is a quality filtering tool for long reads. It can take a set of small reads and produce a smaller, better subset.
<details markdown="1">
<summary>Output files</summary>
- `filtlong`
- `<sample_id>_filtered.fastq.gz`: Quality or short read data filtered file
- `<sample_id>_filtered.log`: log file containing summary statistics
</details>
You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
> ⚠️ We do **not** recommend using Filtlong if you are performing filtering of low quality reads with ONT's basecaller Guppy.
### Bowtie2
[Bowtie 2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml) is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences. It is particularly good at aligning reads of about 50 up to 100s or 1,000s of characters, and particularly good at aligning to relatively long (e.g. mammalian) genomes.
It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) and/or other possible contaminant reads (e.g. Phi X) from short-read `.fastq` files prior to profiling.
<details markdown="1">
<summary>Output files</summary>
- `bowtie2/`
- `build/`
- `*.bt2`: Bowtie2 indicies of reference genome, only if `--save_hostremoval_index` supplied.
- `align/`
- `<sample_id>.bam`: BAM file containing reads that aligned against the user-supplied reference genome as well as unmapped reads
- `<sample_id>.bowtie2.log`: log file about the mapped reads
- `<sample_id>.unmapped.fastq.gz`: the off-target reads from the mapping that is used in downstream steps.
</details>
By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify to save `--save_hostremoval_unmapped` - these contain only unmapped reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
> Unmapped reads in FASTQ are only found in this directory for short-reads, for long-reads see [`samtools/fastq/`](#samtools-fastq)
> ⚠️ The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as run merging etc..
> While there is a dedicated section in the MultiQC HTML for Bowtie2, these values are not displayed by default in the General Stats table. Rather, alignment statistics to host genome is reported via samtools stats module in MultiQC report for direct comparison with minimap2 (see below).
### minimap2
[minimap2](https://github.com/lh3/minimap2) is an alignment tool suited to mapping long reads to reference sequences.
It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) or other possible contaminant reads from long-read `.fastq` files prior to taxonomic classification/profiling.
<details markdown="1">
<summary>Output files</summary>
- `minimap2`
- `build/`
- `*.mmi2`: minimap2 indices of reference genome, only if `--save_hostremoval_index` supplied.
- `align/`
- `<sample_id>.bam`: Alignment file in BAM format containing both mapped and unmapped reads.
</details>
By default, nf-core/taxprofiler will only provide the `.bam` file containing mapped and unmapped reads if saving of host removal for long reads is turned on via `--save_hostremoval_bam`.
> minimap2 is not yet supported as a module in MultiQC and therefore there is no dedicated section in the MultiQC HTML. Rather, alignment statistics to host genome is reported via samtools stats module in MultiQC report.
> Unlike Bowtie2, minimap2 does not produce an unmapped FASTQ file by itself. See [`samtools/fastq`](#samtools-fastq)
### SAMtools fastq
[SAMtools fastq](http://www.htslib.org/doc/1.1/samtools.html) converts a `.sam`, `.bam`, or `.cram` alignment file to FASTQ format
<details markdown="1">
<summary>Output files</summary>
- `samtoolsstats`
- `<sample_id>_interleaved.fq.gz`: Unmapped reads only in FASTQ gzip format
</details>
This directory will be present and contain the unmapped reads from the `.fastq` format from long-read minimap2 host removal, if `--save_hostremoval_unmapped` is supplied. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
> For short-read unmapped reads, see [bowtie2](#bowtie2).
### Analysis Ready Reads
> This optional results directory will only be present in the pipeline results when supplying `--save_analysis_ready_reads`.
<details markdown="1">
<summary>Output files</summary>
- `samtoolsstats`
- `<sample_id>_{fq,fastq}.gz`: Final reads that underwent preprocessing and were sent for classification/profiling.
</details>
The results directory will contain the 'final' processed reads used as input for classification/profiling. It will _only_ include the output of the _last_ step of any combinations of preprocessing steps that may have been specified in the run configuration. For example, if you perform the read QC and host-removal preprocessing steps, the final reads that are sent to classification/profiling are the host-removed FASTQ files - those will be the ones present in this directory.
> ⚠️ If you turn off all preprocessing steps, then no results will be present in this directory. This happens independently for short- and long-reads. I.e. you will only have FASTQ files for short reads in this directory if you skip all long-read preprocessing.
### SAMtools stats
[SAMtools stats](http://www.htslib.org/doc/samtools-stats.html) collects statistics from a `.sam`, `.bam`, or `.cram` alignment file and outputs in a text format.
<details markdown="1">
<summary>Output files</summary>
- `samtools/stats`
- `<sample_id>.stats`: File containing samtools stats output.
</details>
In most cases you do not need to check this file, as it is rendered in the MultiQC run report.
### Run Merging
nf-core/taxprofiler offers the option to merge FASTQ files of multiple sequencing runs or libraries that derive from the same sample, as specified in the input samplesheet.
This is the last possible preprocessing step, so if you have multiple runs or libraries (and run merging turned on), this will represent the final reads that will go into classification/profiling steps.
<details markdown="1">
<summary>Output files</summary>
- `run_merging/`
- `*.fastq.gz`: Concatenated FASTQ files on a per-sample basis
</details>
Note that you will only find samples that went through the run merging step in this directory. For samples that had a single run or library will not go through this step of the pipeline and thus will not be present in this directory.
This directory and its FASTQ files will only be present if you supply `--save_runmerged_reads`.Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
### Bracken
[Bracken](https://ccb.jhu.edu/software/bracken/) (Bayesian Reestimation of Abundance with Kraken) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample. Braken uses the taxonomy labels assigned by Kraken, a highly accurate metagenomics classification algorithm, to estimate the number of reads originating from each species present in a sample.
> 🛈 The first step of using Bracken requires running Kraken2, therefore the initial results before abundance estimation will be found in `<your_results>/kraken2/<your_bracken_db_name>`.
<details markdown="1">
<summary>Output files</summary>
- `bracken/`
- `bracken_<db_name>_combined_reports.txt`: combined bracken results as output from Bracken's `combine_bracken_outputs.py` script
- `<db_name>/`
- `<sample>_<db_name>.tsv`: TSV file containing per-sample summary of Bracken results with abundance information
</details>
The main taxonomic profiling file from Bracken is the `*.tsv` file. This provides the basic results from Kraken2 but with the corrected abundance information.
### Kraken2
[Kraken](https://ccb.jhu.edu/software/kraken2/) is a taxonomic sequence classifier that assigns taxonomic labels to DNA sequences. Kraken examines the k-mers within a query sequence and uses the information within those k-mers to query a database. That database maps -mers to the lowest common ancestor (LCA) of all genomes known to contain a given k-mer.
<details markdown="1">
<summary>Output files</summary>
- `kraken2/`
- `<db_name>_combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `krakentools`)
- `<db_name>/`
- `<sample_id>_<db_name>.classified.fastq.gz`: FASTQ file containing all reads that had a hit against a reference in the database for a given sample
- `<sample_id>_<db_name>.unclassified.fastq.gz`: FASTQ file containing all reads that did not have a hit in the database for a given sample
- `<sample_id>_<db_name>.report.txt`: A Kraken2 report that summarises the fraction abundance, taxonomic ID, number of Kmers, taxonomic path of all the hits in the Kraken2 run for a given sample
- `<sample_id>_<db_name>.classifiedreads.txt`: A list of read IDs and the hits each read had against each database for a given sample
</details>
The main taxonomic classification file from Kraken2 is the `_combined_reports.txt` or `*report.txt` file. The former provides you the broadest over view of the taxonomic classification results across all samples against a single databse, where you get two columns for each sample e.g. `2_all` and `2_lvl`, as well as a summarised column summing up across all samples `tot_all` and `tot_lvl`. The latter gives you the most information for a single sample. The report file is also used for the taxpasta step.
You will only receive the `.fastq` and `*classifiedreads.txt` file if you supply `--kraken2_save_reads` and/or `--kraken2_save_readclassification` parameters to the pipeline.
### KrakenUniq
[KrakenUniq](https://github.com/fbreitwieser/krakenuniq) (formerly KrakenHLL) is an extenson to the fast k-mer-based classification [Kraken](https://github.com/DerrickWood/kraken) with an efficient algorithm for additionally assessing the coverage of unique k-mers found in each species in a dataset.
<details markdown="1">
<summary>Output files</summary>
- `krakenuniq/`
- `<db_name>/`
- `<sample_id>_<db_name>.classified.fastq.gz`: FASTQ file containing all reads that had a hit against a reference in the database for a given sample
- `<sample_id>_<db_name>.unclassified.fastq.gz`: FASTQ file containing all reads that did not have a hit in the database for a given sample
- `<sample_id>_<db_name>.report.txt`: A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of Kmers, taxonomic path of all the hits, with an additional column for k-mer coverage, that allows for more accurate distinguishing between false-positive/true-postitive hits
- `<sample_id>_<db_name>.classifiedreads.txt`: A list of read IDs and the hits each read had against each database for a given sample
</details>
The main taxonomic classification file from KrakenUniq is the `*report.txt` file. This is an extension of the Kraken2 report with the additional k-mer coverage information that provides more information about the accuracy of hits.
You will only receive the `.fastq` and `*classifiedreads.txt` file if you supply `--krakenuniq_save_reads` and/or `--krakenuniq_save_readclassification` parameters to the pipeline.
> ⚠️ The output system of KrakenUniq can result in other `stdout` or `stderr` logging information being saved in the report file, therefore you must check your report files before downstream use!
### Centrifuge
[Centrifuge](https://github.com/DaehwanKimLab/centrifuge) is a taxonomic sequence classifier that uses a Burrows-Wheeler transform and Ferragina-Manzina index for storing and mapping sequences.
<details markdown="1">
<summary>Output files</summary>
- `centrifuge`
- `<sample_id>.centrifuge.mapped.fastq.gz`: `FASTQ` files containing all mapped reads
- `<sample_id>.centrifuge.report.txt`: A classification report that summarises the taxonomic ID, the taxonomic rank, length of genome sequence, number of classified and uniquely classified reads
- `<sample_id>.centrifuge.results.txt`: A file that summarises the classification assignment for a read, i.e read ID, sequence ID, score for the classification, score for the next best classification, number of classifications for this read
- `<sample_id>.centrifuge.txt`: A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of k-mers, taxonomic path of all the hits in the centrifuge run for a given sample
- `<sample_id>.centrifuge.unmapped.fastq.gz`: FASTQ file containing all unmapped reads
</details>
The main taxonomic classification files from Centrifuge are the `_combined_reports.txt`, `*report.txt`, `*results.txt` and the `*centrifuge.txt`. The latter is used by the taxpasta step. You will receive the `.fastq` files if you supply `--centrifuge_save_reads`.
### Kaiju
[Kaiju](https://github.com/bioinformatics-centre/kaiju) is a taxonomic classifier that finds maximum exact matches on the protein-level using the Burrows-Wheeler transform.
<details markdown="1">
<summary>Output files</summary>
- `kaiju`
- `kaiju_<db_name>_combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by kaiju2table)
- `<db_name>/`
- `<sample_id>_<db_name>.kaiju.tsv`: Raw output from Kaiju with taxonomic rank, read ID and taxonic ID
- `<sample_id>_<db_name>.kaijutable.txt`: Summarised Kaiju output with fraction abundance, taxonomic ID, number of reads, and taxonomic names (as generated by `kaiju2table`)
</details>
The most useful summary file is the `_combined_reports.txt` file which summarises hits across all reads and samples. Separate per-sample versions summaries can be seen in `<db>/*.txt`. However if you wish to look at more precise information on a per-read basis, see the `*tsv` file. The default taxonomic rank is `species`. You can provide a different one by updating the argument `--kaiju_taxon_rank`.
### DIAMOND
[DIAMOND](https://github.com/bbuchfink/diamond) is a sequence aligner for translated DNA searches or protein sequences against a protein reference database such as NR. It is a replacement for the NCBI BLAST software tools.It has many key features and it is used as taxonomic classifier in nf-core/taxprofiler.
<details markdown="1">
<summary>Output files</summary>
- `diamond`
- `<sample_id>.log`: A log file containing stdout information
- `<sample_id>*.{blast,xml,txt,daa,sam,tsv,paf}`: A file containing alignment information in various formats, or taxonomic information in a text-based format. Exact output depends on user choice.
</details>
By default you will receive a TSV output. Alternatively, you will receive a `*.sam` file if you provide the parameter `--diamond_save_reads` but in this case no taxonomic classification will be available(!), only the aligned reads in sam format.
> DIAMOND has many output formats, so depending on your [choice](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options) with ` --diamond_output_format` you will receive the taxonomic information in a different format.
### MALT
[MALT](https://software-ab.cs.uni-tuebingen.de/download/malt) is a fast replacement for BLASTX, BLASTP and BLASTN, and provides both local and semi-global alignment capabilities.
<details markdown="1">
<summary>Output files</summary>
- `malt/`
- `<db_name>/`
- `<sample_id>.blastn.sam`: sparse SAM file containing alignments of each hit
- `<sample_id>.megan`: summary file that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive viewer. Generated by MEGAN6 companion tool `rma2info`
- `<sample_id>.rma6`: binary file containing all alignments and taxonomic information of hits that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive viewer
- `<sample_id>.txt.gz`: text file containing taxonomic IDs and read counts against each taxon. Generated by MEGAN6 companion tool `rma2info`
</details>
The main output of MALT is the `.rma6` file format, which can be only loaded into MEGAN and it's related tools. We provide the `rma2info` text files for improved compatibility with spreadsheet programs and other programmtic data manipulation tools, however this has only limited information compared to the 'binary' RMA6 file format (the `.txt` file only contains taxonomic ID and count, whereas RMA6 has taxonomic lineage information).
You will only receive the `.sam` and `.megan` files if you supply `--malt_save_reads` and/or `--malt_generate_megansummary` parameters to the pipeline.
### MetaPhlAn3
[MetaPhlAn3](https://github.com/biobakery/metaphlan) is a computational tool for profiling the composition of microbial communities (Bacteria, Archaea and Eukaryotes) from metagenomic shotgun sequencing data (i.e. not 16S) with species-level resolution via marker genes.
<details markdown="1">
<summary>Output files</summary>
- `metaphlan3/`
- `metaphlan3_<db_name>_combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `metaphlan_merge_tables`)
- `<db_name>/`
- `<sample_id>.biom`: taxonomic profile in BIOM format
- `<sample_id>.bowtie2out.txt`: BowTie2 alignment information (can be re-used for skipping alignment when re-running MetaPhlAn3 with different parameters)
- `<sample_id>_profile.txt`: MetaPhlAn3 taxonomic profile including abundance estimates
</details>
The main taxonomic profiling file from MetaPhlAn3 is the `*_profile.txt` file. This provides the abundance estimates from MetaPhlAn3 however does not include raw counts by default.
### mOTUs
[mOTUS](https://github.com/motu-tool/mOTUs) is a taxonomic profiler that maps reads to a unique marker specific database and estimates the relative abundance of known and unknown species.
<details markdown="1">
<summary>Output files</summary>
- `motus`
- `<sample_id>.log`: A log file that contains summary statistics
- `<sample_id>.out`: A classification file that summarises taxonomic identifiers, by default at the rank of mOTUs (i.e., species level), and their relative abundances in the profiled sample.
- `motus_<db_name>_combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `motus_merge`)
</details>
Normally `*_combined_reports.txt` is the most useful file for downstream analyses, but the per sample `.out` file can provide additional more specific information. By default, nf-core/taxprofiler is providing a column describing NCBI taxonomic ID as this is used in the taxpasta step. You can disable this column by activating the argument `--motus_remove_ncbi_ids`.
You will receive the relative abundance instead of read counts if you provide the argument `--motus_use_relative_abundance`.
### Krona
[Krona](https://github.com/marbl/Krona) allows the exploration of (metagenomic) hierarchical data with interactive zooming, multi-layered pie charts.
Krona charts will be generated by the pipeline for supported tools (Kraken2, Centrifuge, Kaiju, and MALT)
<details markdown="1">
<summary>Output files</summary>
- `krona/`
- `<tool_name>_<db_name>.html`: per-tool/per-database interactive HTML file containing hierarchical piecharts
</details>
The resulting HTML files can be loaded into your web browser for exploration. Each file will have a dropdown to allow you to switch between each sample aligned against the given database of the tool.
### TAXPASTA
[TAXPASTA](https://github.com/taxprofiler/taxpasta) standardises and merges two or more taxonomic profiles across samples into one single table. It supports multiple different classifiers simplifying comparison of taxonomic classification results between tools and databases.
<details markdown="1">
<summary>Output files</summary>
- `taxpasta`
- `<tool>_<database>*.{tsv,csv,arrow,parquet,biom}`: Standardised taxon table containing multiple samples. The standard format is the `tsv`. The first column describes the taxonomy ID and the rest of the columns describe the read counts for each sample.
</details>
By providing the path to a directory containing taxdump files to `--taxpasta_taxonomy_dir`, the taxon name, the taxon rank, the taxon's entire lineage including taxon names and/or the taxon's entire lineage including taxon identifiers can also be added in the output in addition to just the taxon ID. Addition of this extra information can be turned by using the parameters `--taxpasta_add_name`, `--taxpasta_add_rank`, `--taxpasta_add_lineage` and `--taxpasta_add_idlineage` respectively.
These files will likely be the most useful files for the comparison of differences in classification between different tools or building consensuses, with the caveat they have slightly less information than the actual output from each tool (which may have non-standard information e.g. taxonomic rank, percentage of hits, abundance estimations).
The following report files are used for the taxpasta step:
- Bracken: `<sample>_<db_name>.tsv` Taxpasta used the `new_est_reads` column for the standardised profile.
- Centrifuge: `<sample_id>.centrifuge.txt` Taxpasta uses the `direct_assigned_reads` column for the standardised profile.
- Diamond: `<sample_id>` Taxpasta summarises number of reads per NCBI taxonomy ID standardised profile.
- Kaiju: `<sample_id>_<db_name>.kaijutable.txt` Taxpasta uses the `reads` column from kaiju2table standardised profile.
- KrakenUniq: `<sample_id>_<db_name>.report.txt` Taxpasta uses the `reads` column for the standardised profile.
- Kraken2: `<sample_id>_<db_name>.report.txt` Taxpasta uses the `direct_assigned_reads` column for the standardised profile.
- MALT: `<sample_id>.txt.gz` Taxpasta uses the `count` (second) column from the output of MEGAN6's rma2info for the standardised profile.
- MetaPhlAn3: `<sample_id>_profile.txt` Taxpasta uses the `relative_abundance` column multiplied with a fixed number to yield an integer for the standardised profile.
- mOTUs: `<sample_id>.out` Taxpasta uses the `read_count` column for the standardised profile.
> ⚠️ Please aware the outputs of each tool's standardised profile _may not_ be directly comparable between each tool. Some may report raw read counts, whereas others may report abundance information. Please always refer to the list above, for which information is used for each tool.
### MultiQC
@ -53,6 +526,31 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
All tools in taxprofiler supported by MultiQC will have a dedicated section showing summary statistics of each tool based on information stored in log files.
You can expect in the MultiQC reports either sections and/or general stats columns for the following tools:
- fastqc
- adapterRemoval
- fastp
- bbduk
- prinseqplusplus
- porechop
- filtlong
- bowtie2
- minimap2
- samtools (stats)
- kraken
- bracken
- centrifuge
- kaiju
- metaphlan
- diamond
- malt
- motus
> The 'General Stats' table by default will only show statistics referring to pre-processing steps, and will not display possible values from each classifier/profiler, unless turned on by the user within the 'Configure Columns' menu or via a custom MultiQC config file (`--multiqc_config`)
### Pipeline information
<details markdown="1">

@ -6,62 +6,146 @@
## Introduction
<!-- TODO nf-core: Add documentation about anything specific to running your pipeline. For general topics, please point to (and add to) the main nf-core website. -->
nf-core/taxprofiler is a pipeline for highly-parallelised taxonomic classification and profiling of shotgun metagenomic data across multiple tools simultaneously. In addition to multiple classification and profiling tools, at the same time it allows you to performing taxonomic classification and profiling across multiple databases and settings per tool, as well as produces standardised output tables to allow immediate cross comparison of results between tools.
## Samplesheet input
To run nf-core/taxprofiler, at a minimum two you require two inputs:
You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
- a sequencing read samplesheet
- a database samplesheet
```bash
--input '[path to samplesheet file]'
Both contain metadata and paths to the data of your input samples and databases.
When running nf-core/taxprofiler, every step and tool is 'opt in'. To run a given classifier or profiler you must make sure to supply both a database in your `<database>.csv` and supply `--run_<profiler>` flag to your command. Omitting either will result in the profiling tool not executing.
nf-core/taxprofiler also includes optional pre-processing (adapter clipping, merge running etc.) or post-processing (visualisation) steps. These are also opt in with a `--perform_<step>` flag. In some cases, the pre- and post-processing steps may also require additional files. Please check the parameters tab of this documentation for more information.
Please see the rest of this page for information about how to prepare input samplesheets and databases and how to run Nextflow pipelines. See the [parameters](https://nf-co.re/taxprofiler/parameters) documentation for more information about specific options the pipeline also offers.
## Samplesheet inputs
nf-core/taxprofiler can accept as input raw or preprocessed single- or paired-end short-read (e.g. Illumina) FASTQ files, long-read FASTQ files (e.g. Oxford Nanopore), or FASTA sequences (available for a subset of profilers).
You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below. Furthermother, nf-core/taxprofiler also requires a second comma-separated file of 3 columns with a header row as in the examples below.
This samplesheet is then specified on the command line as follows:
```console
--input '[path to samplesheet file]' --databases '[path to database sheet file]'
```
### Multiple runs of the same sample
The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate different runs FASTQ files of the same sample before performing profiling, when `--perform_runmerging` is supplied. Below is an example for the same sample sequenced across 3 lanes:
```console
sample,fastq_1,fastq_2
CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
2612,run1,ILLUMINA,2612_run1_R1.fq.gz,,
2612,run2,ILLUMINA,2612_run2_R1.fq.gz,,
2612,run3,ILLUMINA,2612_run3_R1.fq.gz,2612_run3_R2.fq.gz,
```
> ⚠️ Runs of the same sample sequenced on Illumina platforms with a combination of single and paired-end data will **not** be run-wise concatenated, unless pair-merging is specified. In the example above, `run3` will be profiled independently of `run1` and `run2` if pairs are not merged.
### Full samplesheet
The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 6 columns to match those defined in the table below.
A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
A final samplesheet file consisting of both single- and paired-end data, as well as long-read FASTA files may look something like the one below. This is for 6 samples, where `2612` has been sequenced twice.
```console
sample,fastq_1,fastq_2
CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
```
| Column | Description |
| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
2611,ERR5766174,ILLUMINA,,,/<path>/<to>/fasta/ERX5474930_ERR5766174_1.fa.gz
2612,ERR5766176,ILLUMINA,/<path>/<to>/fastq/ERX5474932_ERR5766176_1.fastq.gz,/<path>/<to>/fastq/ERX5474932_ERR5766176_2.fastq.gz,
2612,ERR5766180,ILLUMINA,/<path>/<to>/fastq/ERX5474936_ERR5766180_1.fastq.gz,,
2613,ERR5766181,ILLUMINA,/<path>/<to>/fastq/ERX5474937_ERR5766181_1.fastq.gz,/<path>/<to>/fastq/ERX5474937_ERR5766181_2.fastq.gz,
ERR3201952,ERR3201952,OXFORD_NANOPORE,/<path>/<to>/fastq/ERR3201952.fastq.gz,,
```
> ⚠️ Input FASTQ and FASTA files _must_ be gzipped
> ⚠️ While one can include both short-read and long-read data in one run, we recommend that you split these across _two_ pipeline runs and database sheets (see below). This will allow classification optimisation for each data type, and make MultiQC run-reports more readable (due to run statistics having vary large number differences).
| Column | Description |
| --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `sample` | Unique sample name [required]. |
| `run_accession` | Run ID or name unique for each (pairs of) file(s) .Can also supply sample name again here, if only a single run was generated [required]. |
| `instrument_platform` | Sequencing platform reads generated on, selected from the EBI ENA [controlled vocabulary](https://www.ebi.ac.uk/ena/portal/api/controlledVocab?field=instrument_platform) [required]. |
| `fastq_1` | Path or URL to sequencing reads or for Illumina R1 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if data in FASTA is specifed. Cannot be combined with `fasta`. |
| `fastq_2` | Path or URL to Illumina R2 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if single end data. Cannot be combined with `fasta`. |
| `fasta` | Path or URL to long-reads or contigs in FASTA format. GZipped compressed files accepted. Can be left empty if data in FASTA is specifed. Cannot be combined with `fastq_1` or `fastq_2`. |
An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
### Full database sheet
nf-core/taxprofiler supports multiple databases being classified/profiled against in parallel for each tool.
Databases can be supplied either in the form of a compressed `.tar.gz` archive of a directory containing all relevant database files or the path to a directory on the filesystem.
> ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user. See below for more information of the expected database files.
The pipeline takes the paths and specific classification/profiling parameters of the tool of these databases as input via a four column comma-separated sheet.
> ⚠️ To allow user freedom, nf-core/taxprofiler does not check for mandatory or the validity of non-file database parameters for correct execution of the tool - excluding options offered via pipeline level parameters! Please validate your database parameters (cross-referencing [parameters](https://nf-co.re/taxprofiler/parameters, and the given tool documentation) before submitting the database sheet! For example, if you don't use the default read length - Bracken will require `-r <read_length>` in the `db_params` column.
An example database sheet can look as follows, where 7 tools are being used, and `malt` and `kraken2` will be used against two databases each.
`kraken2` will be run twice even though only having a single 'dedicated' database because specifying `bracken` implies first running `kraken2` on the `bracken` database, as required by `bracken`.
```console
tool,db_name,db_params,db_path
malt,malt85,-id 85,/<path>/<to>/malt/testdb-malt/
malt,malt95,-id 90,/<path>/<to>/malt/testdb-malt.tar.gz
bracken,db1,;-r 150,/<path>/<to>/bracken/testdb-bracken.tar.gz
kraken2,db2,--quick,/<path>/<to>/kraken2/testdb-kraken2.tar.gz
krakenuniq,db3,,/<path>/<to>/krakenuniq/testdb-krakenuniq.tar.gz
centrifuge,db1,,/<path>/<to>/centrifuge/minigut_cf.tar.gz
metaphlan3,db1,,/<path>/<to>/metaphlan3/metaphlan_database/
motus,db_mOTU,,/<path>/<to>/motus/motus_database/
```
For Bracken, if you wish to supply any parameters to either the Kraken or Bracken step you **must** have a _semi-colon_ `;` list as in `db_params`. This is to allow to specify the Kraken2 parameters before, and Bracken parameters after the `;` as Bracken is a two step process. This is particularly important if you supply a Bracken database with a non-default read length parameter. If you do not have any parameters to specify, you can leave this as empty.
Column specifications are as follows:
| Column | Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tool` | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. Please note that `bracken` also implies running `kraken2` on the same database. |
| `db_name` | A unique name per tool for the particular database [required]. Please note that names need to be unique across both `kraken2` and `bracken` as well, even if re-using the same database. |
| `db_params` | Any parameters of the given taxonomic classifier/profiler that you wish to specify that the taxonomic classifier/profiling tool should use when profiling against this specific database. Can be empty to use taxonomic classifier/profiler defaults. Must not be surrounded by quotes [required]. We generally do not recommend specifying parameters here that turn on/off saving of output files or specifying particular file extensions - this should be already addressed via pipeline parameters. For Bracken databases, must at a minimum contain a `;` separating Kraken2 from Bracken parameters. |
| `db_path` | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required]. |
> 💡 You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during classification/profiling.
nf-core/taxprofiler will automatically decompress and extract any compressed archives for you.
The (uncompressed) database paths (`db_path`) for each tool are expected to contain the contents of:
- [**Bracken**:](#bracken-custom-database) output of the combined `kraken2-build` and `bracken-build` process.
- [**Centrifuge**:](#centrifuge-custom-database) output of `centrifuge-build`.
- [**DIAMOND**:](#diamond-custom-database) output of `diamond makedb`.
- [**Kaiju**:](#kaiju-custom-database) output of `kaiju-makedb`.
- [**Kraken2**:](#kraken2-custom-database) output of `kraken2-build` command(s).
- [**KrakenUniq**:](#krakenuniq-custom-database) output of `krakenuniq-build` command(s).
- [**MALT**](#malt-custom-database) output of `malt-build`.
- [**MetaPhlAn3**:](#metaphlan3-custom-database) output of with `metaphlan --install` or downloaded from links on the [MetaPhlAn3 wiki](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-3.0#customizing-the-database).
- [**mOTUs**:](#motus-custom-database) is composed of code and database together.
Click the links in the list above for short quick-reference tutorials how to generate custom databases for each tool.
## Running the pipeline
The typical command for running the pipeline is as follows:
```bash
nextflow run nf-core/taxprofiler --input samplesheet.csv --outdir <OUTDIR> --genome GRCh37 -profile docker
```console
nextflow run nf-core/taxprofiler --input samplesheet.csv --databases databases.csv --outdir <OUTDIR> -profile docker --run_<TOOL1> --run_<TOOL2>
```
This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
When running nf-core/taxprofiler, every step and tool is 'opt in'. To run a given classifier/profiler you must make sure to supply both a database in your `<database>.csv` and supply `--run_<profiler>` flag to your command. Omitting either will result in the classification/profiling tool not executing. If you wish to perform pre-processing (adapter clipping, merge running etc.) or post-processing (visualisation) steps, these are also opt in with a `--perform_<step>` flag. In some cases, the pre- and post-processing steps may also require additional files. Please check the parameters tab of this documentation for more information.
Note that the pipeline will create the following files in your working directory:
```bash
@ -94,6 +178,164 @@ input: 'data'
You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).
### Sequencing quality control
[`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. nf-core taxprofiler offers [`falco`](https://github.com/smithlabcode/falco) as an drop-in replacement, with supposedly better improvement particularly for long reads.
### Preprocessing Steps
nf-core/taxprofiler offers four main preprocessing steps for preprocessing raw sequencing reads:
- [**Read processing**](#read-processing): adapter clipping and pair-merging.
- [**Complexity filtering**](#complexity-filtering): removal of low-sequence complexity reads.
- [**Host read-removal**](#host-read-removal): removal of reads aligning to reference genome(s) of a host.
- [**Run merging**](#run-merging): concatenation of multiple FASTQ chunks/sequencing runs/libraries of a sample.
> You can save the 'final' reads used for classification/profiling from any combination of these steps with `--save_analysis_ready_reads`.
#### Read Processing
Raw sequencing read processing in the form of adapter clipping and paired-end read merging can be activated via the `--perform_shortread_qc` or `--perform_longread_qc` flags.
It is highly recommended to run this on raw reads to remove artifacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles. If you have public data, normally these should have been corrected for, however you should still check that these steps have indeed been already performed.
There are currently two options for short-read preprocessing: [`fastp`](https://github.com/OpenGene/fastp) or [`adapterremoval`](https://github.com/MikkelSchubert/adapterremoval).
For adapter clipping, you can either rely on the tool's default adapter sequences, or supply your own adapters (`--shortread_qc_adapter1` and `--shortread_qc_adapter2`)
By default, paired-end merging is not activated. In this case paired-end 'alignment' against the reference databases is performed where supported, and if not, supported pairs will be independently classified/profiled. If paired-end merging is activated you can also specify whether to include unmerged reads in the reads sent for classification/profiling (`--shortread_qc_mergepairs` and `--shortread_qc_includeunmerged`).
You can also turn off clipping and only perform paired-end merging, if requested. This can be useful when processing data downloaded from the ENA, SRA, or DDBJ (`--shortread_qc_skipadaptertrim`).
Both tools support length filtering of reads and can be tuned with `--shortread_qc_minlength`. Performing length filtering can be useful to remove short (often low sequencing complexity) sequences that result in unspecific classification and therefore slow down runtime during classification/profiling, with minimal gain.
There is currently one option for long-read Oxford Nanopore processing: [`porechop`](https://github.com/rrwick/Porechop).
For both short-read and long-read preprocessing, you can optionally save the resulting processed reads with `--save_preprocessed_reads`.
#### Complexity Filtering
Complexity filtering can be activated via the `--perform_shortread_complexityfilter` flag.
Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic classification/profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources.
There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter).
There is one option for long-read quality filtering: [`Filtlong`](https://github.com/rrwick/Filtlong)
The tools offer different algorithms and parameters for removing low complexity reads and quality filtering. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of the tools (see links above) to decide on optimal methods and parameters for your dataset.
You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`. If running with `fastp`, complexity filtering happens inclusively within the earlier shortread preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity filtered reads will also be in the `fastp/` folder in the same file(s) as the preprocessed read.
> ⚠️ For nanopore data: we do not recommend performing any read preprocessing or complexity filtering if you are using ONTs Guppy toolkit for basecalling and post-processing.
#### Host-Read Removal
Removal of possible-host reads from FASTQ files prior classification/profiling can be activated with `--perform_shortread_hostremoval` or `--perform_longread_hostremoval`.
Similarly to complexity filtering, host-removal can be useful for runtime optimisation and reduction in misclassified reads. It is not always necessary to report classification of reads from a host when you already know the host of the sample, therefore you can gain a run-time and computational advantage by removing these prior typically resource-heavy classification/profiling with more efficient methods. Furthermore, particularly with human samples, you can reduce the number of false positives during classification/profiling that occur due to host-sequence contamination in reference genomes on public databases.
nf-core/taxprofiler currently offers host-removal via alignment against a reference genome with Bowtie2 for short reads and minimap2 for long reads, and the use of the unaligned reads for downstream classification/profiling.
You can supply your reference genome in FASTA format with `--hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index` or a minimap2 `.mmi` file for `--longread_hostremoval_index`, however nf-core/taxprofiler will generate these for you if necessary. Pre-supplying the index directory or files can greatly speed up the process, and these can be re-used.
> 💡 If you have multiple taxa or sequences you wish to remove (e.g., the host genome and then also PhiX - common quality-control reagent during sequencing) you can simply concatenate the FASTAs of each taxa or sequences into a single reference file.
#### Run Merging
For samples that may have been sequenced over multiple runs, or for FASTQ files split into multiple chunks, you can activate the ability to merge across all runs or chunks with `--perform_runmerging`.
For more information how to set up your input samplesheet, see [Multiple runs of the same sample](#multiple-runs-of-the-same-sample).
Activating this functionality will concatenate the FASTQ files with the same sample name _after_ the optional preprocessing steps and _before_ classification/profiling. Note that libraries with runs of different pairing types will **not** be merged and this will be indicated on output files with a `_se` or `_pe` suffix to the sample name accordingly.
You can optionally save the FASTQ output of the run merging with the `--save_runmerged_reads`.
#### Classification and Profiling
The following sections provide tips and suggestions for running the different taxonomic classification and profiling tools _within the pipeline_. For advice and/or guidance whether you should run a particular tool on your specific data, please see the documentation of each tool!
An important distinction between the different tools in included in the pipeline is classification versus profiling. Taxonomic _classification_ is concerned with simply detecting the presence of species in a given sample. Taxonomic _profiling_ involves additionally estimating the _abundance_ of each species.
Note that not all taxonomic classification tools (e.g. Kraken, MALT, Kaiju) performs _profiling_, but all taxonomic profilers (e.g. MetaPhlAn, mOTUs, Bracken) must perform some form of _classification_ prior to profiling.
For advice as to which tool to run in your context, please see the documentation of each tool.
> 🖊️ If you would like to change this behaviour, please contact us on the [nf-core slack](https://nf-co.re/join) and we can discuss this.
Not all tools currently have dedicated tips, suggestions and/or recommendations, however we welcome further contributions for existing and additional tools via pull requests to the [nf-core/taxprofiler repository](https://github.com/nf-core/taxprofiler)!
##### Bracken
You must make sure to also activate Kraken2 to run Bracken in the pipeline.
It is unclear whether Bracken is suitable for running long reads, as it makes certain assumptions about read lengths. Furthemore, during testing we found issues where Bracken would fail on the long-read test data.
Therefore currently nf-core/taxprofiler does not run Bracken on data specified as being sequenced with `OXFORD_NANOPORE` in the input samplesheet.
##### Centrifuge
Centrifuge currently does not accept FASTA files as input, therefore no output will be produced for these input files.
##### DIAMOND
DIAMOND only allows output of a single file format at a time, therefore parameters such `--diamond_save_reads` supplied will result in only aligned reads in SAM format will be produced, no taxonomic profiles will be available. Be aware of this when setting up your pipeline runs, depending on your particular use case.
##### Kaiju
Currently, no specific tips or suggestions.
##### Kraken2
Currently, no specific tips or suggestions.
##### KrakenUniq
Currently, no specific tips or suggestions.
##### MALT
MALT does not support paired-end reads alignment (unlike other tools), therefore nf-core/taxprofiler aligns these as indepenent files if read-merging is skipped. If you skip merging, you can sum or average the results of the counts of the pairs.
Krona can only be run on MALT output if path to Krona taxonomy database supplied to `--krona_taxonomy_directory`. Therefore if you do not supply the a Krona directory, Krona plots will not be produced for MALT.
##### MetaPhlAn3
MetaPhlAn3 currently does not accept FASTA files as input, therefore no output will be produced for these input files.
##### mOTUs
mOTUs currently does not accept FASTA files as input, therefore no output will be produced for these input files.
#### Post Processing
##### Visualisation
nf-core/taxprofiler supports generation of Krona interactive pie chart plots for the following compatible tools.
- Kraken2
- Centrifuge
- Kaiju
- MALT
> ⚠️ MALT KRONA plots cannot be generated automatically, you must also specify a Krona taxonomy directory with `--krona_taxonomy_directory` if you wish to generate these.
##### Multi-Table Generation
The main multiple-sample table from nf-core/taxprofiler is from a dedicated standalone tool originally developed for the pipeline - [Taxpasta](https://taxpasta.readthedocs.io/en/latest/). When providing `--run_profile_standardisation`, every classifier/profiler and database combination will get a standardised and multi-sample taxon table in the [`taxpasta/`](https://nf-co.re/taxprofiler/output) directory. These tables are structured in the same way, to facilitate comparison between the the results of the classifier/profiler
In addition to per-sample profiles and standardised Taxpasta output, the pipeline also supports generation of 'native' multi-sample taxonomic profiles (i.e., those generated by the taxonomic profiling tools themselves or additional utility scripts provided by the tool authors), when providing `--run_profile_standardisation` to your pipeline.
These are executed on a per-database level. I.e., you will get a multi-sample taxon table for each database you provide for each tool and will be placed in the same directory as the directories containing the per-sample profiles.
The following tools will produce multi-sample taxon tables:
- **Bracken** (via bracken's `combine_bracken_outputs.py` script)
- **Centrifuge** (via KrakenTools' `combine_kreports.py` script)
- **Kaiju** (via Kaiju's `kaiju2table` tool)
- **Kraken2** (via KrakenTools' `combine_kreports.py` script)
- **MetaPhlAn3** (via MetaPhlAn's `merge_metaphlan_tables.py` script)
- **mOTUs** (via the `motus merge` command)
Note that the multi-sample tables from the 'native' tools in each folders are [not inter-operable](https://taxpasta.readthedocs.io/en/latest/tutorials/getting-started/) with each other as they can have different formats and can contain additional and different data. In this case we refer you to use the standardised and merged output from Taxpasta, as described above.
### Updating the pipeline
When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:
@ -214,3 +456,312 @@ We recommend adding the following line to your environment to limit this (typica
```bash
NXF_OPTS='-Xms1g -Xmx4g'
```
## Tutorials
### Retrieving databases or building custom databases
Not all taxonomic profilers provide ready-made or default databases. Here we will give brief guidance on how to build custom databases for each supported taxonomic profiler.
You should always consult the documentation of each tool for more information, as here we only provide short minimal-tutorials as quick reference guides (with no guarantee they are up to date).
The following tutorials assumes you already have the tool available (e.g. installed locally, or via conda, docker etc.), and you have already downloaded the FASTA files you wish to build into a database.
#### Bracken custom database
Bracken does not require an independent database nor not provide any default databases for classification/profiling, but rather builds upon Kraken2 databases. See [Kraken2](#kraken2-custom-database) for more information on how to build these.
In addition to a Kraken2 database, you also need to have the (average) read lengths (in bp) of your sequencing experiment, the K-mer size used to build the Kraken2 database, and Kraken2 available on your machine.
```bash
bracken-build -d <KRAKEN_DB_DIR> -k <KRAKEN_DB_KMER_LENGTH> -l <READLENGTH>
```
> 🛈 You can speed up database construction by supplying the threads parameter (`-t`).
> 🛈 If you do not have Kraken2 in your `$PATH` you can point to the binary with `-x /<path>/<to>/kraken2`.
<details markdown="1">
<summary>Expected files in database directory</summary>
- `bracken`
- `hash.k2d`
- `opts.k2d`
- `taxo.k2d`
- `database.kraken`
- `database100mers.kmer_distrib`
- `database100mers.kraken`
- `database150mers.kmer_distrib`
- `database150mers.kraken`
</details>
You can follow Bracken [tutorial](https://ccb.jhu.edu/software/bracken/index.shtml?t=manual) for more information.
#### Centrifuge custom database
To build a custom Centrifuge database, a user needs to download taxonomy files, make a custom `seqid2taxid.map` and combine the fasta files together.
In total, you need four components: a tab-separated file mapping sequence IDs to taxonomy IDs (`--conversion-table`), a tab-separated file mapping taxonomy IDs to their parents and rank, up to the root of the tree (`--taxonomy-tree`), a pipe-separated file mapping taxonomy IDs to a name (`--name-table`), and the reference sequences.
An example of custom `seqid2taxid.map`:
```
NC_001133.9 4392
NC_012920.1 9606
NC_001134.8 4392
NC_001135.5 4392
```
```bash
centrifuge-download -o taxonomy taxonomy
cat *.{fa,fna} > input-sequences.fna
centrifuge-build -p 4 --conversion-table seqid2taxid.map --taxonomy-tree taxonomy/nodes.dmp --name-table taxonomy/names.dmp input-sequences.fna taxprofiler_cf
```
<details markdown="1">
<summary>Expected files in database directory</summary>
- `centrifuge`
- `<database_name>.<number>.cf`
- `<database_name>.<number>.cf`
- `<database_name>.<number>.cf`
- `<database_name>.<number>.cf`
</details>
For the Centrifuge custom database documentation, see [here](https://ccb.jhu.edu/software/centrifuge/manual.shtml#custom-database).
#### DIAMOND custom database
To create a custom database for DIAMOND, the user should download and unzip the NCBI's taxonomy files and the input FASTA files.
The download and build steps are as follows:
```bash
wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip
unzip taxdmp.zip
## warning: large file!
wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.FULL.gz
## warning: takes a long time!
cat ../raw/*.faa | diamond makedb -d testdb-diamond --taxonmap prot.accession2taxid.FULL.gz --taxonnodes nodes.dmp --taxonnames names.dmp
## clean up
rm *dmp *txt *gz *prt *zip
```
<details markdown="1">
<summary>Expected files in database directory</summary>
- `diamond`
- `<database_name>.dmnd`
</details>
A detailed description can be found [here](https://github.com/bbuchfink/diamond/wiki/1.-Tutorial)
#### Kaiju custom database
To build a kaiju database, you need three components: a FASTA file with the protein sequences ,the NCBI taxonomy dump files, and you need to define the uppercase characters of the standard 20 amino acids you wish to include.
> ⚠️ The headers of the protein fasta file must be numeric NCBI taxon identifiers of the protein sequences.
To download the NCBI taxonomy files, please run the following commands:
```bash
wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.zip
unzip new_taxdump.zip
```
To build the database, run the following command (the contents of taxdump must be in the same location where you run the command):
```bash
kaiju-mkbwt -a ACDEFGHIKLMNPQRSTVWY -o proteins proteins.faa
kaiju-mkfmi proteins
```
> 🛈 You can speed up database construction by supplying the threads parameter (`-t`).
<details markdown="1">
<summary>Expected files in database directory</summary>
- `kaiju`
- `kaiju_db_*.fmi`
- `nodes.dmp`
- `names.dmp`
</details>
For the Kaiju database construction documentation, see [here](https://github.com/bioinformatics-centre/kaiju#custom-database).
#### Kraken2 custom database
To build a Kraken2 database you need two components: a taxonomy (consisting of `names.dmp`, `nodes.dmp`, and `*accession2taxid`) files, and the FASTA files you wish to include.
To pull the NCBI taxonomy, you can run the following:
```bash
kraken2-build --download-taxonomy --db <YOUR_DB_NAME>
```
You can then add your FASTA files with the following build command.
```bash
kraken2-build --add-to-library *.fna --db <YOUR_DB_NAME>
```
You can repeat this step multiple times to iteratively add more genomes prior building.
Once all genomes are added to the library, you can build the database (and optionally clean it up):
```bash
kraken2-build --build --db <YOUR_DB_NAME>
kraken2-build --clean --db <YOUR_DB_NAME>
```
You can then add the `<YOUR_DB_NAME>/` path to your nf-core/taxprofiler database input sheet.
<details markdown="1">
<summary>Expected files in database directory</summary>
- `kraken2`
- `opts.k2d`
- `hash.k2d`
- `taxo.k2d`
</details>
You can follow the Kraken2 [tutorial](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#custom-databases) for a more detailed description.
#### KrakenUniq custom database
For any KrakenUniq database, you require: taxonomy files, the FASTA files you wish to include, a `seqid2mapid` file, and a k-mer length.
First you must make a `seqid2taxid.map` file which is a two column text file containing the FASTA sequence header and the NCBI taxonomy ID for each sequence:
```
MT192765.1 2697049
```
Then make a directory (`<DB_DIR_NAME>/`), containing the `seqid2taxid.map` file, and your FASTA files in a subdirectory called `library/` (these FASTA files can be symlinked). You must then run the `taxonomy` command on the `<DB_DIR_NAME>/` directory, and then build it.
```bash
mkdir -p <DB_DIR_NAME>/library
mv `seqid2taxid.map` <DB_DIR_NAME>/
mv *.fna <DB_DIR_NAME>/library
krakenuniq-download --db <DB_DIR_NAME> taxonomy
krakenuniq-build --db <DB_DIR_NAME> --kmer-len 31
```
> 🛈 You can speed up database construction by supplying the threads parameter (`--threads`) to `krakenuniq-build`.
<details markdown="1">
<summary>Expected files in database directory</summary>
- `krakenuniq`
- `opts.k2d`
- `hash.k2d`
- `taxo.k2d`
- `database.idx`
- `taxDB`
</details>
Please see the [KrakenUniq documentation](https://github.com/fbreitwieser/krakenuniq#database-building) for more information.
#### MALT custom database
To build a MALT database, you need the FASTA files to include, and an (unzipped) [MEGAN mapping 'db' file](https://software-ab.informatik.uni-tuebingen.de/download/megan6/) for your FASTA type. In addition to the input directory, output directory, and the mapping file database, you also need to specify the sequence type (DNA or Protein) with the `-s` flag.
```bash
malt-build -i <path>/<to>/<fasta>/*.{fna,fa,fasta} -a2t <path>/<to>/<map>.db -d <YOUR_DB_NAME>/ -s DNA
```
You can then add the `<YOUR_DB_NAME>/` path to your nf-core/taxprofiler database input sheet.
⚠️ MALT generates very large database files and requires large amounts of RAM. You can reduce both by increasing the step size `-st` (with a reduction in sensitivity).
> 🛈 MALT-build can be multi-threaded with `-t` to speed up building.
<details markdown="1">
<summary>Expected files in database directory</summary>
- `malt`
- `ref.idx`
- `taxonomy.idx`
- `taxonomy.map`
- `index0.idx`
- `table0.idx`
- `table0.db`
- `ref.inf`
- `ref.db`
- `taxonomy.tre`
</details>
See the [MALT manual](https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf) for more information.
#### MetaPhlAn3 custom database
MetaPhlAn3 does not allow (easy) construction of custom databases. Therefore we recommend to use the prebuilt database of marker genes that is provided by the developers.
To do this you need to have `MetaPhlAn3` installed on your machine.
```bash
metaphlan --install --bowtie2db <YOUR_DB_NAME>/
```
You can then add the `<YOUR_DB_NAME>/` path to your nf-core/taxprofiler database input sheet.
> 🛈 It is generally not recommended to modify this database yourself, thus this is currently not supported in the pipeline. However, it is possible to customise the existing database by adding your own marker genomes following the instructions [here](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-3.1#customizing-the-database).
> 🖊️ If using your own database is relevant for you, please contact the nf-core/taxprofiler developers on the [nf-core slack](https://nf-co.re/join) and we will investigate supporting this.
<details markdown="1">
<summary>Expected files in database directory</summary>
- `metaphlan3`
- `mpa_v30_CHOCOPhlAn_201901.pkl`
- `mpa_v30_CHOCOPhlAn_201901.pkl`
- `mpa_v30_CHOCOPhlAn_201901.fasta`
- `mpa_v30_CHOCOPhlAn_201901.3.bt2`
- `mpa_v30_CHOCOPhlAn_201901.4.bt2`
- `mpa_v30_CHOCOPhlAn_201901.1.bt2`
- `mpa_v30_CHOCOPhlAn_201901.2.bt2`
- `mpa_v30_CHOCOPhlAn_201901.rev.1.bt2`
- `mpa_v30_CHOCOPhlAn_201901.rev.2.bt2`
- `mpa_latest`
</details>
More information on the MetaPhlAn3 database can be found [here](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-3.1#installation).
#### mOTUs custom database
mOTUs does not provide the ability to construct custom databases. Therefore we recommend to use the the prebuilt database of marker genes provided by the developers.
> ⚠️ **Do not change the directory name of the resulting database if moving to a central location** The database name of `db_mOTU/` is hardcoded in the mOTUs tool
To do this you need to have `mOTUs` installed on your machine.
```bash
motus downloadDB
```
Then supply the `db_mOTU/` path to your nf-core/taxprofiler database input sheet.
> ⚠️ The `db_mOTU/` directory may be downloaded to somewhere in your Python's `site-package` directory. You will have to find this yourself as the exact location varies depends on installation method.
More information on the mOTUs database can be found [here](https://motu-tool.org/installation.html).
## Troubleshooting and FAQs
### I get a warning during centrifuge_kreport process with exit status 255
When a sample has insufficient hits for abundance estimation, the resulting `report.txt` file will be empty.
When trying to convert this to a kraken-style report, the conversion tool will exit with a status code `255`, and provide a `WARN`.
This is **not** an error nor a failure of the pipeline, just your sample has no hits to the provided database when using centrifuge.

@ -14,9 +14,9 @@ class WorkflowMain {
// TODO nf-core: Add Zenodo DOI for pipeline after first release
//"* The pipeline\n" +
//" https://doi.org/10.5281/zenodo.XXXXXXX\n\n" +
"* The nf-core framework\n" +
" https://doi.org/10.1038/s41587-020-0439-x\n\n" +
"* Software dependencies\n" +
'* The nf-core framework\n' +
' https://doi.org/10.1038/s41587-020-0439-x\n\n' +
'* Software dependencies\n' +
" https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md"
}
@ -97,4 +97,5 @@ class WorkflowMain {
}
return null
}
}

@ -13,7 +13,6 @@ class WorkflowTaxprofiler {
public static void initialise(params, log) {
genomeExistsError(params, log)
if (!params.fasta) {
Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file."
}

@ -5,19 +5,216 @@
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
"adapterremoval": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"bbmap/bbduk": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"bowtie2/align": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"bowtie2/build": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"bracken/bracken": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"bracken/combinebrackenoutputs": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"cat/fastq": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"centrifuge/centrifuge": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"centrifuge/kreport": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"custom/dumpsoftwareversions": {
"branch": "master",
"git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543",
"installed_by": ["modules"]
},
"diamond/blastx": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"falco": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"],
"patch": "modules/nf-core/falco/falco.diff"
},
"fastp": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"fastqc": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"filtlong": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"gunzip": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"kaiju/kaiju": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"kaiju/kaiju2krona": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"kaiju/kaiju2table": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"kraken2/kraken2": {
"branch": "master",
"git_sha": "7c695e0147df1157413e06246d9b0094617d3e6b",
"installed_by": ["modules"]
},
"krakentools/combinekreports": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"krakentools/kreport2krona": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"krakenuniq/preloadedkrakenuniq": {
"branch": "master",
"git_sha": "a6eb17f65b3ee5761c25c075a6166c9f76733cee",
"installed_by": ["modules"]
},
"krona/ktimporttaxonomy": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"krona/ktimporttext": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"malt/run": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"megan/rma2info": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"metaphlan3/mergemetaphlantables": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"metaphlan3/metaphlan3": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"minimap2/align": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"minimap2/index": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"motus/merge": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"motus/profile": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"multiqc": {
"branch": "master",
"git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7",
"git_sha": "ee80d14721e76e2e079103b8dcd5d57129e584ba",
"installed_by": ["modules"]
},
"porechop/porechop": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"],
"patch": "modules/nf-core/porechop/porechop/porechop-porechop.diff"
},
"prinseqplusplus": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"samtools/fastq": {
"branch": "master",
"git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",
"installed_by": ["modules"]
},
"samtools/index": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"samtools/stats": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"samtools/view": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"taxpasta/merge": {
"branch": "master",
"git_sha": "ffa9641ee18f88aff974257cb50ba3cf8f7d143c",
"installed_by": ["modules"]
},
"untar": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
}
}

@ -0,0 +1,32 @@
process KRAKEN2_STANDARD_REPORT {
tag "$meta.id"
label 'process_single'
conda "conda-forge::sed=4.7"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
'ubuntu:20.04' }"
input:
tuple val(meta), path(report)
output:
tuple val(meta), path(result), emit: report
path 'versions.yml' , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def prefix = task.ext.prefix ?: "${meta.id}"
result = "${prefix}_standardized.kraken2.report.txt"
"""
cut -f1-3,6-8 '${report}' > '${result}'
cat <<-END_VERSIONS > versions.yml
"${task.process}":
cut: \$(echo \$(cut --version 2>&1) | sed 's/^.*(GNU coreutils) //; s/ Copyright.*\$//')
END_VERSIONS
"""
}

@ -0,0 +1,40 @@
process KRONA_CLEANUP {
tag "$meta.id"
label 'process_low'
conda "conda-forge::sed=4.7"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
'ubuntu:20.04' }"
input:
tuple val(meta), path(krona, stageAs: 'uncleaned.krona.txt')
output:
tuple val(meta), path("*.txt"), emit: txt
path "versions.yml", emit: versions
when:
task.ext.when == null || task.ext.when
script:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
# Copy the file to a new name
cp ${krona} ${prefix}.txt
# Remove ugly 'x__' prefixes for each of the taxonomic levels
LEVELS=(d k p c o f g s)
for L in "\${LEVELS[@]}"; do
sed -i "s/\${L}__//g" ${prefix}.txt
done
# Remove underscores that are standing in place of spaces
sed -i "s/_/ /g" ${prefix}.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//')
END_VERSIONS
"""
}

@ -0,0 +1,92 @@
process ADAPTERREMOVAL {
tag "$meta.id"
label 'process_medium'
conda "bioconda::adapterremoval=2.3.2"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/adapterremoval:2.3.2--hb7ba0dd_0' :
'quay.io/biocontainers/adapterremoval:2.3.2--hb7ba0dd_0' }"
input:
tuple val(meta), path(reads)
path(adapterlist)
output:
tuple val(meta), path("${prefix}.truncated.fastq.gz") , optional: true, emit: singles_truncated
tuple val(meta), path("${prefix}.discarded.fastq.gz") , optional: true, emit: discarded
tuple val(meta), path("${prefix}.pair{1,2}.truncated.fastq.gz") , optional: true, emit: paired_truncated
tuple val(meta), path("${prefix}.collapsed.fastq.gz") , optional: true, emit: collapsed
tuple val(meta), path("${prefix}.collapsed.truncated.fastq.gz") , optional: true, emit: collapsed_truncated
tuple val(meta), path("${prefix}.paired.fastq.gz") , optional: true, emit: paired_interleaved
tuple val(meta), path('*.settings') , emit: settings
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def list = adapterlist ? "--adapter-list ${adapterlist}" : ""
prefix = task.ext.prefix ?: "${meta.id}"
if (meta.single_end) {
"""
AdapterRemoval \\
--file1 $reads \\
$args \\
$list \\
--basename ${prefix} \\
--threads ${task.cpus} \\
--seed 42 \\
--gzip
ensure_fastq() {
if [ -f "\${1}" ]; then
mv "\${1}" "\${1::-3}.fastq.gz"
fi
}
ensure_fastq '${prefix}.truncated.gz'
ensure_fastq '${prefix}.discarded.gz'
cat <<-END_VERSIONS > versions.yml
"${task.process}":
adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g")
END_VERSIONS
"""
} else {
"""
AdapterRemoval \\
--file1 ${reads[0]} \\
--file2 ${reads[1]} \\
$args \\
$list \\
--basename ${prefix} \\
--threads $task.cpus \\
--seed 42 \\
--gzip
ensure_fastq() {
if [ -f "\${1}" ]; then
mv "\${1}" "\${1::-3}.fastq.gz"
fi
}
ensure_fastq '${prefix}.truncated.gz'
ensure_fastq '${prefix}.discarded.gz'
ensure_fastq '${prefix}.pair1.truncated.gz'
ensure_fastq '${prefix}.pair2.truncated.gz'
ensure_fastq '${prefix}.collapsed.gz'
ensure_fastq '${prefix}.collapsed.truncated.gz'
ensure_fastq '${prefix}.paired.gz'
cat <<-END_VERSIONS > versions.yml
"${task.process}":
adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g")
END_VERSIONS
"""
}
}

@ -0,0 +1,90 @@
name: adapterremoval
description: Trim sequencing adapters and collapse overlapping reads
keywords:
- trimming
- adapters
- merging
- fastq
tools:
- adapterremoval:
description: The AdapterRemoval v2 tool for merging and clipping reads.
homepage: https://github.com/MikkelSchubert/adapterremoval
documentation: https://adapterremoval.readthedocs.io
licence: ["GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input FastQ files of size 1 and 2 for single-end and paired-end data,
respectively.
pattern: "*.{fq,fastq,fq.gz,fastq.gz}"
- adapterlist:
type: file
description: Optional text file containing list of adapters to look for for removal
with one adapter per line. Otherwise will look for default adapters (see
AdapterRemoval man page), or can be modified to remove user-specified
adapters via ext.args.
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- singles_truncated:
type: file
description: |
Adapter trimmed FastQ files of either single-end reads, or singleton
'orphaned' reads from merging of paired-end data (i.e., one of the pair
was lost due to filtering thresholds).
pattern: "*.truncated.fastq.gz"
- discarded:
type: file
description: |
Adapter trimmed FastQ files of reads that did not pass filtering
thresholds.
pattern: "*.discarded.fastq.gz"
- pair1_truncated:
type: file
description: |
Adapter trimmed R1 FastQ files of paired-end reads that did not merge
with their respective R2 pair due to long templates. The respective pair
is stored in 'pair2_truncated'.
pattern: "*.pair1.truncated.fastq.gz"
- pair2_truncated:
type: file
description: |
Adapter trimmed R2 FastQ files of paired-end reads that did not merge
with their respective R1 pair due to long templates. The respective pair
is stored in 'pair1_truncated'.
pattern: "*.pair2.truncated.fastq.gz"
- collapsed:
type: file
description: |
Collapsed FastQ of paired-end reads that successfully merged with their
respective R1 pair but were not trimmed.
pattern: "*.collapsed.fastq.gz"
- collapsed_truncated:
type: file
description: |
Collapsed FastQ of paired-end reads that successfully merged with their
respective R1 pair and were trimmed of adapter due to sufficient overlap.
pattern: "*.collapsed.truncated.fastq.gz"
- log:
type: file
description: AdapterRemoval log file
pattern: "*.settings"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@maxibor"
- "@jfy133"

@ -0,0 +1,43 @@
process BBMAP_BBDUK {
tag "$meta.id"
label 'process_medium'
conda "bioconda::bbmap=39.01"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/bbmap:39.01--h5c4e2a8_0':
'quay.io/biocontainers/bbmap:39.01--h5c4e2a8_0' }"
input:
tuple val(meta), path(reads)
path contaminants
output:
tuple val(meta), path('*.fastq.gz'), emit: reads
tuple val(meta), path('*.log') , emit: log
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def raw = meta.single_end ? "in=${reads[0]}" : "in1=${reads[0]} in2=${reads[1]}"
def trimmed = meta.single_end ? "out=${prefix}.fastq.gz" : "out1=${prefix}_1.fastq.gz out2=${prefix}_2.fastq.gz"
def contaminants_fa = contaminants ? "ref=$contaminants" : ''
"""
maxmem=\$(echo \"$task.memory\"| sed 's/ GB/g/g')
bbduk.sh \\
-Xmx\$maxmem \\
$raw \\
$trimmed \\
threads=$task.cpus \\
$args \\
$contaminants_fa \\
&> ${prefix}.bbduk.log
cat <<-END_VERSIONS > versions.yml
"${task.process}":
bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset")
END_VERSIONS
"""
}

@ -0,0 +1,53 @@
name: bbmap_bbduk
description: Adapter and quality trimming of sequencing reads
keywords:
- trimming
- adapter trimming
- quality trimming
- fastq
tools:
- bbmap:
description: BBMap is a short read aligner, as well as various other bioinformatic tools.
homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/
documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/
tool_dev_url: None
doi: ""
licence: ["UC-LBL license (see package)"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input FastQ files of size 1 and 2 for single-end and paired-end data,
respectively.
- contaminants:
type: file
description: |
Reference files containing adapter and/or contaminant sequences for sequence kmer matching
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: The trimmed/modified fastq reads
pattern: "*fastq.gz"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- log:
type: file
description: Bbduk log file
pattern: "*bbduk.log"
authors:
- "@MGordon09"

@ -0,0 +1,71 @@
process BOWTIE2_ALIGN {
tag "$meta.id"
label "process_high"
conda "bioconda::bowtie2=2.4.4 bioconda::samtools=1.16.1 conda-forge::pigz=2.6"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:a0ffedb52808e102887f6ce600d092675bf3528a-0' :
'quay.io/biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:a0ffedb52808e102887f6ce600d092675bf3528a-0' }"
input:
tuple val(meta) , path(reads)
tuple val(meta2), path(index)
val save_unaligned
val sort_bam
output:
tuple val(meta), path("*.bam") , emit: bam
tuple val(meta), path("*.log") , emit: log
tuple val(meta), path("*fastq.gz"), emit: fastq, optional:true
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ""
def args2 = task.ext.args2 ?: ""
def prefix = task.ext.prefix ?: "${meta.id}"
def unaligned = ""
def reads_args = ""
if (meta.single_end) {
unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : ""
reads_args = "-U ${reads}"
} else {
unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : ""
reads_args = "-1 ${reads[0]} -2 ${reads[1]}"
}
def samtools_command = sort_bam ? 'sort' : 'view'
"""
INDEX=`find -L ./ -name "*.rev.1.bt2" | sed "s/\\.rev.1.bt2\$//"`
[ -z "\$INDEX" ] && INDEX=`find -L ./ -name "*.rev.1.bt2l" | sed "s/\\.rev.1.bt2l\$//"`
[ -z "\$INDEX" ] && echo "Bowtie2 index files not found" 1>&2 && exit 1
bowtie2 \\
-x \$INDEX \\
$reads_args \\
--threads $task.cpus \\
$unaligned \\
$args \\
2> ${prefix}.bowtie2.log \\
| samtools $samtools_command $args2 --threads $task.cpus -o ${prefix}.bam -
if [ -f ${prefix}.unmapped.fastq.1.gz ]; then
mv ${prefix}.unmapped.fastq.1.gz ${prefix}.unmapped_1.fastq.gz
fi
if [ -f ${prefix}.unmapped.fastq.2.gz ]; then
mv ${prefix}.unmapped.fastq.2.gz ${prefix}.unmapped_2.fastq.gz
fi
cat <<-END_VERSIONS > versions.yml
"${task.process}":
bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//')
samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
END_VERSIONS
"""
}

@ -0,0 +1,67 @@
name: bowtie2_align
description: Align reads to a reference genome using bowtie2
keywords:
- align
- map
- fasta
- fastq
- genome
- reference
tools:
- bowtie2:
description: |
Bowtie 2 is an ultrafast and memory-efficient tool for aligning
sequencing reads to long reference sequences.
homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml
documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml
doi: 10.1038/nmeth.1923
licence: ["GPL-3.0-or-later"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input FastQ files of size 1 and 2 for single-end and paired-end data,
respectively.
- meta2:
type: map
description: |
Groovy Map containing reference information
e.g. [ id:'test', single_end:false ]
- index:
type: file
description: Bowtie2 genome index files
pattern: "*.ebwt"
- save_unaligned:
type: boolean
description: |
Save reads that do not map to the reference (true) or discard them (false)
(default: false)
- sort_bam:
type: boolean
description: use samtools sort (true) or samtools view (false)
pattern: "true or false"
output:
- bam:
type: file
description: Output BAM file containing read alignments
pattern: "*.{bam}"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- fastq:
type: file
description: Unaligned FastQ files
pattern: "*.fastq.gz"
- log:
type: file
description: Aligment log
pattern: "*.log"
authors:
- "@joseespinosa"
- "@drpatelh"

@ -0,0 +1,30 @@
process BOWTIE2_BUILD {
tag "$fasta"
label 'process_high'
conda "bioconda::bowtie2=2.4.4"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/bowtie2:2.4.4--py39hbb4e92a_0' :
'quay.io/biocontainers/bowtie2:2.4.4--py39hbb4e92a_0' }"
input:
tuple val(meta), path(fasta)
output:
tuple val(meta), path('bowtie2') , emit: index
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
"""
mkdir bowtie2
bowtie2-build $args --threads $task.cpus $fasta bowtie2/${fasta.baseName}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//')
END_VERSIONS
"""
}

@ -0,0 +1,43 @@
name: bowtie2_build
description: Builds bowtie index for reference genome
keywords:
- build
- index
- fasta
- genome
- reference
tools:
- bowtie2:
description: |
Bowtie 2 is an ultrafast and memory-efficient tool for aligning
sequencing reads to long reference sequences.
homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml
documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml
doi: 10.1038/nmeth.1923
licence: ["GPL-3.0-or-later"]
input:
- meta:
type: map
description: |
Groovy Map containing reference information
e.g. [ id:'test', single_end:false ]
- fasta:
type: file
description: Input genome fasta file
output:
- meta:
type: map
description: |
Groovy Map containing reference information
e.g. [ id:'test', single_end:false ]
- index:
type: file
description: Bowtie2 genome index files
pattern: "*.bt2"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@joseespinosa"
- "@drpatelh"

@ -0,0 +1,42 @@
process BRACKEN_BRACKEN {
tag "$meta.id"
label 'process_low'
// WARN: Version information not provided by tool on CLI.
// Please update version string below when bumping container versions.
conda "bioconda::bracken=2.7"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/bracken:2.7--py39hc16433a_0':
'quay.io/biocontainers/bracken:2.7--py39hc16433a_0' }"
input:
tuple val(meta), path(kraken_report)
path database
output:
tuple val(meta), path(bracken_report), emit: reports
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ""
def prefix = task.ext.prefix ?: "${meta.id}"
bracken_report = "${prefix}.tsv"
// WARN: Version information not provided by tool on CLI.
// Please update version string below when bumping container versions.
def VERSION = '2.7'
"""
bracken \\
${args} \\
-d '${database}' \\
-i '${kraken_report}' \\
-o '${bracken_report}'
cat <<-END_VERSIONS > versions.yml
"${task.process}":
bracken: ${VERSION}
END_VERSIONS
"""
}

@ -0,0 +1,48 @@
name: bracken_bracken
description: Re-estimate taxonomic abundance of metagenomic samples analyzed by kraken.
keywords:
- bracken
- metagenomics
- abundance
- kraken2
tools:
- bracken:
description: Bracken (Bayesian Reestimation of Abundance with KrakEN) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample.
homepage: https://ccb.jhu.edu/software/bracken/
documentation: https://ccb.jhu.edu/software/bracken/index.shtml?t=manual
tool_dev_url: https://github.com/jenniferlu717/Bracken
doi: "10.7717/peerj-cs.104"
licence: ["GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- kraken_report:
type: file
description: TSV file with six columns coming from kraken2 output
pattern: "*.{tsv}"
- database:
type: file
description: Directory containing the kraken2/Bracken files for analysis
pattern: "*"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- reports:
type: file
description: TSV output report of the re-estimated abundances
pattern: "*.{tsv}"
authors:
- "@Midnighter"

@ -0,0 +1,37 @@
process BRACKEN_COMBINEBRACKENOUTPUTS {
tag "$meta.id"
label 'process_low'
conda "bioconda::bracken=2.7"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/bracken:2.7--py39hc16433a_0':
'quay.io/biocontainers/bracken:2.7--py39hc16433a_0' }"
input:
tuple val(meta), path(input)
output:
tuple val(meta), path("*.txt"), emit: txt
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
// WARN: Version information not provided by tool on CLI.
// Please update version string below when bumping container versions.
def VERSION = '2.7'
"""
combine_bracken_outputs.py \\
$args \\
--files ${input} \\
-o ${prefix}.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
combine_bracken_output: ${VERSION}
END_VERSIONS
"""
}

@ -0,0 +1,41 @@
name: "bracken_combinebrackenoutputs"
description: Combine output of metagenomic samples analyzed by bracken.
keywords:
- sort
tools:
- "bracken":
description: Bracken (Bayesian Reestimation of Abundance with KrakEN) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample.
homepage: https://ccb.jhu.edu/software/bracken/
documentation: https://ccb.jhu.edu/software/bracken/index.shtml?t=manual
tool_dev_url: https://github.com/jenniferlu717/Bracken
doi: "10.7717/peerj-cs.104"
licence: ["GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- input:
type: file
description: List of output files from bracken
pattern: "*"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- txt:
type: file
description: Combined output in table format
pattern: "*.txt"
authors:
- "@jfy133"

@ -0,0 +1,80 @@
process CAT_FASTQ {
tag "$meta.id"
label 'process_single'
conda "conda-forge::sed=4.7"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
'ubuntu:20.04' }"
input:
tuple val(meta), path(reads, stageAs: "input*/*")
output:
tuple val(meta), path("*.merged.fastq.gz"), emit: reads
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()]
if (meta.single_end) {
if (readList.size >= 1) {
"""
cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz
cat <<-END_VERSIONS > versions.yml
"${task.process}":
cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
END_VERSIONS
"""
}
} else {
if (readList.size >= 2) {
def read1 = []
def read2 = []
readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v }
"""
cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz
cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz
cat <<-END_VERSIONS > versions.yml
"${task.process}":
cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
END_VERSIONS
"""
}
}
stub:
def prefix = task.ext.prefix ?: "${meta.id}"
def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()]
if (meta.single_end) {
if (readList.size > 1) {
"""
touch ${prefix}.merged.fastq.gz
cat <<-END_VERSIONS > versions.yml
"${task.process}":
cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
END_VERSIONS
"""
}
} else {
if (readList.size > 2) {
"""
touch ${prefix}_1.merged.fastq.gz
touch ${prefix}_2.merged.fastq.gz
cat <<-END_VERSIONS > versions.yml
"${task.process}":
cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
END_VERSIONS
"""
}
}
}

@ -0,0 +1,39 @@
name: cat_fastq
description: Concatenates fastq files
keywords:
- fastq
- concatenate
tools:
- cat:
description: |
The cat utility reads files sequentially, writing them to the standard output.
documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html
licence: ["GPL-3.0-or-later"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: list
description: |
List of input FastQ files to be concatenated.
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: Merged fastq file
pattern: "*.{merged.fastq.gz}"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@joseespinosa"
- "@drpatelh"

@ -0,0 +1,61 @@
process CENTRIFUGE_CENTRIFUGE {
tag "$meta.id"
label 'process_high'
conda "bioconda::centrifuge=1.0.4_beta"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' :
'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }"
input:
tuple val(meta), path(reads)
path db
val save_unaligned
val save_aligned
val sam_format
output:
tuple val(meta), path('*report.txt') , emit: report
tuple val(meta), path('*results.txt') , emit: results
tuple val(meta), path('*.sam') , optional: true, emit: sam
tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_mapped
tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}"
def unaligned = ''
def aligned = ''
if (meta.single_end) {
unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : ''
aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : ''
} else {
unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : ''
aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : ''
}
def sam_output = sam_format ? "--out-fmt 'sam'" : ''
"""
## we add "-no-name ._" to ensure silly Mac OSX metafiles files aren't included
db_name=`find -L ${db} -name "*.1.cf" -not -name "._*" | sed 's/\\.1.cf\$//'`
centrifuge \\
-x \$db_name \\
-p $task.cpus \\
$paired \\
--report-file ${prefix}.report.txt \\
-S ${prefix}.results.txt \\
$unaligned \\
$aligned \\
$sam_output \\
$args
cat <<-END_VERSIONS > versions.yml
"${task.process}":
centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //')
END_VERSIONS
"""
}

@ -0,0 +1,66 @@
name: centrifuge_centrifuge
description: Classifies metagenomic sequence data
keywords:
- classify
- metagenomics
- fastq
- db
tools:
- centrifuge:
description: Centrifuge is a classifier for metagenomic sequences.
homepage: https://ccb.jhu.edu/software/centrifuge/
documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml
doi: 10.1101/gr.210641.116
licence: ["GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input FastQ files of size 1 and 2 for single-end and paired-end data,
respectively.
- db:
type: directory
description: Path to directory containing centrifuge database files
- save_unaligned:
type: value
description: If true unmapped fastq files are saved
- save_aligned:
type: value
description: If true mapped fastq files are saved
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- report:
type: file
description: |
File containing a classification summary
pattern: "*.{report.txt}"
- results:
type: file
description: |
File containing classification results
pattern: "*.{results.txt}"
- fastq_unmapped:
type: file
description: Unmapped fastq files
pattern: "*.unmapped.fastq.gz"
- fastq_mapped:
type: file
description: Mapped fastq files
pattern: "*.mapped.fastq.gz"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@sofstam"
- "@jfy133"
- "@sateeshperi"

@ -0,0 +1,33 @@
process CENTRIFUGE_KREPORT {
tag "$meta.id"
label 'process_single'
conda "bioconda::centrifuge=1.0.4_beta"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6':
'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }"
input:
tuple val(meta), path(report)
path db
output:
tuple val(meta), path('*.txt') , emit: kreport
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
db_name=`find -L ${db} -name "*.1.cf" -not -name "._*" | sed 's/\\.1.cf\$//'`
centrifuge-kreport -x \$db_name ${report} > ${prefix}.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //')
END_VERSIONS
"""
}

@ -0,0 +1,41 @@
name: "centrifuge_kreport"
description: Creates Kraken-style reports from centrifuge out files
keywords:
- metagenomics
tools:
- centrifuge:
description: Centrifuge is a classifier for metagenomic sequences.
homepage: https://ccb.jhu.edu/software/centrifuge/
documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml
doi: 10.1101/gr.210641.116
licence: ["GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- report:
type: file
description: File containing the centrifuge classification report
pattern: "*.{txt}"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- kreport:
type: file
description: |
File containing kraken-style report from centrifuge
out files.
pattern: "*.{txt}"
authors:
- "@sofstam"
- "@jfy133"

@ -0,0 +1,68 @@
process DIAMOND_BLASTX {
tag "$meta.id"
label 'process_medium'
conda "bioconda::diamond=2.0.15"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/diamond:2.0.15--hb97b32f_0' :
'quay.io/biocontainers/diamond:2.0.15--hb97b32f_0' }"
input:
tuple val(meta), path(fasta)
path db
val out_ext
val blast_columns
output:
tuple val(meta), path('*.blast'), optional: true, emit: blast
tuple val(meta), path('*.xml') , optional: true, emit: xml
tuple val(meta), path('*.txt') , optional: true, emit: txt
tuple val(meta), path('*.daa') , optional: true, emit: daa
tuple val(meta), path('*.sam') , optional: true, emit: sam
tuple val(meta), path('*.tsv') , optional: true, emit: tsv
tuple val(meta), path('*.paf') , optional: true, emit: paf
tuple val(meta), path("*.log") , emit: log
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def columns = blast_columns ? "${blast_columns}" : ''
switch ( out_ext ) {
case "blast": outfmt = 0; break
case "xml": outfmt = 5; break
case "txt": outfmt = 6; break
case "daa": outfmt = 100; break
case "sam": outfmt = 101; break
case "tsv": outfmt = 102; break
case "paf": outfmt = 103; break
default:
outfmt = '6';
out_ext = 'txt';
log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)");
break
}
"""
DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'`
diamond \\
blastx \\
--threads $task.cpus \\
--db \$DB \\
--query $fasta \\
--outfmt ${outfmt} ${columns} \\
$args \\
--out ${prefix}.${out_ext} \\
--log
mv diamond.log ${prefix}.log
cat <<-END_VERSIONS > versions.yml
"${task.process}":
diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //')
END_VERSIONS
"""
}

@ -0,0 +1,81 @@
name: diamond_blastx
description: Queries a DIAMOND database using blastx mode
keywords:
- fasta
- diamond
- blastx
- DNA sequence
tools:
- diamond:
description: Accelerated BLAST compatible local sequence aligner
homepage: https://github.com/bbuchfink/diamond
documentation: https://github.com/bbuchfink/diamond/wiki
tool_dev_url: https://github.com/bbuchfink/diamond
doi: "doi:10.1038/s41592-021-01101-x"
licence: ["GPL v3.0"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- fasta:
type: file
description: Input fasta file containing query sequences
pattern: "*.{fa,fasta}"
- db:
type: directory
description: Directory containing the nucelotide blast database
pattern: "*"
- out_ext:
type: string
description: |
Specify the type of output file to be generated. `blast` corresponds to
BLAST pairwise format. `xml` corresponds to BLAST xml format.
`txt` corresponds to to BLAST tabular format. `tsv` corresponds to
taxonomic classification format.
pattern: "blast|xml|txt|daa|sam|tsv|paf"
output:
- blast:
type: file
description: File containing blastp hits
pattern: "*.{blast}"
- xml:
type: file
description: File containing blastp hits
pattern: "*.{xml}"
- txt:
type: file
description: File containing hits in tabular BLAST format.
pattern: "*.{txt}"
- daa:
type: file
description: File containing hits DAA format
pattern: "*.{daa}"
- sam:
type: file
description: File containing aligned reads in SAM format
pattern: "*.{sam}"
- tsv:
type: file
description: Tab separated file containing taxonomic classification of hits
pattern: "*.{tsv}"
- paf:
type: file
description: File containing aligned reads in pairwise mapping format format
pattern: "*.{paf}"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- log:
type: file
description: Log file containing stdout information
pattern: "*.{log}"
authors:
- "@spficklin"
- "@jfy133"
- "@mjamy"

@ -0,0 +1,16 @@
Changes in module 'nf-core/falco'
--- modules/nf-core/falco/main.nf
+++ modules/nf-core/falco/main.nf
@@ -33,7 +33,9 @@
"""
} else {
"""
- falco $args --threads $task.cpus ${reads}
+ [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz
+ [ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz
+ falco $args --threads $task.cpus ${prefix}_1.fastq.gz ${prefix}_2.fastq.gz
cat <<-END_VERSIONS > versions.yml
"${task.process}":
************************************************************

@ -0,0 +1,59 @@
process FALCO {
tag "$meta.id"
label 'process_single'
conda "bioconda::falco=1.2.1"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/falco:1.2.1--h867801b_3':
'quay.io/biocontainers/falco:1.2.1--h867801b_3' }"
input:
tuple val(meta), path(reads)
output:
tuple val(meta), path("*.html"), emit: html
tuple val(meta), path("*.txt") , emit: txt
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
if ( reads.toList().size() == 1 ) {
"""
falco $args --threads $task.cpus ${reads} -D ${prefix}_data.txt -S ${prefix}_summary.txt -R ${prefix}_report.html
cat <<-END_VERSIONS > versions.yml
"${task.process}":
falco:\$( falco --version | sed -e "s/falco//g" )
END_VERSIONS
"""
} else {
"""
[ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz
[ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz
falco $args --threads $task.cpus ${prefix}_1.fastq.gz ${prefix}_2.fastq.gz
cat <<-END_VERSIONS > versions.yml
"${task.process}":
falco:\$( falco --version | sed -e "s/falco//g" )
END_VERSIONS
"""
}
stub:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
touch ${prefix}_data.txt
touch ${prefix}_fastqc_data.html
touch ${prefix}_summary.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
falco: \$( falco --version | sed -e "s/falco v//g" )
END_VERSIONS
"""
}

@ -0,0 +1,52 @@
name: falco
description: Run falco on sequenced reads
keywords:
- quality control
- qc
- adapters
- fastq
tools:
- fastqc:
description: "falco is a drop-in C++ implementation of FastQC to assess the quality of sequence reads."
homepage: "https://falco.readthedocs.io/"
documentation: "https://falco.readthedocs.io/"
tool_dev_url: "None"
doi: ""
licence: "['GPL v3']"
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input FastQ files of size 1 and 2 for single-end and paired-end data,
respectively.
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- html:
type: file
description: FastQC like report
pattern: "*_{fastqc_report.html}"
- txt:
type: file
description: falco report data
pattern: "*_{data.txt}"
- txt:
type: file
description: falco summary file
pattern: "*_{summary.txt}"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@lucacozzuto"

@ -0,0 +1,103 @@
process FASTP {
tag "$meta.id"
label 'process_medium'
conda "bioconda::fastp=0.23.2"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/fastp:0.23.2--h79da9fb_0' :
'quay.io/biocontainers/fastp:0.23.2--h79da9fb_0' }"
input:
tuple val(meta), path(reads)
path adapter_fasta
val save_trimmed_fail
val save_merged
output:
tuple val(meta), path('*.fastp.fastq.gz') , optional:true, emit: reads
tuple val(meta), path('*.json') , emit: json
tuple val(meta), path('*.html') , emit: html
tuple val(meta), path('*.log') , emit: log
path "versions.yml" , emit: versions
tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail
tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def adapter_list = adapter_fasta ? "--adapter_fasta ${adapter_fasta}" : ""
def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : ''
// Added soft-links to original fastqs for consistent naming in MultiQC
// Use single ended for interleaved. Add --interleaved_in in config.
if ( task.ext.args?.contains('--interleaved_in') ) {
"""
[ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz
fastp \\
--stdout \\
--in1 ${prefix}.fastq.gz \\
--thread $task.cpus \\
--json ${prefix}.fastp.json \\
--html ${prefix}.fastp.html \\
$adapter_list \\
$fail_fastq \\
$args \\
2> ${prefix}.fastp.log \\
| gzip -c > ${prefix}.fastp.fastq.gz
cat <<-END_VERSIONS > versions.yml
"${task.process}":
fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
END_VERSIONS
"""
} else if (meta.single_end) {
"""
[ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz
fastp \\
--stdout \\
--in1 ${prefix}.fastq.gz \\
--out1 ${prefix}.fastp.fastq.gz \\
--thread $task.cpus \\
--json ${prefix}.fastp.json \\
--html ${prefix}.fastp.html \\
$adapter_list \\
$fail_fastq \\
$args \\
2> ${prefix}.fastp.log
cat <<-END_VERSIONS > versions.yml
"${task.process}":
fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
END_VERSIONS
"""
} else {
def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : ''
"""
[ ! -f ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz
[ ! -f ${prefix}_2.fastq.gz ] && ln -sf ${reads[1]} ${prefix}_2.fastq.gz
fastp \\
--in1 ${prefix}_1.fastq.gz \\
--in2 ${prefix}_2.fastq.gz \\
--out1 ${prefix}_1.fastp.fastq.gz \\
--out2 ${prefix}_2.fastp.fastq.gz \\
--json ${prefix}.fastp.json \\
--html ${prefix}.fastp.html \\
$adapter_list \\
$fail_fastq \\
$merge_fastq \\
--thread $task.cpus \\
--detect_adapter_for_pe \\
$args \\
2> ${prefix}.fastp.log
cat <<-END_VERSIONS > versions.yml
"${task.process}":
fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
END_VERSIONS
"""
}
}

@ -0,0 +1,73 @@
name: fastp
description: Perform adapter/quality trimming on sequencing reads
keywords:
- trimming
- quality control
- fastq
tools:
- fastp:
description: |
A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance.
documentation: https://github.com/OpenGene/fastp
doi: https://doi.org/10.1093/bioinformatics/bty560
licence: ["MIT"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information. Use 'single_end: true' to specify single ended or interleaved FASTQs. Use 'single_end: false' for paired-end reads.
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input FastQ files of size 1 and 2 for single-end and paired-end data,
respectively. If you wish to run interleaved paired-end data, supply as single-end data
but with `--interleaved_in` in your `modules.conf`'s `ext.args` for the module.
- adapter_fasta:
type: file
description: File in FASTA format containing possible adapters to remove.
pattern: "*.{fasta,fna,fas,fa}"
- save_trimmed_fail:
type: boolean
description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz`
- save_merged:
type: boolean
description: Specify true to save all merged reads to the a file ending in `*.merged.fastq.gz`
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: The trimmed/modified/unmerged fastq reads
pattern: "*fastp.fastq.gz"
- json:
type: file
description: Results in JSON format
pattern: "*.json"
- html:
type: file
description: Results in HTML format
pattern: "*.html"
- log:
type: file
description: fastq log file
pattern: "*.log"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- reads_fail:
type: file
description: Reads the failed the preprocessing
pattern: "*fail.fastq.gz"
- reads_merged:
type: file
description: Reads that were successfully merged
pattern: "*.{merged.fastq.gz}"
authors:
- "@drpatelh"
- "@kevinmenden"

@ -0,0 +1,39 @@
process FILTLONG {
tag "$meta.id"
label 'process_low'
conda "bioconda::filtlong=0.2.1"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/filtlong:0.2.1--h9a82719_0' :
'quay.io/biocontainers/filtlong:0.2.1--h9a82719_0' }"
input:
tuple val(meta), path(shortreads), path(longreads)
output:
tuple val(meta), path("*.fastq.gz"), emit: reads
tuple val(meta), path("*.log") , emit: log
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def short_reads = !shortreads ? "" : meta.single_end ? "-1 $shortreads" : "-1 ${shortreads[0]} -2 ${shortreads[1]}"
if ("$longreads" == "${prefix}.fastq.gz") error "Longread FASTQ input and output names are the same, set prefix in module configuration to disambiguate!"
"""
filtlong \\
$short_reads \\
$args \\
$longreads \\
2> ${prefix}.log \\
| gzip -n > ${prefix}.fastq.gz
cat <<-END_VERSIONS > versions.yml
"${task.process}":
filtlong: \$( filtlong --version | sed -e "s/Filtlong v//g" )
END_VERSIONS
"""
}

@ -0,0 +1,55 @@
name: filtlong
description: Filtlong filters long reads based on quality measures or short read data.
keywords:
- nanopore
- quality control
- QC
- filtering
- long reads
- short reads
tools:
- filtlong:
description: Filtlong is a tool for filtering long reads. It can take a set of long reads and produce a smaller, better subset. It uses both read length (longer is better) and read identity (higher is better) when choosing which reads pass the filter.
homepage: https://anaconda.org/bioconda/filtlong
documentation: None
tool_dev_url: https://github.com/rrwick/Filtlong
doi: ""
licence: ["GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- shortreads:
type: file
description: fastq file
pattern: "*.{fq,fastq,fq.gz,fastq.gz}"
- longreads:
type: file
description: fastq file
pattern: "*.{fq,fastq,fq.gz,fastq.gz}"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- reads:
type: file
description: Filtered (compressed) fastq file
pattern: "*.fastq.gz"
- log:
type: file
description: Standard error logging file containing summary statistics
pattern: "*.log"
authors:
- "@d4straub"
- "@sofstam"

@ -0,0 +1,44 @@
process GUNZIP {
tag "$archive"
label 'process_single'
conda "conda-forge::sed=4.7"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
'ubuntu:20.04' }"
input:
tuple val(meta), path(archive)
output:
tuple val(meta), path("$gunzip"), emit: gunzip
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
gunzip = archive.toString() - '.gz'
"""
gunzip \\
-f \\
$args \\
$archive
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//')
END_VERSIONS
"""
stub:
gunzip = archive.toString() - '.gz'
"""
touch $gunzip
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//')
END_VERSIONS
"""
}

@ -0,0 +1,34 @@
name: gunzip
description: Compresses and decompresses files.
keywords:
- gunzip
- compression
tools:
- gunzip:
description: |
gzip is a file format and a software application used for file compression and decompression.
documentation: https://www.gnu.org/software/gzip/manual/gzip.html
licence: ["GPL-3.0-or-later"]
input:
- meta:
type: map
description: |
Optional groovy Map containing meta information
e.g. [ id:'test', single_end:false ]
- archive:
type: file
description: File to be compressed/uncompressed
pattern: "*.*"
output:
- gunzip:
type: file
description: Compressed/uncompressed file
pattern: "*.*"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@joseespinosa"
- "@drpatelh"
- "@jfy133"

@ -0,0 +1,41 @@
process KAIJU_KAIJU {
tag "$meta.id"
label 'process_high'
conda "bioconda::kaiju=1.8.2"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/kaiju:1.8.2--h5b5514e_1':
'quay.io/biocontainers/kaiju:1.8.2--h5b5514e_1' }"
input:
tuple val(meta), path(reads)
path(db)
output:
tuple val(meta), path('*.tsv'), emit: results
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def input = meta.single_end ? "-i ${reads}" : "-i ${reads[0]} -j ${reads[1]}"
"""
dbnodes=`find -L ${db} -name "*nodes.dmp"`
dbname=`find -L ${db} -name "*.fmi" -not -name "._*"`
kaiju \\
$args \\
-z $task.cpus \\
-t \$dbnodes \\
-f \$dbname \\
-o ${prefix}.tsv \\
$input
cat <<-END_VERSIONS > versions.yml
"${task.process}":
kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' ))
END_VERSIONS
"""
}

@ -0,0 +1,53 @@
name: kaiju_kaiju
description: Taxonomic classification of metagenomic sequence data using a protein reference database
keywords:
- classify
- metagenomics
- fastq
- taxonomic profiling
tools:
- kaiju:
description: Fast and sensitive taxonomic classification for metagenomics
homepage: https://kaiju.binf.ku.dk/
documentation: https://github.com/bioinformatics-centre/kaiju/blob/master/README.md
tool_dev_url: https://github.com/bioinformatics-centre/kaiju
doi: "10.1038/ncomms11257"
licence: ["GNU GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input fastq/fasta files of size 1 and 2 for single-end and paired-end data,
respectively.
pattern: "*.{fastq,fq,fasta,fa,fsa,fas,fna,fastq.gz,fq.gz,fasta.gz,fa.gz,fsa.gz,fas.gz,fna.gz}"
- db:
type: files
description: |
List containing the database and nodes files for Kaiju
e.g. [ 'database.fmi', 'nodes.dmp' ]
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- results:
type: file
description: Results with taxonomic classification of each read
pattern: "*.tsv"
authors:
- "@talnor"
- "@sofstam"
- "@jfy133"

@ -0,0 +1,39 @@
process KAIJU_KAIJU2KRONA {
tag "$meta.id"
label 'process_single'
conda "bioconda::kaiju=1.8.2"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/kaiju:1.8.2--h5b5514e_1':
'quay.io/biocontainers/kaiju:1.8.2--h5b5514e_1' }"
input:
tuple val(meta), path(tsv)
path(db)
output:
tuple val(meta), path("*.txt"), emit: txt
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
dbnodes=`find -L ${db} -name "*nodes.dmp"`
dbnames=`find -L ${db} -name "*names.dmp"`
kaiju2krona \\
$args \\
-t \$dbnodes \\
-n \$dbnames \\
-i ${tsv} \\
-o ${prefix}.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' ))
END_VERSIONS
"""
}

@ -0,0 +1,44 @@
name: kaiju_kaiju2krona
description: Convert Kaiju's tab-separated output file into a tab-separated text file which can be imported into Krona.
keywords:
- taxonomy
- visualisation
- krona chart
- metagenomics
tools:
- "kaiju":
description: Fast and sensitive taxonomic classification for metagenomics
homepage: https://kaiju.binf.ku.dk/
documentation: https://github.com/bioinformatics-centre/kaiju/blob/master/README.md
tool_dev_url: https://github.com/bioinformatics-centre/kaiju
doi: "10.1038/ncomms11257"
licence: ["GNU GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- tsv:
type: file
description: Kaiju tab-separated output file
pattern: "*.{tsv,txt}"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- txt:
type: file
description: Krona text-based input file converted from Kaiju report
pattern: "*.{txt,krona}"
authors:
- "@MillironX"

@ -0,0 +1,40 @@
process KAIJU_KAIJU2TABLE {
tag "$meta.id"
label 'process_single'
conda "bioconda::kaiju=1.8.2"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/kaiju:1.8.2--h5b5514e_1':
'quay.io/biocontainers/kaiju:1.8.2--h2e03b76_0' }"
input:
tuple val(meta), path(results)
path db
val taxon_rank
output:
tuple val(meta), path('*.txt'), emit: summary
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
dbnodes=`find -L ${db} -name "*nodes.dmp"`
dbname=`find -L ${db} -name "*.fmi" -not -name "._*"`
kaiju2table $args \\
-t \$dbnodes \\
-n \$dbname \\
-r ${taxon_rank} \\
-o ${prefix}.txt \\
${results}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' ))
END_VERSIONS
"""
}

@ -0,0 +1,50 @@
name: "kaiju_kaiju2table"
description: write your description here
keywords:
- classify
- metagenomics
tools:
- kaiju:
description: Fast and sensitive taxonomic classification for metagenomics
homepage: https://kaiju.binf.ku.dk/
documentation: https://github.com/bioinformatics-centre/kaiju/blob/master/README.md
tool_dev_url: https://github.com/bioinformatics-centre/kaiju
doi: "10.1038/ncomms11257"
licence: ["GNU GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- results:
type: file
description: File containing the kaiju classification results
pattern: "*.{txt}"
- taxon_rank:
type: string
description: |
Taxonomic rank to display in report
pattern: "phylum|class|order|family|genus|species"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- results:
type: file
description: |
Summary table for a given taxonomic rank
pattern: "*.{tsv}"
authors:
- "@sofstam"
- "@talnor"
- "@jfy133"

@ -0,0 +1,58 @@
process KRAKEN2_KRAKEN2 {
tag "$meta.id"
label 'process_high'
conda "bioconda::kraken2=2.1.2 conda-forge::pigz=2.6"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' :
'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }"
input:
tuple val(meta), path(reads)
path db
val save_output_fastqs
val save_reads_assignment
output:
tuple val(meta), path('*.classified{.,_}*') , optional:true, emit: classified_reads_fastq
tuple val(meta), path('*.unclassified{.,_}*') , optional:true, emit: unclassified_reads_fastq
tuple val(meta), path('*classifiedreads.txt') , optional:true, emit: classified_reads_assignment
tuple val(meta), path('*report.txt') , emit: report
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def paired = meta.single_end ? "" : "--paired"
def classified = meta.single_end ? "${prefix}.classified.fastq" : "${prefix}.classified#.fastq"
def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq"
def classified_option = save_output_fastqs ? "--classified-out ${classified}" : ""
def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : ""
def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null"
def compress_reads_command = save_output_fastqs ? "pigz -p $task.cpus *.fastq" : ""
"""
kraken2 \\
--db $db \\
--threads $task.cpus \\
--report ${prefix}.kraken2.report.txt \\
--gzip-compressed \\
$unclassified_option \\
$classified_option \\
$readclassification_option \\
$paired \\
$args \\
$reads
$compress_reads_command
cat <<-END_VERSIONS > versions.yml
"${task.process}":
kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//')
pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
END_VERSIONS
"""
}

@ -0,0 +1,75 @@
name: kraken2_kraken2
description: Classifies metagenomic sequence data
keywords:
- classify
- metagenomics
- fastq
- db
tools:
- kraken2:
description: |
Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads
homepage: https://ccb.jhu.edu/software/kraken2/
documentation: https://github.com/DerrickWood/kraken2/wiki/Manual
doi: 10.1186/s13059-019-1891-0
licence: ["MIT"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input FastQ files of size 1 and 2 for single-end and paired-end data,
respectively.
- db:
type: directory
description: Kraken2 database
- save_output_fastqs:
type: boolean
description: |
If true, optional commands are added to save classified and unclassified reads
as fastq files
- save_reads_assignment:
type: boolean
description: |
If true, an optional command is added to save a file reporting the taxonomic
classification of each input read
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- classified_reads_fastq:
type: file
description: |
Reads classified as belonging to any of the taxa
on the Kraken2 database.
pattern: "*{fastq.gz}"
- unclassified_reads_fastq:
type: file
description: |
Reads not classified to any of the taxa
on the Kraken2 database.
pattern: "*{fastq.gz}"
- classified_reads_assignment:
type: file
description: |
Kraken2 output file indicating the taxonomic assignment of
each input read
- report:
type: file
description: |
Kraken2 report containing stats about classified
and not classifed reads.
pattern: "*.{report.txt}"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@joseespinosa"
- "@drpatelh"

@ -0,0 +1,34 @@
process KRAKENTOOLS_COMBINEKREPORTS {
label 'process_single'
conda "bioconda::krakentools=1.2"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/krakentools:1.2--pyh5e36f6f_0':
'quay.io/biocontainers/krakentools:1.2--pyh5e36f6f_0' }"
input:
tuple val(meta), path(kreports)
output:
tuple val(meta), path("*.txt"), emit: txt
path "versions.yml", emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
prefix = task.ext.prefix ?: "${meta.id}"
def VERSION = '1.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
"""
combine_kreports.py \\
-r ${kreports} \\
-o ${prefix}.txt \\
${args}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
combine_kreports.py: ${VERSION}
END_VERSIONS
"""
}

@ -0,0 +1,43 @@
name: krakentools_combinekreports
description: Takes a Kraken report file and prints out a krona-compatible TEXT file
keywords:
- kraken
- krakentools
- metagenomics
- table
- combining
- merging
tools:
- krakentools:
description: KrakenTools is a suite of scripts to be used for post-analysis of Kraken/KrakenUniq/Kraken2/Bracken results. Please cite the relevant paper if using KrakenTools with any of the listed programs.
homepage: https://github.com/jenniferlu717/KrakenTools
licence: ["GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- kreports:
type: file
description: List of kraken-style report files
pattern: "*.{txt,kreport}"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- txt:
type: file
description: Combined kreport file of all input files
pattern: "*.txt"
authors:
- "@jfy133"

@ -0,0 +1,36 @@
process KRAKENTOOLS_KREPORT2KRONA {
tag "$meta.id"
label 'process_single'
// WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions.
conda "bioconda::krakentools=1.2"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/krakentools:1.2--pyh5e36f6f_0':
'quay.io/biocontainers/krakentools:1.2--pyh5e36f6f_0' }"
input:
tuple val(meta), path(kreport)
output:
tuple val(meta), path("*.txt"), emit: txt
path "versions.yml", emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def VERSION = '1.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
"""
kreport2krona.py \\
-r ${kreport} \\
-o ${prefix}.txt \\
${args}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
kreport2krona.py: ${VERSION}
END_VERSIONS
"""
}

@ -0,0 +1,41 @@
name: krakentools_kreport2krona
description: Takes a Kraken report file and prints out a krona-compatible TEXT file
keywords:
- kraken
- krona
- metagenomics
- visualization
tools:
- krakentools:
description: KrakenTools is a suite of scripts to be used for post-analysis of Kraken/KrakenUniq/Kraken2/Bracken results. Please cite the relevant paper if using KrakenTools with any of the listed programs.
homepage: https://github.com/jenniferlu717/KrakenTools
licence: ["GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- kreport:
type: file
description: Kraken report
pattern: "*.{txt,kreport}"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- krona:
type: file
description: Krona text-based input file converted from Kraken report
pattern: "*.{txt,krona}"
authors:
- "@MillironX"

@ -0,0 +1,224 @@
process KRAKENUNIQ_PRELOADEDKRAKENUNIQ {
tag "$meta.id"
label 'process_high'
conda "bioconda::krakenuniq=1.0.2"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/krakenuniq:1.0.2--pl5321h19e8d03_0':
'quay.io/biocontainers/krakenuniq:1.0.2--pl5321h19e8d03_0' }"
input:
tuple val(meta), path(fastqs)
path db
val ram_chunk_size
val save_output_fastqs
val report_file
val save_output
output:
tuple val(meta), path('*.classified{.,_}*') , optional:true, emit: classified_reads_fastq
tuple val(meta), path('*.unclassified{.,_}*') , optional:true, emit: unclassified_reads_fastq
tuple val(meta), path('*classified.txt') , optional:true, emit: classified_assignment
tuple val(meta), path('*report.txt') , emit: report
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def args2 = task.ext.args ?: ''
def classified = meta.single_end ? '"\${PREFIX}.classified.fastq"' : '"\${PREFIX}.classified#.fastq"'
def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fastq"' : '"\${PREFIX}.unclassified#.fastq"'
def classified_option = save_output_fastqs ? "--classified-out ${classified}" : ''
def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : ''
def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : ''
def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : ''
def compress_reads_command = save_output_fastqs ? 'gzip --no-name *.fastq' : ''
if (meta.single_end) {
"""
krakenuniq \\
--db $db \\
--preload \\
--preload-size $ram_chunk_size \\
--threads $task.cpus \\
$args
strip_suffix() {
local result=\$1
# Strip any file extensions.
echo "\${result%%.*}"
}
printf "%s\\n" ${fastqs} | while read FASTQ; do \\
PREFIX="\$(strip_suffix "\${FASTQ}")"
krakenuniq \\
--db $db \\
--threads $task.cpus \\
$report \\
$output_option \\
$unclassified_option \\
$classified_option \\
$output_option \\
$args2 \\
"\${FASTQ}"
done
$compress_reads_command
cat <<-END_VERSIONS > versions.yml
"${task.process}":
krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//')
END_VERSIONS
"""
} else {
"""
krakenuniq \\
--db $db \\
--preload \\
--preload-size $ram_chunk_size \\
--threads $task.cpus \\
$args
strip_suffix() {
local result
read result
# Strip any trailing dot or underscore.
result="\${result%_}"
echo "\${result%.}"
}
printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\
read -r -a FASTQ <<< "\${FASTQ}"
PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)"
krakenuniq \\
--db $db \\
--threads $task.cpus \\
$report \\
$output_option \\
$unclassified_option \\
$classified_option \\
$output_option \\
--paired \\
$args2 \\
"\${FASTQ[@]}"
done
$compress_reads_command
cat <<-END_VERSIONS > versions.yml
"${task.process}":
krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//')
END_VERSIONS
"""
}
stub:
def args = task.ext.args ?: ''
def args2 = task.ext.args ?: ''
def classified = meta.single_end ? '"\${PREFIX}.classified.fastq"' : '"\${PREFIX}.classified#.fastq"'
def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fastq"' : '"\${PREFIX}.unclassified#.fastq"'
def classified_option = save_output_fastqs ? "--classified-out ${classified}" : ''
def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : ''
def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : ''
def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : ''
def compress_reads_command = save_output_fastqs ? 'gzip --no-name *.fastq' : ''
if (meta.single_end) {
"""
echo krakenuniq \\
--db $db \\
--preload \\
--preload-size $ram_chunk_size \\
--threads $task.cpus \\
$args
strip_suffix() {
local result=\$1
# Strip any file extensions.
echo "\${result%%.*}"
}
printf "%s\\n" ${fastqs} | while read FASTQ; do \\
echo "\${FASTQ}"
PREFIX="\$(strip_suffix "\${FASTQ}")"
echo "\${PREFIX}"
echo krakenuniq \\
--db $db \\
--threads $task.cpus \\
$report \\
$output_option \\
$unclassified_option \\
$classified_option \\
$output_option \\
$args2 \\
"\${FASTQ}"
touch "\${PREFIX}.classified.fastq.gz"
touch "\${PREFIX}.krakenuniq.classified.txt"
touch "\${PREFIX}.krakenuniq.report.txt"
touch "\${PREFIX}.unclassified.fastq.gz"
done
echo $compress_reads_command
cat <<-END_VERSIONS > versions.yml
"${task.process}":
krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//')
END_VERSIONS
"""
} else {
"""
echo krakenuniq \\
--db $db \\
--preload \\
--preload-size $ram_chunk_size \\
--threads $task.cpus \\
$args
strip_suffix() {
local result
read result
# Strip any trailing dot or underscore.
result="\${result%_}"
echo "\${result%.}"
}
printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\
read -r -a FASTQ <<< "\${FASTQ}"
echo "\${FASTQ[@]}"
PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)"
echo "\${PREFIX}"
echo krakenuniq \\
--db $db \\
--threads $task.cpus \\
$report \\
$output_option \\
$unclassified_option \\
$classified_option \\
$output_option \\
--paired \\
$args2 \\
"\${FASTQ[@]}"
touch "\${PREFIX}.classified_1.fastq.gz" "\${PREFIX}.classified_2.fastq.gz"
touch "\${PREFIX}.krakenuniq.classified.txt"
touch "\${PREFIX}.krakenuniq.report.txt"
touch "\${PREFIX}.unclassified_1.fastq.gz" "\${PREFIX}.unclassified_2.fastq.gz"
done
echo $compress_reads_command
cat <<-END_VERSIONS > versions.yml
"${task.process}":
krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//')
END_VERSIONS
"""
}
}

@ -0,0 +1,78 @@
name: "krakenuniq_preloadedkrakenuniq"
description: Classifies metagenomic sequence data using unique k-mer counts
keywords:
- classify
- metagenomics
- kmers
- fastq
- db
tools:
- "krakenuniq":
description: "Metagenomics classifier with unique k-mer counting for more specific results"
homepage: https://github.com/fbreitwieser/krakenuniq
documentation: https://github.com/fbreitwieser/krakenuniq
doi: 10.1186/s13059-018-1568-0
licence: ["MIT"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- fastqs:
type: file
description: List of input FastQ files
- db:
type: directory
description: KrakenUniq database
- ram_chunk_size:
type: val
description: Amount of maximum amount of RAM each chunk of database that should be loaded at any one time
pattern: "*GB"
- save_output_fastqs:
type: boolean
description: |
If true, optional commands are added to save classified and unclassified reads
as fastq files
- save_reads_assignment:
type: boolean
description: |
If true, an optional command is added to save a file reporting the taxonomic
classification of each input read
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- classified_reads_fastq:
type: file
description: |
Reads classified as belonging to any of the taxa
on the KrakenUniq database.
pattern: "*.fastq.gz"
- unclassified_reads_fastq:
type: file
description: |
Reads not classified to any of the taxa
on the KrakenUniq database.
pattern: "*.fastq.gz"
- classified_assignment:
type: file
description: |
KrakenUniq output file indicating the taxonomic assignment of
each input read ## DOUBLE CHECK!!
- report:
type: file
description: |
KrakenUniq report containing stats about classified
and not classifed reads.
pattern: "*.report.txt"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@mjamy"
- "@Midnighter"

@ -0,0 +1,41 @@
process KRONA_KTIMPORTTAXONOMY {
tag "${meta.id}"
label 'process_single'
// WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions.
conda "bioconda::krona=2.8"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/krona:2.8--pl5262hdfd78af_2' :
'quay.io/biocontainers/krona:2.8--pl5262hdfd78af_2' }"
input:
tuple val(meta), path(report)
path taxonomy, stageAs: 'taxonomy.tab'
output:
tuple val(meta), path ('*.html'), emit: html
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def VERSION = '2.8' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
"""
TAXONOMY=\$(find -L . -name '*.tab' -exec dirname {} \\;)
echo \$TAXONOMY
ktImportTaxonomy \\
$args \\
-o ${prefix}.html \\
-tax \$TAXONOMY/ \\
$report
cat <<-END_VERSIONS > versions.yml
"${task.process}":
krona: $VERSION
END_VERSIONS
"""
}

@ -0,0 +1,48 @@
name: krona_ktimporttaxonomy
description: KronaTools Import Taxonomy imports taxonomy classifications and produces an interactive Krona plot.
keywords:
- plot
- taxonomy
- interactive
- html
- visualisation
- krona chart
tools:
- krona:
description: Krona Tools is a set of scripts to create Krona charts from several Bioinformatics tools as well as from text and XML files.
homepage: https://github.com/marbl/Krona/wiki/KronaTools
documentation: http://manpages.ubuntu.com/manpages/impish/man1/ktImportTaxonomy.1.html
tool_dev_url:
doi: https://doi.org/10.1186/1471-2105-12-385
licence:
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test']
- database:
type: file
description: |
Path to a Krona taxonomy .tab file normally downloaded and generated by
krona/ktUpdateTaxonomy. Custom taxonomy files can have any name, but
must end in `.tab`.
pattern: "*tab"
- report:
type: file
description: "A tab-delimited file with taxonomy IDs and (optionally) query IDs, magnitudes, and scores. Query IDs are taken from column 1, taxonomy IDs from column 2, and scores from column 3. Lines beginning with # will be ignored."
pattern: "*.{tsv}"
output:
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- html:
type: file
description: A html file containing an interactive krona plot.
pattern: "*.{html}"
authors:
- "@mjakobs"

@ -0,0 +1,34 @@
process KRONA_KTIMPORTTEXT {
tag "$meta.id"
label 'process_single'
conda "bioconda::krona=2.8.1"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/krona:2.8.1--pl5321hdfd78af_1':
'quay.io/biocontainers/krona:2.8.1--pl5321hdfd78af_1' }"
input:
tuple val(meta), path(report)
output:
tuple val(meta), path ('*.html'), emit: html
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
ktImportText \\
$args \\
-o ${prefix}.html \\
$report
cat <<-END_VERSIONS > versions.yml
"${task.process}":
krona: \$( echo \$(ktImportText 2>&1) | sed 's/^.*KronaTools //g; s/- ktImportText.*\$//g')
END_VERSIONS
"""
}

@ -0,0 +1,47 @@
name: "krona_ktimporttext"
description: Creates a Krona chart from text files listing quantities and lineages.
keywords:
- plot
- taxonomy
- interactive
- html
- visualisation
- krona chart
- metagenomics
tools:
- krona:
description: Krona Tools is a set of scripts to create Krona charts from several Bioinformatics tools as well as from text and XML files.
homepage: https://github.com/marbl/Krona/wiki/KronaTools
documentation: http://manpages.ubuntu.com/manpages/impish/man1/ktImportTaxonomy.1.html
tool_dev_url: https://github.com/marbl/Krona
doi: 10.1186/1471-2105-12-385
licence: https://raw.githubusercontent.com/marbl/Krona/master/KronaTools/LICENSE.txt
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test']
- report:
type: file
description: "Tab-delimited text file. Each line should be a number followed by a list of wedges to contribute to (starting from the highest level). If no wedges are listed (and just a quantity is given), it will contribute to the top level. If the same lineage is listed more than once, the values will be added. Quantities can be omitted if -q is specified. Lines beginning with '#' will be ignored."
pattern: "*.{txt}"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test' ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- html:
type: file
description: A html file containing an interactive krona plot.
pattern: "*.{html}"
authors:
- "@jianhong"

@ -0,0 +1,47 @@
process MALT_RUN {
tag "$meta.id"
label 'process_high'
conda "bioconda::malt=0.61"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/malt:0.61--hdfd78af_0' :
'quay.io/biocontainers/malt:0.61--hdfd78af_0' }"
input:
tuple val(meta), path(fastqs)
path index
output:
tuple val(meta), path("*.rma6") , emit: rma6
tuple val(meta), path("*.{tab,text,sam}"), optional:true, emit: alignments
tuple val(meta), path("*.log") , emit: log
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def avail_mem = 6
if (!task.memory) {
log.info '[MALT_RUN] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.'
} else {
avail_mem = task.memory.giga
}
"""
malt-run \\
-t $task.cpus \\
-v \\
-o . \\
$args \\
--inFile ${fastqs.join(' ')} \\
--index $index/ |&tee ${prefix}-malt-run.log
cat <<-END_VERSIONS > versions.yml
"${task.process}":
malt: \$(malt-run --help 2>&1 | grep -o 'version.* ' | cut -f 1 -d ',' | cut -f2 -d ' ')
END_VERSIONS
"""
}

@ -0,0 +1,54 @@
name: malt_run
description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics.
keywords:
- malt
- alignment
- metagenomics
- ancient DNA
- aDNA
- palaeogenomics
- archaeogenomics
- microbiome
tools:
- malt:
description: A tool for mapping metagenomic data
homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/
documentation: https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf
tool_dev_url: None
doi: "10.1038/s41559-017-0446-6"
licence: ["GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- fastqs:
type: file
description: Input FASTQ files
pattern: "*.{fastq.gz,fq.gz}"
- index:
type: directory
description: Index/database directory from malt-build
pattern: "*/"
output:
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- rma6:
type: file
description: MEGAN6 RMA6 file
pattern: "*.rma6"
- sam:
type: file
description: Alignment files in Tab, Text or MEGAN-compatible SAM format
pattern: "*.{tab,txt,sam}"
- log:
type: file
description: Log of verbose MALT stdout
pattern: "*-malt-run.log"
authors:
- "@jfy133"

@ -0,0 +1,38 @@
process MEGAN_RMA2INFO {
tag "$meta.id"
label 'process_single'
conda "bioconda::megan=6.21.7"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/megan:6.21.7--h9ee0642_0':
'quay.io/biocontainers/megan:6.21.7--h9ee0642_0' }"
input:
tuple val(meta), path(rma6)
val(megan_summary)
output:
tuple val(meta), path("*.txt.gz") , emit: txt
tuple val(meta), path("*.megan"), optional: true, emit: megan_summary
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def summary = megan_summary ? "-es ${prefix}.megan" : ""
"""
rma2info \\
-i ${rma6} \\
-o ${prefix}.txt.gz \\
${summary} \\
$args
cat <<-END_VERSIONS > versions.yml
"${task.process}":
megan: \$(echo \$(rma2info 2>&1) | grep version | sed 's/.*version //g;s/, built.*//g')
END_VERSIONS
"""
}

@ -0,0 +1,51 @@
name: "megan_rma2info"
description: Analyses an RMA file and exports information in text format
keywords:
- megan
- rma6
- classification
- conversion
tools:
- "megan":
description: "A tool for studying the taxonomic content of a set of DNA reads"
homepage: "https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/"
documentation: "https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html"
tool_dev_url: "https://github.com/husonlab/megan-ce"
doi: "10.1371/journal.pcbi.1004957"
licence: "['GPL >=3']"
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- rma6:
type: file
description: RMA6 file from MEGAN or MALT
pattern: "*.rma6"
- megan_summary:
type: boolean
description: Specify whether to generate an MEGAN summary file
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- txt:
type: file
description: Compressed text file
pattern: "*.txt.gz"
- megan_summary:
type: file
description: Optionally generated MEGAN summary file
pattern: "*.megan"
authors:
- "@jfy133"

@ -0,0 +1,33 @@
process METAPHLAN3_MERGEMETAPHLANTABLES {
label 'process_single'
conda "bioconda::metaphlan=3.0.12"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/metaphlan:3.0.12--pyhb7b1952_0' :
'quay.io/biocontainers/metaphlan:3.0.12--pyhb7b1952_0' }"
input:
tuple val(meta), path(profiles)
output:
tuple val(meta), path("${prefix}.txt") , emit: txt
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
prefix = task.ext.prefix ?: "${meta.id}"
"""
merge_metaphlan_tables.py \\
$args \\
-o ${prefix}.txt \\
${profiles}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
metaphlan3: \$(metaphlan --version 2>&1 | awk '{print \$3}')
END_VERSIONS
"""
}

@ -0,0 +1,44 @@
name: "metaphlan3_mergemetaphlantables"
description: Merges output abundance tables from MetaPhlAn3
keywords:
- metagenomics
- classification
- merge
- table
- profiles
tools:
- metaphlan3:
description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance
homepage: https://huttenhower.sph.harvard.edu/metaphlan/
documentation: https://github.com/biobakery/MetaPhlAn
doi: "10.7554/eLife.65088"
licence: ["MIT License"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- profiles:
type: file
description: List of per-sample MetaPhlAn3 taxonomic abundance tables
pattern: "*"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- txt:
type: txt
description: Combined MetaPhlAn3 table
pattern: "*.txt"
authors:
- "@jfy133"

@ -0,0 +1,48 @@
process METAPHLAN3_METAPHLAN3 {
tag "$meta.id"
label 'process_high'
conda "bioconda::metaphlan=3.0.12"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/metaphlan:3.0.12--pyhb7b1952_0' :
'quay.io/biocontainers/metaphlan:3.0.12--pyhb7b1952_0' }"
input:
tuple val(meta), path(input)
path metaphlan_db
output:
tuple val(meta), path("*_profile.txt") , emit: profile
tuple val(meta), path("*.biom") , emit: biom
tuple val(meta), path('*.bowtie2out.txt'), optional:true, emit: bt2out
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def input_type = ("$input".endsWith(".fastq.gz") || "$input".endsWith(".fq.gz")) ? "--input_type fastq" : ("$input".contains(".fasta")) ? "--input_type fasta" : ("$input".endsWith(".bowtie2out.txt")) ? "--input_type bowtie2out" : "--input_type sam"
def input_data = ("$input_type".contains("fastq")) && !meta.single_end ? "${input[0]},${input[1]}" : "$input"
def bowtie2_out = "$input_type" == "--input_type bowtie2out" || "$input_type" == "--input_type sam" ? '' : "--bowtie2out ${prefix}.bowtie2out.txt"
"""
BT2_DB=`find -L "${metaphlan_db}" -name "*rev.1.bt2" -exec dirname {} \\;`
metaphlan \\
--nproc $task.cpus \\
$input_type \\
$input_data \\
$args \\
$bowtie2_out \\
--bowtie2db \$BT2_DB \\
--biom ${prefix}.biom \\
--output_file ${prefix}_profile.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
metaphlan3: \$(metaphlan --version 2>&1 | awk '{print \$3}')
END_VERSIONS
"""
}

@ -0,0 +1,58 @@
name: metaphlan3_metaphlan3
description: MetaPhlAn is a tool for profiling the composition of microbial communities from metagenomic shotgun sequencing data.
keywords:
- metagenomics
- classification
- fastq
- bam
- fasta
tools:
- metaphlan3:
description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance
homepage: https://huttenhower.sph.harvard.edu/metaphlan/
documentation: https://github.com/biobakery/MetaPhlAn
doi: "10.7554/eLife.65088"
licence: ["MIT License"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- input:
type: file
description: Metaphlan 3.0 can classify the metagenome from a variety of input data types, including FASTQ files (single-end and paired-end), FASTA, bowtie2-produced SAM files (produced from alignments to the MetaPHlAn marker database) and intermediate bowtie2 alignment files (bowtie2out)
pattern: "*.{fastq.gz, fasta, fasta.gz, sam, bowtie2out.txt}"
- metaphlan_db:
type: file
description: |
Directory containing pre-downloaded and uncompressed MetaPhlAn3 database downloaded from: http://cmprod1.cibio.unitn.it/biobakery3/metaphlan_databases/.
Note that you will also need to specify `--index` and the database version name (e.g. 'mpa_v31_CHOCOPhlAn_201901') in your module.conf ext.args for METAPHLAN3_METAPHLAN3!
pattern: "*/"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- profile:
type: file
description: Tab-separated output file of the predicted taxon relative abundances
pattern: "*.{txt}"
- biom:
type: file
description: General-use format for representing biological sample by observation contingency tables
pattern: "*.{biom}"
- bowtie2out:
type: file
description: Intermediate Bowtie2 output produced from mapping the metagenome against the MetaPHlAn marker database ( not compatible with `bowtie2out` files generated with MetaPhlAn versions below 3 )
pattern: "*.{bowtie2out.txt}"
authors:
- "@MGordon09"

@ -0,0 +1,48 @@
process MINIMAP2_ALIGN {
tag "$meta.id"
label 'process_medium'
// Note: the versions here need to match the versions used in the mulled container below and minimap2/index
conda "bioconda::minimap2=2.24 bioconda::samtools=1.14"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' :
'quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }"
input:
tuple val(meta), path(reads)
path reference
val bam_format
val cigar_paf_format
val cigar_bam
output:
tuple val(meta), path("*.paf"), optional: true, emit: paf
tuple val(meta), path("*.bam"), optional: true, emit: bam
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def bam_output = bam_format ? "-a | samtools sort | samtools view -@ ${task.cpus} -b -h -o ${prefix}.bam" : "-o ${prefix}.paf"
def cigar_paf = cigar_paf_format && !bam_format ? "-c" : ''
def set_cigar_bam = cigar_bam && bam_format ? "-L" : ''
"""
minimap2 \\
$args \\
-t $task.cpus \\
"${reference ?: reads}" \\
"$reads" \\
$cigar_paf \\
$set_cigar_bam \\
$bam_output
cat <<-END_VERSIONS > versions.yml
"${task.process}":
minimap2: \$(minimap2 --version 2>&1)
END_VERSIONS
"""
}

@ -0,0 +1,65 @@
name: minimap2_align
description: A versatile pairwise aligner for genomic and spliced nucleotide sequences
keywords:
- align
- fasta
- fastq
- genome
- paf
- reference
tools:
- minimap2:
description: |
A versatile pairwise aligner for genomic and spliced nucleotide sequences.
homepage: https://github.com/lh3/minimap2
documentation: https://github.com/lh3/minimap2#uguide
licence: ["MIT"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input FASTA or FASTQ files of size 1 and 2 for single-end
and paired-end data, respectively.
- reference:
type: file
description: |
Reference database in FASTA format.
- bam_format:
type: boolean
description: Specify that output should be in BAM format
- cigar_paf_format:
type: boolean
description: Specify that output CIGAR should be in PAF format
- cigar_bam:
type: boolean
description: |
Write CIGAR with >65535 ops at the CG tag. This is recommended when
doing XYZ (https://github.com/lh3/minimap2#working-with-65535-cigar-operations)
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- paf:
type: file
description: Alignment in PAF format
pattern: "*.paf"
- bam:
type: file
description: Alignment in BAM format
pattern: "*.bam"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@heuermh"
- "@sofstam"
- "@sateeshperi"
- "@jfy133"

@ -0,0 +1,34 @@
process MINIMAP2_INDEX {
label 'process_medium'
// Note: the versions here need to match the versions used in minimap2/align
conda "bioconda::minimap2=2.24"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/minimap2:2.24--h7132678_1' :
'quay.io/biocontainers/minimap2:2.24--h7132678_1' }"
input:
tuple val(meta), path(fasta)
output:
tuple val(meta), path("*.mmi"), emit: index
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
"""
minimap2 \\
-t $task.cpus \\
-d ${fasta.baseName}.mmi \\
$args \\
$fasta
cat <<-END_VERSIONS > versions.yml
"${task.process}":
minimap2: \$(minimap2 --version 2>&1)
END_VERSIONS
"""
}

@ -0,0 +1,40 @@
name: minimap2_index
description: Provides fasta index required by minimap2 alignment.
keywords:
- index
- fasta
- reference
tools:
- minimap2:
description: |
A versatile pairwise aligner for genomic and spliced nucleotide sequences.
homepage: https://github.com/lh3/minimap2
documentation: https://github.com/lh3/minimap2#uguide
licence: ["MIT"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- fasta:
type: file
description: |
Reference database in FASTA format.
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- index:
type: file
description: Minimap2 fasta index.
pattern: "*.mmi"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@yuukiiwa"
- "@drpatelh"

@ -0,0 +1,45 @@
process MOTUS_MERGE {
tag "$meta.id"
label 'process_single'
conda "bioconda::motus=3.0.3"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/motus:3.0.3--pyhdfd78af_0':
'quay.io/biocontainers/motus:3.0.3--pyhdfd78af_0' }"
input:
tuple val(meta), path(input)
path db // to stop docker saying it can't find it... would have to have the module in upstream steps anyway
path profile_version_yml, stageAs: 'profile_version.yml'
output:
tuple val(meta), path("*.txt") , optional: true, emit: txt
tuple val(meta), path("*.biom"), optional: true, emit: biom
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def cmd_input = input.size() > 1 ? "-i ${input.join(',')}" : input.isDirectory() ? "-d ${input}" : "-i ${input}"
def suffix = task.ext.args?.contains("-B") ? "biom" : "txt"
"""
motus \\
merge \\
-db $db \\
${cmd_input} \\
$args \\
-o ${prefix}.${suffix}
## Take version from the mOTUs/profile module output, as cannot reconstruct
## version without having database staged in this directory.
VERSION=\$(cat ${profile_version_yml} | grep '/*motus:.*' | sed 's/.*otus: //g')
cat <<-END_VERSIONS > versions.yml
"${task.process}":
motus: \$VERSION
END_VERSIONS
"""
}

@ -0,0 +1,54 @@
name: "motus_merge"
description: Taxonomic meta-omics profiling using universal marker genes
keywords:
- classify
- metagenomics
- fastq
- taxonomic profiling
- merging
- merge
- otu table
tools:
- "motus":
description: "Marker gene-based OTU (mOTU) profiling"
homepage: "https://motu-tool.org/"
documentation: "https://github.com/motu-tool/mOTUs/wiki"
tool_dev_url: "https://github.com/motu-tool/mOTUs"
doi: "10.1186/s40168-022-01410-z"
licence: "['GPL v3']"
input:
- input:
type: file
description: |
List of output files (more than one) from motus profile,
or a single directory containing motus output files.
- db:
type: directory
description: |
mOTUs database downloaded by `motus downloadDB`
pattern: "db_mOTU/"
- profile_version_yml:
type: file
description: |
A single versions.yml file output from motus/profile. motus/merge cannot reconstruct
this itself without having the motus database present and configured with the tool
so here we take it from what is already reported by the upstream module.
pattern: "versions.yml"
output:
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- txt:
type: file
description: OTU table in txt format, if BIOM format not requested
pattern: "*.txt"
- biom:
type: file
description: OTU table in biom format, if BIOM format requested
pattern: "*.biom"
authors:
- "@jfy133"

@ -0,0 +1,56 @@
process MOTUS_PROFILE {
tag "$meta.id"
label 'process_medium'
conda "bioconda::motus=3.0.3"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/motus:3.0.3--pyhdfd78af_0':
'quay.io/biocontainers/motus:3.0.3--pyhdfd78af_0' }"
input:
tuple val(meta), path(reads)
path db
output:
tuple val(meta), path("*.out"), emit: out
tuple val(meta), path("*.bam"), optional: true, emit: bam
tuple val(meta), path("*.mgc"), optional: true, emit: mgc
tuple val(meta), path("*.log") , emit: log
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def inputs = reads[0].getExtension() == 'bam' ?
"-i ${reads}" :
reads[0].getExtension() == 'mgc' ? "-m $reads" :
meta.single_end ?
"-s $reads" : "-f ${reads[0]} -r ${reads[1]}"
def refdb = db ? "-db ${db}" : ""
"""
motus profile \\
$args \\
$inputs \\
$refdb \\
-t $task.cpus \\
-n $prefix \\
-o ${prefix}.out \\
2> ${prefix}.log
## mOTUs version number is not available from command line.
## mOTUs save the version number in index database folder.
## mOTUs will check the database version is same version as exec version.
if [ "$db" == "" ]; then
VERSION=\$(echo \$(motus -h 2>&1) | sed 's/^.*Version: //; s/References.*\$//')
else
VERSION=\$(grep motus $db/db_mOTU_versions | sed 's/motus\\t//g')
fi
cat <<-END_VERSIONS > versions.yml
"${task.process}":
motus: \$VERSION
END_VERSIONS
"""
}

@ -0,0 +1,65 @@
name: "motus_profile"
description: Taxonomic meta-omics profiling using universal marker genes
keywords:
- classify
- metagenomics
- fastq
- taxonomic profiling
tools:
- "motus":
description: "Marker gene-based OTU (mOTU) profiling"
homepage: "https://motu-tool.org/"
documentation: "https://github.com/motu-tool/mOTUs/wiki"
tool_dev_url: "https://github.com/motu-tool/mOTUs"
doi: "10.1186/s40168-022-01410-z"
licence: "['GPL v3']"
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input fastq/fasta files of size 1 and 2 for single-end and paired-end data,
respectively.
Or the intermediate bam file mapped by bwa to the mOTUs database or
the output bam file from motus profile.
Or the intermediate mgc read counts table.
pattern: "*.{fastq,fq,fasta,fa,fastq.gz,fq.gz,fasta.gz,fa.gz,.bam,.mgc}"
- db:
type: directory
description: |
mOTUs database downloaded by `motus downloadDB`
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- out:
type: file
description: Results with taxonomic classification of each read
pattern: "*.out"
- bam:
type: file
description: Optional intermediate sorted BAM file from BWA
pattern: "*.{bam}"
- mgc:
type: file
description: Optional intermediate mgc read count table file saved with `-M`.
pattern: "*.{mgc}"
- log:
type: file
description: Standard error logging file containing summary statistics
pattern: "*.log"
authors:
- "@jianhong"

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save