diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 82bdbc3..5cea5b7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,6 +22,21 @@ jobs: NXF_VER: - "21.10.3" - "latest-everything" + parameters: + - "--perform_longread_qc false" + - "--perform_shortread_qc false" + - "--shortread_qc_tool fastp" + - "--shortread_qc_tool fastp --shortread_qc_mergepairs --shortread_qc_excludeunmerged" + - "--shortread_qc_tool fastp --shortread_qc_mergepairs" + - "--shortread_qc_tool adapterremoval" + - "--shortread_qc_tool adapterremoval --shortread_qc_mergepairs --shortread_qc_excludeunmerged" + - "--shortread_qc_tool adapterremoval --shortread_qc_mergepairs" + - "--shortread_complexityfilter_tool bbduk" + - "--shortread_complexityfilter_tool prinseqplusplus" + - "--perform_runmerging" + - "--perform_runmerging --shortread_qc_mergepairs" + - "--shortread_complexityfilter false --perform_shortread_hostremoval" + steps: - name: Check out pipeline code uses: actions/checkout@v2 @@ -31,9 +46,56 @@ jobs: with: version: "${{ matrix.NXF_VER }}" - - name: Run pipeline with test data - # TODO nf-core: You can customise CI pipeline run tests as required - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix + - name: Show current locale + run: locale + + - name: Set UTF-8 enabled locale run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + sudo locale-gen en_US.UTF-8 + sudo update-locale LANG=en_US.UTF-8 + + - name: Run pipeline with test data + uses: Wandalen/wretry.action@v1.0.11 + with: + command: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results ${{ matrix.parameters }} + attempt_limit: 3 + + motus: + name: Test mOTUs with workflow parameters + if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/taxprofiler') }} + runs-on: ubuntu-latest + strategy: + matrix: + NXF_VER: + - "21.10.3" + - "latest-everything" + + steps: + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v1 + with: + version: "${{ matrix.NXF_VER }}" + + - name: Show current locale + run: locale + + - name: Set UTF-8 enabled locale + run: | + sudo locale-gen en_US.UTF-8 + sudo update-locale LANG=en_US.UTF-8 + + - name: Prepare the database + run: | + wget https://raw.githubusercontent.com/motu-tool/mOTUs/master/motus/downloadDB.py + python downloadDB.py > download_db_log.txt + echo 'tool,db_name,db_params,db_path' > 'database_motus.csv' + echo 'motus,db_mOTU,,db_mOTU' >> 'database_motus.csv' + + - name: Run pipeline with test data + uses: Wandalen/wretry.action@v1.0.11 + with: + command: nextflow run ${GITHUB_WORKSPACE} -profile test_motus,docker --outdir ./results --databases ./database_motus.csv + attempt_limit: 3 diff --git a/.prettierignore b/.prettierignore index eb74a57..f865061 100644 --- a/.prettierignore +++ b/.prettierignore @@ -8,3 +8,4 @@ results/ testing/ testing* *.pyc +tests/ diff --git a/CITATIONS.md b/CITATIONS.md index c5463be..1ce4ec2 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -13,8 +13,55 @@ - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) + > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +- [fastp](https://doi.org/10.1093/bioinformatics/bty560) + + > Chen, Shifu, Yanqing Zhou, Yaru Chen, and Jia Gu. 2018. Fastp: An Ultra-Fast All-in-One FASTQ Preprocessor. Bioinformatics 34 (17): i884-90. 10.1093/bioinformatics/bty560. + +- [AdapterRemoval2](https://doi.org/10.1186/s13104-016-1900-2) + + > Schubert, Mikkel, Stinus Lindgreen, and Ludovic Orlando. 2016. AdapterRemoval v2: Rapid Adapter Trimming, Identification, and Read Merging. BMC Research Notes 9 (February): 88. doi:10.1186/s13104-016-1900-2. + +- [Porechop](https://github.com/rrwick/Porechop) + +- [BBTools](http://sourceforge.net/projects/bbmap/) + +- [PRINSEQ++](https://doi.org/10.7287/peerj.preprints.27553v1) + + > Cantu, Vito Adrian, Jeffrey Sadural, and Robert Edwards. 2019. PRINSEQ++, a Multi-Threaded Tool for Fast and Efficient Quality Control and Preprocessing of Sequencing Datasets. e27553v1. PeerJ Preprints. doi: 10.7287/peerj.preprints.27553v1. + +- [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) + + > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0. + +- [Krona](https://doi.org/10.1186/1471-2105-12-385) + + > Ondov, Brian D., Nicholas H. Bergman, and Adam M. Phillippy. 2011. Interactive metagenomic visualization in a Web browser. BMC Bioinformatics 12 (1): 385. doi: 10.1186/1471-2105-12-385. + +- [MALT](https://doi.org/10.1038/s41559-017-0446-6) + + > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico. Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6. + +- [MEGAN](https://doi.org/10.1371/journal.pcbi.1004957) + + > Huson, Daniel H., Sina Beier, Isabell Flade, Anna Górska, Mohamed El-Hadidi, Suparna Mitra, Hans-Joachim Ruscheweyh, and Rewati Tappu. 2016. “MEGAN Community Edition - Interactive Exploration and Analysis of Large-Scale Microbiome Sequencing Data.” PLoS Computational Biology 12 (6): e1004957. doi: 10.1371/journal.pcbi.1004957. + +- [MetaPhlAn3](https://doi.org/10.7554/eLife.65088) + + > Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088. doi: 10.7554/eLife.65088 + +- [Centrifuge](https://doi.org/10.1101/gr.210641.116) + + > Kim, Daehwan, Li Song, Florian P. Breitwieser, and Steven L. Salzberg. 2016. “Centrifuge: Rapid and Sensitive Classification of Metagenomic Sequences.” Genome Research 26 (12): 1721-29. doi: 10.1101/gr.210641.116. + +- [DIAMOND](https://doi.org/10.1038/nmeth.3176) + +> Buchfink, Benjamin, Chao Xie, and Daniel H. Huson. 2015. “Fast and Sensitive Protein Alignment Using DIAMOND.” Nature Methods 12 (1): 59-60. doi: 10.1038/nmeth.3176. + +- [FILTLONG](https://github.com/rrwick/Filtlong) + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index a8e2956..11eb9a3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# ![nf-core/taxprofiler](docs/images/nf-core-taxprofiler_logo_light.png#gh-light-mode-only) ![nf-core/taxprofiler](docs/images/nf-core-taxprofiler_logo_dark.png#gh-dark-mode-only) +# ![nf-core/taxprofiler](docs/images/nf-core-taxprofiler_logo_custom_light.png#gh-light-mode-only) ![nf-core/taxprofiler](docs/images/nf-core-taxprofiler_logo_custom_dark.png#gh-dark-mode-only) [![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/taxprofiler/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) @@ -12,9 +12,11 @@ ## Introduction +> ⚠️ This pipeline is still under development! While the pipeline is usable, not all functionality will be available! + -**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for Taxonomic profiling of shotgun metagenomic data. +**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling with multiple profiling tools against multiple databases, produces standardised output tables. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! @@ -26,8 +28,27 @@ On release, automated continuous integration tests run the pipeline on a full-si +![](docs/images/taxprofiler_tube.png) + 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +2. Performs optional read pre-processing + - Adapter clipping and merging (short read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long read: [porechop](https://github.com/rrwick/Porechop)) + - Low complexity filtering ([bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus)) + - Host read removal ([BowTie2](http://bowtie-bio.sourceforge.net/bowtie2/)) + - Run merging +3. Performs taxonomic profiling using one or more of: + - [Kraken2](https://ccb.jhu.edu/software/kraken2/) + - [MetaPhlAn3](https://huttenhower.sph.harvard.edu/metaphlan/) + - [MALT](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/malt/) + - [DIAMOND](https://github.com/bbuchfink/diamond) + - [Centrifuge](https://ccb.jhu.edu/software/centrifuge/) + - [Kaiju](https://kaiju.binf.ku.dk/) + - [mOTUs](https://motu-tool.org/) + - [MetaMaps](https://github.com/DiltheyLab/MetaMaps) +4. Perform optional post-processing with: + - [bracken](https://ccb.jhu.edu/software/bracken/) +5. Standardises output tables +6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) ## Quick Start @@ -50,12 +71,12 @@ On release, automated continuous integration tests run the pipeline on a full-si 4. Start running your own analysis! - - - ```bash - nextflow run nf-core/taxprofiler --input samplesheet.csv --outdir --genome GRCh37 -profile + ```console + nextflow run nf-core/taxprofiler --input samplesheet.csv --databases database.csv --outdir --run_ --run_ -profile ``` +Note pipeline supports both CSV and PEP input sample sheets. Find out more [here](http://pep.databio.org/en/2.1.0/specification/). + ## Documentation The nf-core/taxprofiler pipeline comes with documentation about the pipeline [usage](https://nf-co.re/taxprofiler/usage), [parameters](https://nf-co.re/taxprofiler/parameters) and [output](https://nf-co.re/taxprofiler/output). @@ -66,7 +87,7 @@ nf-core/taxprofiler was originally written by nf-core community. We thank the following people for their extensive assistance in the development of this pipeline: - +[James A. Fellows Yates](https://github.com/jfy133), [Moritz Beber](https://github.com/Midnighter), [Lauri Mesilaakso](https://github.com/ljmesi), [Sofia Stamouli](https://github.com/sofsam), [Maxime Borry](https://github.com/maxibor). ## Contributions and Support diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 1db0023..e2b5a6e 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -11,3 +11,46 @@ report_section_order: order: -1002 export_plots: true + +custom_logo: "nf-core-taxprofiler_logo_custom_light.png" +custom_logo_url: https://nf-co.re/taxprofiler +custom_logo_title: "nf-core/taxprofiler" + +run_modules: + - fastqc + - adapterRemoval + - fastp + - bowtie2 + - kraken + - malt + - custom_content + +#extra_fn_clean_exts: +# - '_fastp' +# - '.pe.settings' +# - '.se.settings' + +top_modules: + - "fastqc": + name: "FastQC (pre-Trimming)" + path_filters: + - "*raw_*fastqc.zip" + - "fastp" + - "adapterRemoval" + - "fastqc": + name: "FastQC (post-Trimming)" + path_filters: + - "*raw_*processed.zip" + - "kraken": + name: "Kraken" + path_filters: + - "*.kraken2.report.txt" + - "kraken": + name: "Centrifuge" + anchor: "centrifuge" + target: "Centrifuge" + doi: "10.1101/gr.210641.116" + info: "is a very rapid and memory-efficient system for the classification of DNA sequences from microbial samples. The system uses a novel indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index. Note: Figure title" + extra: "Note: plot title will say Kraken2 due to Centrifuge producing the same output format as Kraken. If activated, see the actual Kraken2 results in the section above." + path_filters: + - "*.centrifuge.txt" diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab..82565b1 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,6 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta +2611,ERR5766174,ILLUMINA,,,///fasta/ERX5474930_ERR5766174_1.fa.gz +2612,ERR5766176,ILLUMINA,///fastq/ERX5474932_ERR5766176_1.fastq.gz,///fastq/ERX5474932_ERR5766176_2.fastq.gz, +2612,ERR5766180,ILLUMINA,///fastq/ERX5474936_ERR5766180_1.fastq.gz,, +2613,ERR5766181,ILLUMINA,///fastq/ERX5474937_ERR5766181_1.fastq.gz,///fastq/ERX5474937_ERR5766181_2.fastq.gz, +ERR3201952,ERR3201952,OXFORD_NANOPORE,///fastq/ERR3201952.fastq.gz,, diff --git a/assets/samplesheet_schema.yaml b/assets/samplesheet_schema.yaml new file mode 100644 index 0000000..88ff451 --- /dev/null +++ b/assets/samplesheet_schema.yaml @@ -0,0 +1,55 @@ +description: A schema for validation of samplesheet.csv for taxprofiler pipeline. +imports: + - https://schema.databio.org/pep/2.1.0.yaml +properties: + samples: + type: array + items: + type: object + properties: + sample: + type: string + description: "Sample identifier." + pattern: "^\\S*$" + run_accession: + type: string + description: "Run accession number." + instrument_platform: + type: string + description: "Name of the platform that sequenced the samples." + enum: + [ + "ABI_SOLID", + "BGISEQ", + "CAPILLARY", + "COMPLETE_GENOMICS", + "DNBSEQ", + "HELICOS", + "ILLUMINA", + "ION_TORRENT", + "LS454", + "OXFORD_NANOPORE", + "PACBIO_SMRT", + ] + fastq1: + type: ["string", "null"] + description: "Optional FASTQ file for read 1 of paired-end sequenced libraries." + pattern: "^[\\S]+.(fq\\.gz|fastq\\.gz)$" + fastq2: + type: ["string", "null"] + description: "Optional FASTQ file for read 2 of paired-end sequenced libraries." + pattern: "^[\\S]+.(fq\\.gz|fastq\\.gz)$" + fasta: + type: ["string", "null"] + description: "Optional FASTA file." + pattern: "^[\\S]+.(fa\\.gz|fasta\\.gz)$" + required: + - sample + - run_accession + - instrument_platform + files: + - fastq1 + - fastq2 + - fasta +required: + - samples diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py deleted file mode 100755 index 11b1557..0000000 --- a/bin/check_samplesheet.py +++ /dev/null @@ -1,262 +0,0 @@ -#!/usr/bin/env python - - -"""Provide a command line tool to validate and transform tabular samplesheets.""" - - -import argparse -import csv -import logging -import sys -from collections import Counter -from pathlib import Path - -logger = logging.getLogger() - - -class RowChecker: - """ - Define a service that can validate and transform each given row. - - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. The order of rows is maintained. - - """ - - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", - ) - - def __init__( - self, - sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - single_col="single_end", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). - first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). - second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - single_col (str): The name of the new column that will be inserted and - records whether the sample contains single- or paired-end sequencing - reads (default "single_end"). - - """ - super().__init__(**kwargs) - self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._single_col = single_col - self._seen = set() - self.modified = [] - - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) - self.modified.append(row) - - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - if len(row[self._sample_col]) <= 0: - raise AssertionError("Sample input is required.") - # Sanitize samples slightly. - row[self._sample_col] = row[self._sample_col].replace(" ", "_") - - def _validate_first(self, row): - """Assert that the first FASTQ entry is non-empty and has the right format.""" - if len(row[self._first_col]) <= 0: - raise AssertionError("At least the first FASTQ file is required.") - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. Report pair status.""" - if row[self._first_col] and row[self._second_col]: - row[self._single_col] = False - first_col_suffix = Path(row[self._first_col]).suffixes[-2:] - second_col_suffix = Path(row[self._second_col]).suffixes[-2:] - if first_col_suffix != second_col_suffix: - raise AssertionError("FASTQ pairs must have the same file extensions.") - else: - row[self._single_col] = True - - def _validate_fastq_format(self, filename): - """Assert that a given filename has one of the expected FASTQ extensions.""" - if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): - raise AssertionError( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) - - def validate_unique_samples(self): - """ - Assert that the combination of sample name and FASTQ filename is unique. - - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. - - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and FASTQ must be unique.") - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - row[self._sample_col] = f"{sample}_T{seen[sample]}" - - -def read_head(handle, num_lines=10): - """Read the specified number of lines from the current position in the file.""" - lines = [] - for idx, line in enumerate(handle): - if idx == num_lines: - break - lines.append(line) - return "".join(lines) - - -def sniff_format(handle): - """ - Detect the tabular format. - - Args: - handle (text file): A handle to a `text file`_ object. The read position is - expected to be at the beginning (index 0). - - Returns: - csv.Dialect: The detected tabular format. - - .. _text file: - https://docs.python.org/3/glossary.html#term-text-file - - """ - peek = read_head(handle) - handle.seek(0) - sniffer = csv.Sniffer() - if not sniffer.has_header(peek): - logger.critical("The given sample sheet does not appear to contain a header.") - sys.exit(1) - dialect = sniffer.sniff(peek) - return dialect - - -def check_samplesheet(file_in, file_out): - """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. - - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. - - Args: - file_in (pathlib.Path): The given tabular samplesheet. The format can be either - CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. - file_out (pathlib.Path): Where the validated and transformed samplesheet should - be created; always in CSV format. - - Example: - This function checks that the samplesheet follows the following structure, - see also the `viral recon samplesheet`_:: - - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, - - .. _viral recon samplesheet: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - - """ - required_columns = {"sample", "fastq_1", "fastq_2"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Validate the existence of the expected header columns. - if not required_columns.issubset(reader.fieldnames): - req_cols = ", ".join(required_columns) - logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") - sys.exit(1) - # Validate each row. - checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - header.insert(1, "single_end") - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) - - -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Transformed output samplesheet in CSV format.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(argv) - - -def main(argv=None): - """Coordinate argument parsing and program execution.""" - args = parse_args(argv) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - if not args.file_in.is_file(): - logger.error(f"The given input file {args.file_in} was not found!") - sys.exit(2) - args.file_out.parent.mkdir(parents=True, exist_ok=True) - check_samplesheet(args.file_in, args.file_out) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/conf/modules.config b/conf/modules.config index da58a5d..d2a0051 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,13 +12,7 @@ process { - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - - withName: SAMPLESHEET_CHECK { + withName: DATABASE_CHECK { publishDir = [ path: { "${params.outdir}/pipeline_info" }, mode: params.publish_dir_mode, @@ -28,6 +22,409 @@ process { withName: FASTQC { ext.args = '--quiet' + ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } + publishDir = [ + path: { "${params.outdir}/fastqc/raw" }, + mode: params.publish_dir_mode, + pattern: '*.html' + ] + } + + withName: FASTQC_PROCESSED { + ext.args = '--quiet' + ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } + publishDir = [ + path: { "${params.outdir}/fastqc/processed" }, + mode: params.publish_dir_mode, + pattern: '*.html' + ] + } + + withName: FASTP_SINGLE { + ext.args = [ + // trimming options + params.shortread_qc_skipadaptertrim ? "--disable_adapter_trimming" : "", + params.shortread_qc_adapter1 ? "--adapter_sequence ${params.shortread_qc_adapter1}" : "", + // filtering options + "--length_required ${params.shortread_qc_minlength}", + (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp') ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' + ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/fastp" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads + ] + } + + withName: FASTP_PAIRED { + ext.args = [ + // collapsing options - option to retain singletons + params.shortread_qc_excludeunmerged ? '' : "--include_unmerged", + // trimming options + params.shortread_qc_skipadaptertrim ? "--disable_adapter_trimming" : "", + params.shortread_qc_adapter1 ? "--adapter_sequence ${params.shortread_qc_adapter1}" : "", + params.shortread_qc_adapter2 ? "--adapter_sequence_r2 ${params.shortread_qc_adapter2}" : "--detect_adapter_for_pe", + // filtering options + "--length_required ${params.shortread_qc_minlength}", + params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' + ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/fastp" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads + ] + } + + withName: ADAPTERREMOVAL_SINGLE { + ext.args = [ + // trimming options + params.shortread_qc_skipadaptertrim ? "--adapter1 '' --adapter2 ''" : "", + params.shortread_qc_adapter1 ? "--adapter1 ${params.shortread_qc_adapter1}" : "", + // filtering options + "--minlength ${params.shortread_qc_minlength}" + ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/adapterremoval" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads + ] + } + + withName: ADAPTERREMOVAL_PAIRED { + ext.args = [ + // collapsing options + params.shortread_qc_mergepairs ? "--collapse" : "", + // trimming options + params.shortread_qc_skipadaptertrim ? "--adapter1 '' --adapter2 ''" : "", + params.shortread_qc_adapter1 ? "--adapter1 ${params.shortread_qc_adapter1}" : "", + params.shortread_qc_adapter2 ? "--adapter2 ${params.shortread_qc_adapter2}" : "", + // filtering options + "--minlength ${params.shortread_qc_minlength}" + ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/adapterremoval" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads + ] + } + + withName: PORECHOP { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/porechop" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads + ] + } + + withName: FILTLONG { + ext.args = [ + "--min_length ${params.longread_qc_qualityfilter_minlength}", + "--keep_percent ${params.longread_qc_qualityfilter_keeppercent}", + "--target_bases ${params.longread_qc_qualityfilter_targetbases}" + ] + .join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}_filtered" } + publishDir = [ + path: { "${params.outdir}/filtlong" }, + mode: params.publish_dir_mode, + pattern: '*.{fastq.gz,log}', + enabled: params.save_preprocessed_reads + ] + } + + withName: BOWTIE2_BUILD { + publishDir = [ + path: { "${params.outdir}/bowtie2/build" }, + mode: params.publish_dir_mode, + enabled: params.save_hostremoval_index, + pattern: 'bowtie2' + ] + } + + withName: BOWTIE2_ALIGN { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + [ + path: { "${params.outdir}/bowtie2/align" }, + mode: params.publish_dir_mode, + pattern: '*.log' + ], + [ + path: { "${params.outdir}/bowtie2/align" }, + mode: params.publish_dir_mode, + enabled: params.save_hostremoval_mapped, + pattern: '*.bam' + ], + [ + path: { "${params.outdir}/bowtie2/align" }, + mode: params.publish_dir_mode, + enabled: params.save_hostremoval_unmapped, + pattern: '*.fastq.gz' + ] + ] + } + + withName: MINIMAP2_INDEX { + ext.args = '-x map-ont' + publishDir = [ + path: { "${params.outdir}/minimap2/index" }, + mode: params.publish_dir_mode, + enabled: params.save_hostremoval_index, + pattern: 'minimap2' + ] + } + + withName: MINIMAP2_ALIGN { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/minimap2/align" }, + mode: params.publish_dir_mode, + enabled: params.save_hostremoval_mapped, + pattern: '*.bam' + ] + } + + withName: SAMTOOLS_VIEW { + ext.args = '-f 4' + ext.prefix = { "${meta.id}.mapped.sorted" } + publishDir = [ + path: { "${params.outdir}/samtools/view" }, + mode: params.publish_dir_mode, + enabled: params.save_hostremoval_unmapped, + pattern: '*.bam' + ] + } + + withName: SAMTOOLS_BAM2FQ { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/samtools/bam2fq" }, + mode: params.publish_dir_mode, + enabled: params.save_hostremoval_unmapped, + pattern: '*.fq.gz' + ] + } + + withName: BBMAP_BBDUK { + ext.args = [ + "entropy=${params.shortread_complexityfilter_entropy}", + "entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}", + params.shortread_complexityfilter_bbduk_mask ? "entropymask=t" : "entropymask=f" + ].join(' ').trim() + ext.prefix = { "${meta.id}-${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/bbduk/" }, + mode: params.publish_dir_mode, + pattern: '*.{fastq.gz,log}', + enabled: params.save_complexityfiltered_reads + ] + } + + withName: PRINSEQPLUSPLUS { + ext.args = [ + params.shortread_complexityfilter_prinseqplusplus_mode == 'dust' ? "-lc_dust=${params.shortread_complexityfilter_prinseqplusplus_dustscore}" : "-lc_entropy=${params.shortread_complexityfilter_entropy}", + "-trim_qual_left=0 -trim_qual_left=0 -trim_qual_window=0 -trim_qual_step=0", + "-VERBOSE 2" + ].join(' ').trim() + ext.prefix = { "${meta.id}-${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/prinseqplusplus/" }, + mode: params.publish_dir_mode, + pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz,log}', + enabled: params.save_complexityfiltered_reads + ] + } + + withName: CAT_FASTQ { + ext.prefix = { "${meta.id}" } + publishDir = [ + path: { "${params.outdir}/run_merging/" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_runmerged_reads + ] + } + + withName: MALT_RUN { + ext.args = { "${meta.db_params}" } + // one run with multiple samples, so fix ID to just db name to ensure clean log name + ext.prefix = { "${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/malt/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{rma6,log,sam}' + ] + } + + withName: 'MEGAN_RMA2INFO_TSV' { + ext.args = "-c2c Taxonomy" + ext.prefix = { "${meta.id}" } + publishDir = [ + path: { "${params.outdir}/malt/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt.gz,megan}' + ] + } + + withName: KRAKEN2_KRAKEN2 { + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/kraken2/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt,report,fastq.gz}' + ] + } + + withName: KRAKENTOOLS_COMBINEKREPORTS { + ext.prefix = { "kraken2_${meta.id}_combined_reports" } + publishDir = [ + path: { "${params.outdir}/kraken2/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } + + withName: KRONA_CLEANUP { + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/krona/" }, + mode: params.publish_dir_mode, + pattern: '*.{html}' + ] + } + + withName: KRONA_KTIMPORTTEXT { + ext.prefix = { "${meta.tool}-${meta.id}" } + publishDir = [ + path: { "${params.outdir}/krona/" }, + mode: params.publish_dir_mode, + pattern: '*.{html}' + ] + } + + withName: 'MEGAN_RMA2INFO_KRONA' { + ext.args = { "--read2class Taxonomy" } + ext.prefix = { "${meta.id}-${meta.db_name}" } + } + + withName: KRONA_KTIMPORTTAXONOMY { + ext.args = "-i" + ext.prefix = { "${meta.tool}-${meta.id}" } + publishDir = [ + path: { "${params.outdir}/krona/" }, + mode: params.publish_dir_mode, + pattern: '*.{html}' + ] + } + + withName: METAPHLAN3_METAPHLAN3 { + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/metaphlan3/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{biom,txt}' + ] + } + + withName: METAPHLAN3_MERGEMETAPHLANTABLES { + ext.prefix = { "metaphlan3_${meta.id}_combined_reports" } + publishDir = [ + path: { "${params.outdir}/metaphlan3/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } + + withName: CENTRIFUGE_CENTRIFUGE { + publishDir = [ + path: { "${params.outdir}/centrifuge/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt,sam,gz}' + ] + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}.centrifuge" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}.centrifuge" } + } + + withName: CENTRIFUGE_KREPORT { + errorStrategy = {task.exitStatus == 255 ? 'ignore' : 'retry'} + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}.centrifuge" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}.centrifuge" } + publishDir = [ + path: { "${params.outdir}/centrifuge/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } + + withName: KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE { + ext.prefix = { "centrifuge_${meta.id}_combined_reports" } + publishDir = [ + path: { "${params.outdir}/centrifuge/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } + + withName: KAIJU_KAIJU { + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/kaiju/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ] + ext.args = { "${meta.db_params}" } + } + + withName: KAIJU_KAIJU2TABLE { + ext.prefix = { "kaiju_${meta.id}_combined_reports" } + publishDir = [ + path: { "${params.outdir}/kaiju/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } + + withName: KAIJU_KAIJU2KRONA { + ext.args = '-v -u' + } + + withName: DIAMOND_BLASTX { + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/diamond/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{blast,xml,txt,daa,sam,tsv,paf,log}' + ] + } + + withName: MOTUS_PROFILE { + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/motus/${meta.db_name}/" }, + mode: params.publish_dir_mode + ] + } + + withName: MOTUS_MERGE { + ext.args = { params.generate_biom_output ? "-B" : "" } + ext.prefix = { "motus_${meta.id}_combined_reports" } + publishDir = [ + path: { "${params.outdir}/motus/" }, + mode: params.publish_dir_mode + ] } withName: CUSTOM_DUMPSOFTWAREVERSIONS { @@ -38,4 +435,19 @@ process { ] } + withName: MULTIQC { + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'EIDO_VALIDATE' { + ext.args = '--st-index sample' + } + + withName: 'EIDO_CONVERT' { + ext.args = '--st-index sample' + } } diff --git a/conf/test.config b/conf/test.config index 42772cf..d5dcd67 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,8 +22,45 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' - - // Genome references - genome = 'R64-1-1' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + perform_shortread_qc = true + perform_longread_qc = true + shortread_qc_mergepairs = true + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_malt = true + run_metaphlan3 = true + run_centrifuge = true + run_diamond = true + run_motus = false + run_krona = true + krona_taxonomy_directory = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab' + malt_save_reads = true + kraken2_save_reads = true + centrifuge_save_reads = true + diamond_save_reads = true +} + +process { + withName: MALT_RUN { + maxForks = 1 + } + withName: MEGAN_RMA2INFO_TSV { + maxForks = 1 + } + withName: MEGAN_RMA2INFO_KRONA { + maxForks = 1 + } + withName: 'EIDO_VALIDATE' { + ext.args = '--st-index sample' + } + withName: 'EIDO_CONVERT' { + ext.args = '--st-index sample' + } } diff --git a/conf/test_motus.config b/conf/test_motus.config new file mode 100644 index 0000000..d5eb8f8 --- /dev/null +++ b/conf/test_motus.config @@ -0,0 +1,42 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'mOTUs Test profile' + config_profile_description = 'Minimal test to check mOTUs function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'database_motus.csv' + perform_shortread_qc = false + perform_longread_qc = false + perform_shortread_complexityfilter = false + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_malt = false + run_metaphlan3 = false + run_centrifuge = false + run_diamond = false + run_motus = true + run_profile_standardisation = true +} diff --git a/conf/test_nopreprocessing.config b/conf/test_nopreprocessing.config new file mode 100644 index 0000000..3908b56 --- /dev/null +++ b/conf/test_nopreprocessing.config @@ -0,0 +1,48 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset skipping all preprocessing to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + perform_shortread_qc = false + perform_longread_qc = false + perform_shortread_complexityfilter = false + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_malt = true + run_metaphlan3 = true + run_centrifuge = true + run_diamond = true + run_motus = false + run_krona = true +} + +process { + withName: MALT_RUN { + maxForks = 1 + } +} diff --git a/conf/test_noprofiling.config b/conf/test_noprofiling.config new file mode 100644 index 0000000..12c7185 --- /dev/null +++ b/conf/test_noprofiling.config @@ -0,0 +1,48 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset without performing any profiling to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + perform_shortread_qc = true + perform_longread_qc = true + shortread_qc_mergepairs = true + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_malt = false + run_metaphlan3 = false + run_centrifuge = false + run_diamond = false + run_motus = false +} + +process { + withName: MALT_RUN { + maxForks = 1 + } +} diff --git a/conf/test_nothing.config b/conf/test_nothing.config new file mode 100644 index 0000000..c0ecece --- /dev/null +++ b/conf/test_nothing.config @@ -0,0 +1,47 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = "Minimal test dataset without performing any preprocessing nor profiling to check pipeline function. Useful when you only wish to test a single profiler without having to 'opt-out' of all the others" + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + perform_shortread_qc = false + perform_longread_qc = false + perform_shortread_complexityfilter = false + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_malt = false + run_metaphlan3 = false + run_centrifuge = false + run_diamond = false + run_motus = false +} + +process { + withName: MALT_RUN { + maxForks = 1 + } +} diff --git a/conf/test_pep.config b/conf/test_pep.config new file mode 100644 index 0000000..7f8c95d --- /dev/null +++ b/conf/test_pep.config @@ -0,0 +1,43 @@ +params { + config_profile_name = 'Test PEP profile' + config_profile_description = 'Minimal test dataset to check pipeline function with PEP file as an input.' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/pep/test_pep_format_files/config.yaml' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + perform_shortread_qc = true + perform_longread_qc = true + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_malt = true + run_metaphlan3 = true + run_centrifuge = true + run_diamond = true + run_motus = false + run_krona = true + krona_taxonomy_directory = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab' + malt_save_reads = true + kraken2_save_reads = true + centrifuge_save_reads = true + diamond_save_reads = true +} + + +process { + withName: MALT_RUN { + maxForks = 1 + } + withName: MEGAN_RMA2INFO { + maxForks = 1 + } +} diff --git a/docs/images/nf-core-taxprofiler_icon.png b/docs/images/nf-core-taxprofiler_icon.png new file mode 100644 index 0000000..c639fb6 Binary files /dev/null and b/docs/images/nf-core-taxprofiler_icon.png differ diff --git a/docs/images/nf-core-taxprofiler_icon.svg b/docs/images/nf-core-taxprofiler_icon.svg new file mode 100644 index 0000000..24e615f --- /dev/null +++ b/docs/images/nf-core-taxprofiler_icon.svg @@ -0,0 +1,444 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/images/nf-core-taxprofiler_icon_border.svg b/docs/images/nf-core-taxprofiler_icon_border.svg new file mode 100644 index 0000000..887e8e8 --- /dev/null +++ b/docs/images/nf-core-taxprofiler_icon_border.svg @@ -0,0 +1,445 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/images/nf-core-taxprofiler_logo_custom_dark.png b/docs/images/nf-core-taxprofiler_logo_custom_dark.png new file mode 100644 index 0000000..6b089fc Binary files /dev/null and b/docs/images/nf-core-taxprofiler_logo_custom_dark.png differ diff --git a/docs/images/nf-core-taxprofiler_logo_custom_dark.svg b/docs/images/nf-core-taxprofiler_logo_custom_dark.svg new file mode 100644 index 0000000..3d47b4c --- /dev/null +++ b/docs/images/nf-core-taxprofiler_logo_custom_dark.svg @@ -0,0 +1,2302 @@ + + + +taxfindertaxprofiler/ diff --git a/docs/images/nf-core-taxprofiler_logo_custom_light.png b/docs/images/nf-core-taxprofiler_logo_custom_light.png new file mode 100644 index 0000000..2dc85b8 Binary files /dev/null and b/docs/images/nf-core-taxprofiler_logo_custom_light.png differ diff --git a/docs/images/nf-core-taxprofiler_logo_custom_light.svg b/docs/images/nf-core-taxprofiler_logo_custom_light.svg new file mode 100644 index 0000000..dae1fbe --- /dev/null +++ b/docs/images/nf-core-taxprofiler_logo_custom_light.svg @@ -0,0 +1,2305 @@ + + + +taxfindertaxprofiler/ diff --git a/docs/images/nf_core_taxprofiler_icon_border.png b/docs/images/nf_core_taxprofiler_icon_border.png new file mode 100644 index 0000000..c513de0 Binary files /dev/null and b/docs/images/nf_core_taxprofiler_icon_border.png differ diff --git a/docs/images/taxprofiler_logo.svg b/docs/images/taxprofiler_logo.svg new file mode 100644 index 0000000..0cb901d --- /dev/null +++ b/docs/images/taxprofiler_logo.svg @@ -0,0 +1,3223 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + taxfinder + + + + + + + + + + + + taxprofiler + / + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + taxfinder + + + + + + + + + + + + taxprofiler + / + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/images/taxprofiler_tube.pdf b/docs/images/taxprofiler_tube.pdf new file mode 100644 index 0000000..63d8c06 Binary files /dev/null and b/docs/images/taxprofiler_tube.pdf differ diff --git a/docs/images/taxprofiler_tube.png b/docs/images/taxprofiler_tube.png new file mode 100644 index 0000000..0d0a4aa Binary files /dev/null and b/docs/images/taxprofiler_tube.png differ diff --git a/docs/images/taxprofiler_tube.svg b/docs/images/taxprofiler_tube.svg new file mode 100644 index 0000000..0ea3460 --- /dev/null +++ b/docs/images/taxprofiler_tube.svg @@ -0,0 +1,4407 @@ + + + +SEQUENCINGQUALITY CONTROLfastqfastqfastqfasta(ADAPTER TRIMMING & MERGING)(COMPLEXITY FILTERING)(HOST REMOVAL)(RUN MERGING)TAXONOMIC CLASSIFICATIONSUMMARY STATISTICSfastpAdapterRemovalBBDukPRINSEQ++Short ReadsAll ReadsProfileMultiple databasesLog FilesLong ReadsBowtie2catminimap2FiltlongMultiQCKronaPorechopBrackenKraken2KrakenUniqCentrifugeKaijumOTUsMetaPhlAn3MALTDIAMONDFastQCFastQCFastQC(LENGTH FILTERING)taxfindertaxprofiler/dbtsvtsvtsvtsvtsvtsvtsvtsvtsvtsvMANDATORY STEP(OPTIONAL STEP)htmlhtml diff --git a/docs/usage.md b/docs/usage.md index 74d3cce..8b8b588 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,56 +8,150 @@ -## Samplesheet input +## Samplesheet inputs -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +nf-core/taxprofiler can accept as input raw or preprocessed single- or paired-end short-read (e.g. Illumina) FASTQ files, long-read FASTQ files (e.g. Oxford Nanopore), or FASTA sequences (available for a subset of profilers). -```bash ---input '[path to samplesheet file]' +> ⚠️ Input FASTQ files _must_ be gzipped, while FASTA files may optionally be uncompressed (although this is not recommended) + +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below. Furthermother, nf-core/taxprofiler also requires a second comma-separated file of 3 columns with a header row as in the examples below. + +This samplesheet is then specified on the command line as follows: + +```console +--input '[path to samplesheet file]' --databases '[path to database sheet file]' ``` +Note pipeline supports both CSV and PEP input sample sheets. Find out more [here](http://pep.databio.org/en/2.1.0/specification/). +When using PEP as an input, the `samplesheet.csv` must be placed in the same folder +as `config.yaml` file. A path to `samplesheet.csv` within the config must be absolute. + ### Multiple runs of the same sample -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: +The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate different runs FASTQ files of the same sample before performing profiling, when `--perform_runmerging` is supplied. Below is an example for the same sample sequenced across 3 lanes: ```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz +sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta +2612,run1,ILLUMINA,2612_run1_R1.fq.gz,, +2612,run2,ILLUMINA,2612_run2_R1.fq.gz,, +2612,run3,ILLUMINA,2612_run3_R1.fq.gz,2612_run3_R2.fq.gz, + ``` +> ⚠️ Runs of the same sample sequenced on Illumina platforms with a combination of single and paired-end data will **not** be run-wise concatenated, unless pair-merging is specified. In the example above, `run3` will be profiled independently of `run1` and `run2` if pairs are not merged. + ### Full samplesheet -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 6 columns to match those defined in the table below. -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. +A final samplesheet file consisting of both single- and paired-end data, as well as long-read FASTA files may look something like the one below. This is for 6 samples, where `2612` has been sequenced twice. ```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +2611,ERR5766174,ILLUMINA,,,///fasta/ERX5474930_ERR5766174_1.fa.gz +2612,ERR5766176,ILLUMINA,///fastq/ERX5474932_ERR5766176_1.fastq.gz,///fastq/ERX5474932_ERR5766176_2.fastq.gz, +2612,ERR5766180,ILLUMINA,///fastq/ERX5474936_ERR5766180_1.fastq.gz,, +2613,ERR5766181,ILLUMINA,///fastq/ERX5474937_ERR5766181_1.fastq.gz,///fastq/ERX5474937_ERR5766181_2.fastq.gz, +ERR3201952,ERR3201952,OXFORD_NANOPORE,///fastq/ERR3201952.fastq.gz,, ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| Column | Description | +| --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Unique sample name [required]. | +| `run_accession` | Run ID or name unique for each (pairs of) file(s) .Can also supply sample name again here, if only a single run was generated [required]. | +| `instrument_platform` | Sequencing platform reads generated on, selected from the EBI ENA [controlled vocabulary](https://www.ebi.ac.uk/ena/portal/api/controlledVocab?field=instrument_platform) [required]. | +| `fastq_1` | Path or URL to sequencing reads or for Illumina R1 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if data in FASTA is specifed. Cannot be combined with `fasta`. | +| `fastq_2` | Path or URL to Illumina R2 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if single end data. Cannot be combined with `fasta`. | +| `fasta` | Path or URL to long-reads or contigs in FASTA format. GZipped compressed files accepted. Can be left empty if data in FASTA is specifed. Cannot be combined with `fastq_1` or `fastq_2`. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +### Full database sheet + +nf-core/taxprofiler supports multiple databases being profiled in parallel for each tool. +Databases can be supplied either in the form of a compressed `.tar.gz` archive of a directory containing all relevant database files or the path to a directory on the filesystem. +The pipeline takes the locations and specific profiling parameters of the tool of these databases as input via a four column comma-separated sheet. + +> ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user. See below for more information of the expected database files. + +An example database sheet can look as follows, where 4 tools are being used, and `malt` and `kraken2` will be used against two databases each. + +```console +tool,db_name,db_params,db_path +malt,malt85,-id 85,///malt/testdb-malt/ +malt,malt95,-id 90,///malt/testdb-malt.tar.gz +kraken2,db1,,///kraken2/testdb-kraken2.tar.gz +kraken2,db2,--quick,///kraken2/testdb-kraken2.tar.gz +centrifuge,db1,,///centrifuge/minigut_cf.tar.gz +metaphlan3,db1,,///metaphlan3/metaphlan_database/ +motus,db_mOTU,,///motus/motus_database/ +``` + +Column specifications are as follows: + +| Column | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tool` | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. | +| `db_name` | A unique name of the particular database [required]. | +| `db_params` | Any parameters of the given taxonomic profiler that you wish to specify that the taxonomic profiling tool should use when profiling against this specific. Can be empty to use taxonomic profiler defaults. Must not be surrounded by quotes [required]. We generally do not recommend specifying parameters here that turn on/off saving of output files or specifying particular file extensions - this should be already addressed via pipeline parameters. | +| `db_path` | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required]. | + +> 💡 You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during profiling. + +nf-core/taxprofiler will automatically decompress and extract any compressed archives for you. + +Expected (uncompressed) database files for each tool are as follows: + +- **MALT** output of `malt-build`. A directory containing: + - `ref.idx` + - `taxonomy.idx` + - `taxonomy.map` + - `index0.idx` + - `table0.idx` + - `table0.db` + - `ref.inf` + - `ref.db` + - `taxonomy.tre` +- **Kraken2** output of `kraken2-build` command(s) A directory containing: + - `opts.k2d` + - `hash.k2d` + - `taxo.k2d` +- **Centrifuge** output of `centrifuge-build`. A directory containing: + - `..cf` + - `..cf` + - `..cf` + - `..cf` +- **MetaPhlAn3** generated with `metaphlan --install` or downloaded from links on the [MetaPhlAn3 wiki](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-3.0#customizing-the-database). A directory containing: + - `mpa_v30_CHOCOPhlAn_201901.pkl` + - `mpa_v30_CHOCOPhlAn_201901.pkl` + - `mpa_v30_CHOCOPhlAn_201901.fasta` + - `mpa_v30_CHOCOPhlAn_201901.3.bt2` + - `mpa_v30_CHOCOPhlAn_201901.4.bt2` + - `mpa_v30_CHOCOPhlAn_201901.1.bt2` + - `mpa_v30_CHOCOPhlAn_201901.2.bt2` + - `mpa_v30_CHOCOPhlAn_201901.rev.1.bt2` + - `mpa_v30_CHOCOPhlAn_201901.rev.2.bt2` + - `mpa_latest` +- **Kaiju** output of `kaiju-makedb`. A directory containing: + - `kaiju_db_*.fmi` + - `nodes.dmp` + - `names.dmp` +- **DIAMOND** output of `diamond makedb`. Note: requires building with taxonomy files + to generate taxonomic profile. See [DIAMOND documentation](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#makedb-options). A file named: + - `.dmnd` +- **mOTUs** is composed of code and database together. The mOTUs tools + [`downloadDB`](https://github.com/motu-tool/mOTUs/blob/master/motus/downloadDB.py) + is used to prepare the mOTUs database and create a file with the version information. + The database download step can be time consuming and the database will be consisting + with same release version of the mOTUs tools. The database for same version tools + can be thus reused for multiple runs. Users can download the database once using the script above and + specify the path the database to the TSV table provided to `--databases`. + ## Running the pipeline The typical command for running the pipeline is as follows: -```bash -nextflow run nf-core/taxprofiler --input samplesheet.csv --outdir --genome GRCh37 -profile docker +```console +nextflow run nf-core/taxprofiler --input samplesheet.csv --databases databases.csv --outdir -profile docker --run_ --run_ ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -71,6 +165,72 @@ work # Directory containing the nextflow working files # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` +### Preprocessing Steps + +nf-core/taxprofiler offers four main preprocessing steps + +- Read processing: adapter clipping and pair-merging. +- Complexity filtering: removal of low-sequence complexity reads. +- Host read-removal: removal of reads aligning to reference genome(s) of a host. +- Run merging: concatenation of multiple FASTQ chunks/sequencing runs/libraries of a sample. + +#### Read Processing + +Raw sequencing read processing in the form of adapter clipping and paired-end read merging can be activated via the `--perform_shortread_qc` or `--perform_longread_qc` flags. + +It is highly recommended to run this on raw reads to remove artefacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles. + +There are currently two options for short-read preprocessing: `fastp` or `adapterremoval`. + +For adapter clipping, you can either rely on tool default adapter sequences, or supply your own adapters (`--shortread_qc_adapter1` and `--shortread_qc_adapter2`) +By default, paired-end merging is not activated and paired-end profiling is performed where supported otherwise pairs will be independently profiled. If paired-end merging is activated you can also specify whether to exclude unmerged reads in the reads sent for profiling (`--shortread_qc_mergepairs` and `--shortread_qc_excludeunmerged`). +You can also turn off clipping and only perform paired-end merging, if requested. This can be useful when processing data downloaded from the ENA, SRA, or DDBJ (`--shortread_qc_skipadaptertrim`). +Both tools support length filtering of reads and can be tuned with `--shortread_qc_minlength`. Performing length filtering can be useful to remove short (often low sequencing complexity) sequences that result in unspecific classification and therefore slow down runtime during profiling, with minimal gain. + +There is currently one option for long-read Oxford Nanopore processing: `porechop`. + +For both short-read and long-read preprocessing, you can optionally save the resulting processed reads with `--save_preprocessed_reads`. + +#### Complexity Filtering + +Complexity filtering can be activated via the `--perform_shortread_complexityfilter` flag. + +Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources. + +There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter). + +The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of the tools (see links above) to decide on optimal methods and parameters for your dataset. + +You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`. If running with `fastp`, complexity filtering happens inclusively within the earlier shortread preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity filtered reads will also be in the `fastp/` folder in the same file(s) as the preprocessed read. + +#### Host Removal + +Removal of possible-host reads from FASTQ files prior profiling can be activated with `--perform_shortread_hostremoval` or `--perform_longread_hostremoval`. + +Similarly to complexity filtering, host-removal can be useful for runtime optimisation and reduction in misclassified reads. It is not always necessary to report classification of reads from a host when you already know the host of the sample, therefore you can gain a run-time and computational advantage by removing these prior typically resource-heavy profiling with more efficient methods. Furthermore, particularly with human samples, you can reduce the number of false positives during profiling that occur due to host-sequence contamination in reference genomes on public databases. + +nf-core/taxprofiler currently offers host-removal via alignment against a reference genome with Bowtie2, and the use of the unaligned reads for downstream profiling. + +You can supply your reference genome in FASTA format with `--hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index` or a minimap2 `.mmi` file for `--longread_hostremoval_index`, however nf-core/taxprofiler will generate these for you if necessary. Pre-supplying the index directory or files can greatly speed up the process, and these can be re-used. + +> 💡 If you have multiple taxa or sequences you wish to remove (e.g., the host genome and then also PhiX - common quality-control reagent during sequencing) you can simply concatenate the FASTAs of each taxa or sequences into a single reference file. + +#### Run Merging + +For samples that may have been sequenced over multiple runs, or for FASTQ files split into multiple chunks, you can activate the ability to merge across all runs or chunks with `--perform_runmerging`. + +For more information how to set up your input samplesheet, see [Multiple runs of the same sample](#multiple-runs-of-the-same-sample). + +Activating this functionality will concatenate the FASTQ files with the same sample name _after_ the optional preprocessing steps and _before_ profiling. Note that libraries with runs of different pairing types will **not** be merged and this will be indicated on output files with a `_se` or `_pe` suffix to the sample name accordingly. + +You can optionally save the FASTQ output of the run merging with the `--save_runmerged_reads`. + +##### Profiling + +###### MALT + +nf-core/taxprofiler uses MALT 0.4.1, which is a compatively old version. However it has been found that the most recent version of MALT (0.5.\*), at the time of writing, is broken. [The the LCA step appears not to be executed](http://megan.informatik.uni-tuebingen.de/t/lca-placement-failure-with-malt-v-0-5-2-and-0-5-3/1996/3), pushing all hits to the leaves of the taxonomy. However, if you need to use a more recent taxonomy map file with your databases, the output of `malt-build` from MALT 0.5.3 should be still be compatible with `malt-run` of 0.4.1. + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: @@ -121,6 +281,9 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - `test` - A profile with a complete configuration for automated testing - Includes links to test data so needs no other parameters +- `test_pep` + - A profile with a complete configuration for running a pipeline with PEP as input + - Includes links to test data so needs no other parameters ### `-resume` @@ -262,3 +425,13 @@ We recommend adding the following line to your environment to limit this (typica ```bash NXF_OPTS='-Xms1g -Xmx4g' ``` + +## Troubleshooting and FAQs + +### I get a warning during centrifuge_kreport process with exit status 255. + +When a sample has insufficient hits for abundance estimation, the resulting `report.txt` file will be empty. + +When trying to convert this to a kraken-style report, the conversion tool will exit with a status code `255`, and provide a `WARN`. + +This is **not** an error nor a failure of the pipeline, just your sample has no hits to the provided database when using centrifuge. diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index c90cf49..7883d70 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -75,7 +75,7 @@ class WorkflowMain { // Check input has been provided if (!params.input) { - log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'" + log.error "Please provide an input samplesheet or PEP to the pipeline e.g. '--input samplesheet.csv'" System.exit(1) } } diff --git a/lib/WorkflowTaxprofiler.groovy b/lib/WorkflowTaxprofiler.groovy index bbe7c2b..7bf44c9 100755 --- a/lib/WorkflowTaxprofiler.groovy +++ b/lib/WorkflowTaxprofiler.groovy @@ -12,11 +12,11 @@ class WorkflowTaxprofiler { public static void initialise(params, log) { genomeExistsError(params, log) - - if (!params.fasta) { - log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." - System.exit(1) - } + // TODO update as necessary + //if (!params.fasta) { + // log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." + // System.exit(1) + //} } // diff --git a/modules.json b/modules.json index bcea8f6..10d6c74 100644 --- a/modules.json +++ b/modules.json @@ -5,17 +5,153 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "adapterremoval": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "bbmap/bbduk": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "bowtie2/align": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "bowtie2/build": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "cat/fastq": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "centrifuge/centrifuge": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "centrifuge/kreport": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" }, + "diamond/blastx": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "eido/convert": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "eido/validate": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "fastp": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, "fastqc": { "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" }, + "filtlong": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "gunzip": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "kaiju/kaiju": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "kaiju/kaiju2krona": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "kaiju/kaiju2table": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "kraken2/kraken2": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "krakentools/combinekreports": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "krakentools/kreport2krona": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "krona/ktimporttaxonomy": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "krona/ktimporttext": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "malt/run": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "megan/rma2info": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "metaphlan3/mergemetaphlantables": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "metaphlan3/metaphlan3": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "minimap2/align": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "minimap2/index": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "motus/merge": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "motus/profile": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, "multiqc": { "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "porechop": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "prinseqplusplus": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "samtools/bam2fq": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "samtools/view": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "untar": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" } } } diff --git a/modules/local/samplesheet_check.nf b/modules/local/database_check.nf similarity index 80% rename from modules/local/samplesheet_check.nf rename to modules/local/database_check.nf index dea4362..4da4313 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/database_check.nf @@ -1,5 +1,5 @@ -process SAMPLESHEET_CHECK { - tag "$samplesheet" +process DATABASE_CHECK { + tag "$databasesheet" conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -7,7 +7,7 @@ process SAMPLESHEET_CHECK { 'quay.io/biocontainers/python:3.8.3' }" input: - path samplesheet + path databasesheet output: path '*.csv' , emit: csv @@ -15,9 +15,7 @@ process SAMPLESHEET_CHECK { script: // This script is bundled with the pipeline, in nf-core/taxprofiler/bin/ """ - check_samplesheet.py \\ - $samplesheet \\ - samplesheet.valid.csv + cat $databasesheet >> database_sheet.valid.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/ensure_fastq_extension.nf b/modules/local/ensure_fastq_extension.nf new file mode 100644 index 0000000..6de223b --- /dev/null +++ b/modules/local/ensure_fastq_extension.nf @@ -0,0 +1,31 @@ +process ENSURE_FASTQ_EXTENSION { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "conda-forge::bash=5.0" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv2/biocontainers_v1.2.0_cv2.img' : + 'biocontainers/biocontainers:v1.2.0_cv2' }" + + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path('*.fastq.gz'), emit: reads + + script: + if (meta.single_end) { + fastq = "${reads.baseName}.fastq.gz" + """ + ln -s '${reads}' '${fastq}' + """ + } else { + first = "${reads[0].baseName}.fastq.gz" + second = "${reads[1].baseName}.fastq.gz" + """ + ln -s '${reads[0]}' '${first}' + ln -s '${reads[1]}' '${second}' + """ + } +} diff --git a/modules/local/krona_cleanup.nf b/modules/local/krona_cleanup.nf new file mode 100644 index 0000000..804cb69 --- /dev/null +++ b/modules/local/krona_cleanup.nf @@ -0,0 +1,40 @@ +process KRONA_CLEANUP { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' : + 'biocontainers/biocontainers:v1.2.0_cv1' }" + + input: + tuple val(meta), path(krona, stageAs: 'uncleaned.krona.txt') + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + # Copy the file to a new name + cp ${krona} ${prefix}.txt + + # Remove ugly 'x__' prefixes for each of the taxonomic levels + LEVELS=(d k p c o f g s) + for L in "\${LEVELS[@]}"; do + sed -i "s/\${L}__//g" ${prefix}.txt + done + + # Remove underscores that are standing in place of spaces + sed -i "s/_/ /g" ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/adapterremoval/main.nf b/modules/nf-core/adapterremoval/main.nf new file mode 100644 index 0000000..0e17c05 --- /dev/null +++ b/modules/nf-core/adapterremoval/main.nf @@ -0,0 +1,92 @@ +process ADAPTERREMOVAL { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? "bioconda::adapterremoval=2.3.2" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/adapterremoval:2.3.2--hb7ba0dd_0' : + 'quay.io/biocontainers/adapterremoval:2.3.2--hb7ba0dd_0' }" + + input: + tuple val(meta), path(reads) + path(adapterlist) + + output: + tuple val(meta), path("${prefix}.truncated.fastq.gz") , optional: true, emit: singles_truncated + tuple val(meta), path("${prefix}.discarded.fastq.gz") , optional: true, emit: discarded + tuple val(meta), path("${prefix}.pair{1,2}.truncated.fastq.gz") , optional: true, emit: paired_truncated + tuple val(meta), path("${prefix}.collapsed.fastq.gz") , optional: true, emit: collapsed + tuple val(meta), path("${prefix}.collapsed.truncated.fastq.gz") , optional: true, emit: collapsed_truncated + tuple val(meta), path("${prefix}.paired.fastq.gz") , optional: true, emit: paired_interleaved + tuple val(meta), path('*.settings') , emit: settings + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def list = adapterlist ? "--adapter-list ${adapterlist}" : "" + prefix = task.ext.prefix ?: "${meta.id}" + + if (meta.single_end) { + """ + AdapterRemoval \\ + --file1 $reads \\ + $args \\ + $adapterlist \\ + --basename ${prefix} \\ + --threads ${task.cpus} \\ + --seed 42 \\ + --gzip + + ensure_fastq() { + if [ -f "\${1}" ]; then + mv "\${1}" "\${1::-3}.fastq.gz" + fi + + } + + ensure_fastq '${prefix}.truncated.gz' + ensure_fastq '${prefix}.discarded.gz' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g") + END_VERSIONS + """ + } else { + """ + AdapterRemoval \\ + --file1 ${reads[0]} \\ + --file2 ${reads[1]} \\ + $args \\ + $adapterlist \\ + --basename ${prefix} \\ + --threads $task.cpus \\ + --seed 42 \\ + --gzip + + ensure_fastq() { + if [ -f "\${1}" ]; then + mv "\${1}" "\${1::-3}.fastq.gz" + fi + + } + + ensure_fastq '${prefix}.truncated.gz' + ensure_fastq '${prefix}.discarded.gz' + ensure_fastq '${prefix}.pair1.truncated.gz' + ensure_fastq '${prefix}.pair2.truncated.gz' + ensure_fastq '${prefix}.collapsed.gz' + ensure_fastq '${prefix}.collapsed.truncated.gz' + ensure_fastq '${prefix}.paired.gz' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g") + END_VERSIONS + """ + } + +} diff --git a/modules/nf-core/adapterremoval/meta.yml b/modules/nf-core/adapterremoval/meta.yml new file mode 100644 index 0000000..77273f6 --- /dev/null +++ b/modules/nf-core/adapterremoval/meta.yml @@ -0,0 +1,90 @@ +name: adapterremoval +description: Trim sequencing adapters and collapse overlapping reads +keywords: + - trimming + - adapters + - merging + - fastq +tools: + - adapterremoval: + description: The AdapterRemoval v2 tool for merging and clipping reads. + homepage: https://github.com/MikkelSchubert/adapterremoval + documentation: https://adapterremoval.readthedocs.io + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + pattern: "*.{fq,fastq,fq.gz,fastq.gz}" + - adapterlist: + type: file + description: Optional text file containing list of adapters to look for for removal + with one adapter per line. Otherwise will look for default adapters (see + AdapterRemoval man page), or can be modified to remove user-specified + adapters via ext.args. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - singles_truncated: + type: file + description: | + Adapter trimmed FastQ files of either single-end reads, or singleton + 'orphaned' reads from merging of paired-end data (i.e., one of the pair + was lost due to filtering thresholds). + pattern: "*.truncated.fastq.gz" + - discarded: + type: file + description: | + Adapter trimmed FastQ files of reads that did not pass filtering + thresholds. + pattern: "*.discarded.fastq.gz" + - pair1_truncated: + type: file + description: | + Adapter trimmed R1 FastQ files of paired-end reads that did not merge + with their respective R2 pair due to long templates. The respective pair + is stored in 'pair2_truncated'. + pattern: "*.pair1.truncated.fastq.gz" + - pair2_truncated: + type: file + description: | + Adapter trimmed R2 FastQ files of paired-end reads that did not merge + with their respective R1 pair due to long templates. The respective pair + is stored in 'pair1_truncated'. + pattern: "*.pair2.truncated.fastq.gz" + - collapsed: + type: file + description: | + Collapsed FastQ of paired-end reads that successfully merged with their + respective R1 pair but were not trimmed. + pattern: "*.collapsed.fastq.gz" + - collapsed_truncated: + type: file + description: | + Collapsed FastQ of paired-end reads that successfully merged with their + respective R1 pair and were trimmed of adapter due to sufficient overlap. + pattern: "*.collapsed.truncated.fastq.gz" + - log: + type: file + description: AdapterRemoval log file + pattern: "*.settings" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@maxibor" + - "@jfy133" diff --git a/modules/nf-core/bbmap/bbduk/main.nf b/modules/nf-core/bbmap/bbduk/main.nf new file mode 100644 index 0000000..0ae005e --- /dev/null +++ b/modules/nf-core/bbmap/bbduk/main.nf @@ -0,0 +1,43 @@ +process BBMAP_BBDUK { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? "bioconda::bbmap=38.90" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bbmap:38.90--he522d1c_1' : + 'quay.io/biocontainers/bbmap:38.90--he522d1c_1' }" + + input: + tuple val(meta), path(reads) + path contaminants + + output: + tuple val(meta), path('*.fastq.gz'), emit: reads + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def raw = meta.single_end ? "in=${reads[0]}" : "in1=${reads[0]} in2=${reads[1]}" + def trimmed = meta.single_end ? "out=${prefix}.fastq.gz" : "out1=${prefix}_1.fastq.gz out2=${prefix}_2.fastq.gz" + def contaminants_fa = contaminants ? "ref=$contaminants" : '' + """ + maxmem=\$(echo \"$task.memory\"| sed 's/ GB/g/g') + bbduk.sh \\ + -Xmx\$maxmem \\ + $raw \\ + $trimmed \\ + threads=$task.cpus \\ + $args \\ + $contaminants_fa \\ + &> ${prefix}.bbduk.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh) + END_VERSIONS + """ +} diff --git a/modules/nf-core/bbmap/bbduk/meta.yml b/modules/nf-core/bbmap/bbduk/meta.yml new file mode 100644 index 0000000..d79b9d0 --- /dev/null +++ b/modules/nf-core/bbmap/bbduk/meta.yml @@ -0,0 +1,53 @@ +name: bbmap_bbduk +description: Adapter and quality trimming of sequencing reads +keywords: + - trimming + - adapter trimming + - quality trimming + - fastq +tools: + - bbmap: + description: BBMap is a short read aligner, as well as various other bioinformatic tools. + homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + tool_dev_url: None + doi: "" + licence: ["UC-LBL license (see package)"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - contaminants: + type: file + description: | + Reference files containing adapter and/or contaminant sequences for sequence kmer matching + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified fastq reads + pattern: "*fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - log: + type: file + description: Bbduk log file + pattern: "*bbduk.log" + +authors: + - "@MGordon09" diff --git a/modules/nf-core/bowtie2/align/main.nf b/modules/nf-core/bowtie2/align/main.nf new file mode 100644 index 0000000..c74e376 --- /dev/null +++ b/modules/nf-core/bowtie2/align/main.nf @@ -0,0 +1,71 @@ +process BOWTIE2_ALIGN { + tag "$meta.id" + label "process_high" + + conda (params.enable_conda ? "bioconda::bowtie2=2.4.4 bioconda::samtools=1.15.1 conda-forge::pigz=2.6" : null) + container "${ workflow.containerEngine == "singularity" && !task.ext.singularity_pull_docker_container ? + "https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:1744f68fe955578c63054b55309e05b41c37a80d-0" : + "quay.io/biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:1744f68fe955578c63054b55309e05b41c37a80d-0" }" + + input: + tuple val(meta), path(reads) + path index + val save_unaligned + val sort_bam + + output: + tuple val(meta), path("*.bam") , emit: bam + tuple val(meta), path("*.log") , emit: log + tuple val(meta), path("*fastq.gz"), emit: fastq, optional:true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def args2 = task.ext.args2 ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + + def unaligned = "" + def reads_args = "" + if (meta.single_end) { + unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : "" + reads_args = "-U ${reads}" + } else { + unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : "" + reads_args = "-1 ${reads[0]} -2 ${reads[1]}" + } + + def samtools_command = sort_bam ? 'sort' : 'view' + + """ + INDEX=`find -L ./ -name "*.rev.1.bt2" | sed "s/.rev.1.bt2//"` + [ -z "\$INDEX" ] && INDEX=`find -L ./ -name "*.rev.1.bt2l" | sed "s/.rev.1.bt2l//"` + [ -z "\$INDEX" ] && echo "Bowtie2 index files not found" 1>&2 && exit 1 + + bowtie2 \\ + -x \$INDEX \\ + $reads_args \\ + --threads $task.cpus \\ + $unaligned \\ + $args \\ + 2> ${prefix}.bowtie2.log \\ + | samtools $samtools_command $args2 --threads $task.cpus -o ${prefix}.bam - + + if [ -f ${prefix}.unmapped.fastq.1.gz ]; then + mv ${prefix}.unmapped.fastq.1.gz ${prefix}.unmapped_1.fastq.gz + fi + + if [ -f ${prefix}.unmapped.fastq.2.gz ]; then + mv ${prefix}.unmapped.fastq.2.gz ${prefix}.unmapped_2.fastq.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/bowtie2/align/meta.yml b/modules/nf-core/bowtie2/align/meta.yml new file mode 100644 index 0000000..42ba0f9 --- /dev/null +++ b/modules/nf-core/bowtie2/align/meta.yml @@ -0,0 +1,62 @@ +name: bowtie2_align +description: Align reads to a reference genome using bowtie2 +keywords: + - align + - map + - fasta + - fastq + - genome + - reference +tools: + - bowtie2: + description: | + Bowtie 2 is an ultrafast and memory-efficient tool for aligning + sequencing reads to long reference sequences. + homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml + doi: 10.1038/nmeth.1923 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - index: + type: file + description: Bowtie2 genome index files + pattern: "*.ebwt" + - save_unaligned: + type: boolean + description: | + Save reads that do not map to the reference (true) or discard them (false) + (default: false) + - sort_bam: + type: boolean + description: use samtools sort (true) or samtools view (false) + pattern: "true or false" +output: + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: Unaligned FastQ files + pattern: "*.fastq.gz" + - log: + type: file + description: Aligment log + pattern: "*.log" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/bowtie2/build/main.nf b/modules/nf-core/bowtie2/build/main.nf new file mode 100644 index 0000000..a4da62d --- /dev/null +++ b/modules/nf-core/bowtie2/build/main.nf @@ -0,0 +1,30 @@ +process BOWTIE2_BUILD { + tag "$fasta" + label 'process_high' + + conda (params.enable_conda ? 'bioconda::bowtie2=2.4.4' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bowtie2:2.4.4--py39hbb4e92a_0' : + 'quay.io/biocontainers/bowtie2:2.4.4--py39hbb4e92a_0' }" + + input: + path fasta + + output: + path 'bowtie2' , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + mkdir bowtie2 + bowtie2-build $args --threads $task.cpus $fasta bowtie2/${fasta.baseName} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bowtie2/build/meta.yml b/modules/nf-core/bowtie2/build/meta.yml new file mode 100644 index 0000000..2da9a21 --- /dev/null +++ b/modules/nf-core/bowtie2/build/meta.yml @@ -0,0 +1,33 @@ +name: bowtie2_build +description: Builds bowtie index for reference genome +keywords: + - build + - index + - fasta + - genome + - reference +tools: + - bowtie2: + description: | + Bowtie 2 is an ultrafast and memory-efficient tool for aligning + sequencing reads to long reference sequences. + homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml + doi: 10.1038/nmeth.1923 + licence: ["GPL-3.0-or-later"] +input: + - fasta: + type: file + description: Input genome fasta file +output: + - index: + type: file + description: Bowtie2 genome index files + pattern: "*.bt2" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf new file mode 100644 index 0000000..d275f19 --- /dev/null +++ b/modules/nf-core/cat/fastq/main.nf @@ -0,0 +1,80 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'ubuntu:20.04' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads.collect{ it.toString() } + if (meta.single_end) { + if (readList.size > 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads.collect{ it.toString() } + if (meta.single_end) { + if (readList.size > 1) { + """ + touch ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + """ + touch ${prefix}_1.merged.fastq.gz + touch ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + +} diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml new file mode 100644 index 0000000..c836598 --- /dev/null +++ b/modules/nf-core/cat/fastq/meta.yml @@ -0,0 +1,39 @@ +name: cat_fastq +description: Concatenates fastq files +keywords: + - fastq + - concatenate +tools: + - cat: + description: | + The cat utility reads files sequentially, writing them to the standard output. + documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: list + description: | + List of input FastQ files to be concatenated. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/centrifuge/centrifuge/main.nf b/modules/nf-core/centrifuge/centrifuge/main.nf new file mode 100644 index 0000000..3d23fc9 --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/main.nf @@ -0,0 +1,61 @@ +process CENTRIFUGE_CENTRIFUGE { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' : + 'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }" + + input: + tuple val(meta), path(reads) + path db + val save_unaligned + val save_aligned + val sam_format + + output: + tuple val(meta), path('*report.txt') , emit: report + tuple val(meta), path('*results.txt') , emit: results + tuple val(meta), path('*.sam') , optional: true, emit: sam + tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_mapped + tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + def unaligned = '' + def aligned = '' + if (meta.single_end) { + unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : '' + } else { + unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : '' + } + def sam_output = sam_format ? "--out-fmt 'sam'" : '' + """ + ## we add "-no-name ._" to ensure silly Mac OSX metafiles files aren't included + db_name=`find -L ${db} -name "*.1.cf" -not -name "._*" | sed 's/.1.cf//'` + centrifuge \\ + -x \$db_name \\ + -p $task.cpus \\ + $paired \\ + --report-file ${prefix}.report.txt \\ + -S ${prefix}.results.txt \\ + $unaligned \\ + $aligned \\ + $sam_output \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/centrifuge/centrifuge/meta.yml b/modules/nf-core/centrifuge/centrifuge/meta.yml new file mode 100644 index 0000000..a252c00 --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/meta.yml @@ -0,0 +1,66 @@ +name: centrifuge_centrifuge +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - centrifuge: + description: Centrifuge is a classifier for metagenomic sequences. + homepage: https://ccb.jhu.edu/software/centrifuge/ + documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml + doi: 10.1101/gr.210641.116 + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Path to directory containing centrifuge database files + - save_unaligned: + type: value + description: If true unmapped fastq files are saved + - save_aligned: + type: value + description: If true mapped fastq files are saved +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - report: + type: file + description: | + File containing a classification summary + pattern: "*.{report.txt}" + - results: + type: file + description: | + File containing classification results + pattern: "*.{results.txt}" + - fastq_unmapped: + type: file + description: Unmapped fastq files + pattern: "*.unmapped.fastq.gz" + - fastq_mapped: + type: file + description: Mapped fastq files + pattern: "*.mapped.fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@sofstam" + - "@jfy133" + - "@sateeshperi" diff --git a/modules/nf-core/centrifuge/kreport/main.nf b/modules/nf-core/centrifuge/kreport/main.nf new file mode 100644 index 0000000..381ddd6 --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/main.nf @@ -0,0 +1,33 @@ +process CENTRIFUGE_KREPORT { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6': + 'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }" + + input: + tuple val(meta), path(report) + path db + + output: + tuple val(meta), path('*.txt') , emit: kreport + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + db_name=`find -L ${db} -name "*.1.cf" -not -name "._*" | sed 's/.1.cf//'` + centrifuge-kreport -x \$db_name ${report} > ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/centrifuge/kreport/meta.yml b/modules/nf-core/centrifuge/kreport/meta.yml new file mode 100644 index 0000000..822a280 --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/meta.yml @@ -0,0 +1,41 @@ +name: "centrifuge_kreport" +description: Creates Kraken-style reports from centrifuge out files +keywords: + - metagenomics +tools: + - centrifuge: + description: Centrifuge is a classifier for metagenomic sequences. + homepage: https://ccb.jhu.edu/software/centrifuge/ + documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml + doi: 10.1101/gr.210641.116 + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - report: + type: file + description: File containing the centrifuge classification report + pattern: "*.{txt}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - kreport: + type: file + description: | + File containing kraken-style report from centrifuge + out files. + pattern: "*.{txt}" +authors: + - "@sofstam" + - "@jfy133" diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index 787bdb7..d139039 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -1,10 +1,9 @@ #!/usr/bin/env python +import yaml import platform from textwrap import dedent -import yaml - def _make_versions_html(versions): html = [ @@ -59,12 +58,11 @@ versions_by_module = {} for process, process_versions in versions_by_process.items(): module = process.split(":")[-1] try: - if versions_by_module[module] != process_versions: - raise AssertionError( - "We assume that software versions are the same between all modules. " - "If you see this error-message it means you discovered an edge-case " - "and should open an issue in nf-core/tools. " - ) + assert versions_by_module[module] == process_versions, ( + "We assume that software versions are the same between all modules. " + "If you see this error-message it means you discovered an edge-case " + "and should open an issue in nf-core/tools. " + ) except KeyError: versions_by_module[module] = process_versions diff --git a/modules/nf-core/diamond/blastx/main.nf b/modules/nf-core/diamond/blastx/main.nf new file mode 100644 index 0000000..1f4ff25 --- /dev/null +++ b/modules/nf-core/diamond/blastx/main.nf @@ -0,0 +1,68 @@ +process DIAMOND_BLASTX { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? "bioconda::diamond=2.0.15" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/diamond:2.0.15--hb97b32f_0' : + 'quay.io/biocontainers/diamond:2.0.15--hb97b32f_0' }" + + input: + tuple val(meta), path(fasta) + path db + val out_ext + val blast_columns + + output: + tuple val(meta), path('*.blast'), optional: true, emit: blast + tuple val(meta), path('*.xml') , optional: true, emit: xml + tuple val(meta), path('*.txt') , optional: true, emit: txt + tuple val(meta), path('*.daa') , optional: true, emit: daa + tuple val(meta), path('*.sam') , optional: true, emit: sam + tuple val(meta), path('*.tsv') , optional: true, emit: tsv + tuple val(meta), path('*.paf') , optional: true, emit: paf + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def columns = blast_columns ? "${blast_columns}" : '' + switch ( out_ext ) { + case "blast": outfmt = 0; break + case "xml": outfmt = 5; break + case "txt": outfmt = 6; break + case "daa": outfmt = 100; break + case "sam": outfmt = 101; break + case "tsv": outfmt = 102; break + case "paf": outfmt = 103; break + default: + outfmt = '6'; + out_ext = 'txt'; + log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); + break + } + """ + DB=`find -L ./ -name "*.dmnd" | sed 's/.dmnd//'` + + diamond \\ + blastx \\ + --threads $task.cpus \\ + --db \$DB \\ + --query $fasta \\ + --outfmt ${outfmt} ${columns} \\ + $args \\ + --out ${prefix}.${out_ext} \\ + --log + + mv diamond.log ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/diamond/blastx/meta.yml b/modules/nf-core/diamond/blastx/meta.yml new file mode 100644 index 0000000..327b893 --- /dev/null +++ b/modules/nf-core/diamond/blastx/meta.yml @@ -0,0 +1,81 @@ +name: diamond_blastx +description: Queries a DIAMOND database using blastx mode +keywords: + - fasta + - diamond + - blastx + - DNA sequence +tools: + - diamond: + description: Accelerated BLAST compatible local sequence aligner + homepage: https://github.com/bbuchfink/diamond + documentation: https://github.com/bbuchfink/diamond/wiki + tool_dev_url: https://github.com/bbuchfink/diamond + doi: "doi:10.1038/s41592-021-01101-x" + licence: ["GPL v3.0"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file containing query sequences + pattern: "*.{fa,fasta}" + - db: + type: directory + description: Directory containing the nucelotide blast database + pattern: "*" + - out_ext: + type: string + description: | + Specify the type of output file to be generated. `blast` corresponds to + BLAST pairwise format. `xml` corresponds to BLAST xml format. + `txt` corresponds to to BLAST tabular format. `tsv` corresponds to + taxonomic classification format. + pattern: "blast|xml|txt|daa|sam|tsv|paf" + +output: + - blast: + type: file + description: File containing blastp hits + pattern: "*.{blast}" + - xml: + type: file + description: File containing blastp hits + pattern: "*.{xml}" + - txt: + type: file + description: File containing hits in tabular BLAST format. + pattern: "*.{txt}" + - daa: + type: file + description: File containing hits DAA format + pattern: "*.{daa}" + - sam: + type: file + description: File containing aligned reads in SAM format + pattern: "*.{sam}" + - tsv: + type: file + description: Tab separated file containing taxonomic classification of hits + pattern: "*.{tsv}" + - paf: + type: file + description: File containing aligned reads in pairwise mapping format format + pattern: "*.{paf}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - log: + type: file + description: Log file containing stdout information + pattern: "*.{log}" + +authors: + - "@spficklin" + - "@jfy133" + - "@mjamy" diff --git a/modules/nf-core/eido/convert/main.nf b/modules/nf-core/eido/convert/main.nf new file mode 100644 index 0000000..be4c02f --- /dev/null +++ b/modules/nf-core/eido/convert/main.nf @@ -0,0 +1,38 @@ +process EIDO_CONVERT { + tag '$samplesheet' + label 'process_single' + + conda (params.enable_conda ? "conda-forge::eido=0.1.9" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/eido/0.1.9_cv1/eido_0.1.9_cv1.sif' : + 'biocontainers/eido:0.1.9_cv1' }" + + input: + path samplesheet + val format + path pep_input_base_dir + + output: + path "versions.yml" , emit: versions + path "${prefix}.${format}" , emit: samplesheet_converted + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "samplesheet_converted" + """ + eido \\ + convert \\ + -f $format \\ + $samplesheet \\ + $args \\ + -p samples=${prefix}.${format} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + eido: \$(echo \$(eido --version 2>&1) | sed 's/^.*eido //;s/ .*//' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/eido/convert/meta.yml b/modules/nf-core/eido/convert/meta.yml new file mode 100644 index 0000000..bd12e03 --- /dev/null +++ b/modules/nf-core/eido/convert/meta.yml @@ -0,0 +1,39 @@ +name: "eido_convert" +description: Convert any PEP project or Nextflow samplesheet to any format +keywords: + - eido + - convert + - PEP + - format + - samplesheet +tools: + - "eido": + description: "Convert any PEP project or Nextflow samplesheet to any format" + homepage: "http://eido.databio.org/en/latest/" + documentation: "http://eido.databio.org/en/latest/" + doi: "10.1093/gigascience/giab077" + licence: "BSD-2-Clause" + +input: + - samplesheet: + type: file + description: Nextflow samplesheet or PEP project + pattern: "*.{yaml,yml,csv}" + - format: + type: value + description: Extension of an output file + - pep_input_base_dir: + type: file + description: Optional path to the directory where files specified in a PEP config file are stored. Any paths specified in the config will need to be relative to this base directory. + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - samplesheet_converted: + type: file + description: PEP project or samplesheet converted to csv file + +authors: + - "@rafalstepien" diff --git a/modules/nf-core/eido/validate/main.nf b/modules/nf-core/eido/validate/main.nf new file mode 100644 index 0000000..e564e83 --- /dev/null +++ b/modules/nf-core/eido/validate/main.nf @@ -0,0 +1,33 @@ +process EIDO_VALIDATE { + tag '$samplesheet' + label 'process_single' + + conda (params.enable_conda ? "conda-forge::eido=0.1.9" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/eido/0.1.9_cv2/eido_0.1.9_cv2.sif' : + 'biocontainers/eido:0.1.9_cv2' }" + + input: + path samplesheet + path schema + path pep_input_base_dir + + output: + path "versions.yml" , emit: versions + path "*.log" , emit: log + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "validation" + """ + eido validate $args $samplesheet -s $schema -e > ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + eido: \$(echo \$(eido --version 2>&1) | sed 's/^.*eido //;s/ .*//' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/eido/validate/meta.yml b/modules/nf-core/eido/validate/meta.yml new file mode 100644 index 0000000..eb7b295 --- /dev/null +++ b/modules/nf-core/eido/validate/meta.yml @@ -0,0 +1,41 @@ +name: "eido_validate" +description: Validate samplesheet or PEP config against a schema +keywords: + - eido + - validate + - schema + - format + - pep +tools: + - "validate": + description: "Validate samplesheet or PEP config against a schema." + homepage: "http://eido.databio.org/en/latest/" + documentation: "http://eido.databio.org/en/latest/" + doi: "10.1093/gigascience/giab077" + licence: "BSD-2-Clause" + +input: + - samplesheet: + type: file + description: Samplesheet or PEP file to be validated + pattern: "*.{yaml,yml,csv}" + - schema: + type: file + description: Schema that the samplesheet will be validated against + pattern: "*.{yaml,yml}" + - pep_input_base_dir: + type: file + description: Optional path to the directory where files specified in a PEP config file are stored. Any paths specified in the config will need to be relative to this base directory. + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - log: + type: file + description: File containing validation log. + pattern: "*.log" + +authors: + - "@rafalstepien" diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf new file mode 100644 index 0000000..11ea4db --- /dev/null +++ b/modules/nf-core/fastp/main.nf @@ -0,0 +1,98 @@ +process FASTP { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? 'bioconda::fastp=0.23.2' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fastp:0.23.2--h79da9fb_0' : + 'quay.io/biocontainers/fastp:0.23.2--h79da9fb_0' }" + + input: + tuple val(meta), path(reads) + val save_trimmed_fail + val save_merged + + output: + tuple val(meta), path('*.fastp.fastq.gz') , optional:true, emit: reads + tuple val(meta), path('*.json') , emit: json + tuple val(meta), path('*.html') , emit: html + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail + tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + // Added soft-links to original fastqs for consistent naming in MultiQC + // Use single ended for interleaved. Add --interleaved_in in config. + if ( task.ext.args?.contains('--interleaved_in') ) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --stdout \\ + --in1 ${prefix}.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log \\ + | gzip -c > ${prefix}.fastp.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else if (meta.single_end) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --stdout \\ + --in1 ${prefix}.fastq.gz \\ + --out1 ${prefix}.fastp.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else { + def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : '' + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -sf ${reads[1]} ${prefix}_2.fastq.gz + fastp \\ + --in1 ${prefix}_1.fastq.gz \\ + --in2 ${prefix}_2.fastq.gz \\ + --out1 ${prefix}_1.fastp.fastq.gz \\ + --out2 ${prefix}_2.fastp.fastq.gz \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $fail_fastq \\ + $merge_fastq \\ + --thread $task.cpus \\ + --detect_adapter_for_pe \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml new file mode 100644 index 0000000..2368fde --- /dev/null +++ b/modules/nf-core/fastp/meta.yml @@ -0,0 +1,69 @@ +name: fastp +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - quality control + - fastq +tools: + - fastp: + description: | + A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance. + documentation: https://github.com/OpenGene/fastp + doi: https://doi.org/10.1093/bioinformatics/bty560 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information. Use 'single_end: true' to specify single ended or interleaved FASTQs. Use 'single_end: false' for paired-end reads. + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. If you wish to run interleaved paired-end data, supply as single-end data + but with `--interleaved_in` in your `modules.conf`'s `ext.args` for the module. + - save_trimmed_fail: + type: boolean + description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz` + - save_merged: + type: boolean + description: Specify true to save all merged reads to the a file ending in `*.merged.fastq.gz` + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified/unmerged fastq reads + pattern: "*fastp.fastq.gz" + - json: + type: file + description: Results in JSON format + pattern: "*.json" + - html: + type: file + description: Results in HTML format + pattern: "*.html" + - log: + type: file + description: fastq log file + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads_fail: + type: file + description: Reads the failed the preprocessing + pattern: "*fail.fastq.gz" + - reads_merged: + type: file + description: Reads that were successfully merged + pattern: "*.{merged.fastq.gz}" +authors: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/filtlong/main.nf b/modules/nf-core/filtlong/main.nf new file mode 100644 index 0000000..afaa938 --- /dev/null +++ b/modules/nf-core/filtlong/main.nf @@ -0,0 +1,39 @@ +process FILTLONG { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::filtlong=0.2.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/filtlong:0.2.1--h9a82719_0' : + 'quay.io/biocontainers/filtlong:0.2.1--h9a82719_0' }" + + input: + tuple val(meta), path(shortreads), path(longreads) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def short_reads = !shortreads ? "" : meta.single_end ? "-1 $shortreads" : "-1 ${shortreads[0]} -2 ${shortreads[1]}" + if ("$longreads" == "${prefix}.fastq.gz") error "Longread FASTQ input and output names are the same, set prefix in module configuration to disambiguate!" + """ + filtlong \\ + $short_reads \\ + $args \\ + $longreads \\ + 2> ${prefix}.log \\ + | gzip -n > ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + filtlong: \$( filtlong --version | sed -e "s/Filtlong v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/filtlong/meta.yml b/modules/nf-core/filtlong/meta.yml new file mode 100644 index 0000000..a50b452 --- /dev/null +++ b/modules/nf-core/filtlong/meta.yml @@ -0,0 +1,55 @@ +name: filtlong +description: Filtlong filters long reads based on quality measures or short read data. +keywords: + - nanopore + - quality control + - QC + - filtering + - long reads + - short reads +tools: + - filtlong: + description: Filtlong is a tool for filtering long reads. It can take a set of long reads and produce a smaller, better subset. It uses both read length (longer is better) and read identity (higher is better) when choosing which reads pass the filter. + homepage: https://anaconda.org/bioconda/filtlong + documentation: None + tool_dev_url: https://github.com/rrwick/Filtlong + doi: "" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - shortreads: + type: file + description: fastq file + pattern: "*.{fq,fastq,fq.gz,fastq.gz}" + - longreads: + type: file + description: fastq file + pattern: "*.{fq,fastq,fq.gz,fastq.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: Filtered (compressed) fastq file + pattern: "*.fastq.gz" + - log: + type: file + description: Standard error logging file containing summary statistics + pattern: "*.log" + +authors: + - "@d4straub" + - "@sofstam" diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf new file mode 100644 index 0000000..7036704 --- /dev/null +++ b/modules/nf-core/gunzip/main.nf @@ -0,0 +1,44 @@ +process GUNZIP { + tag "$archive" + label 'process_low' + + conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$gunzip"), emit: gunzip + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + gunzip = archive.toString() - '.gz' + """ + gunzip \\ + -f \\ + $args \\ + $archive + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + gunzip = archive.toString() - '.gz' + """ + touch $gunzip + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml new file mode 100644 index 0000000..4d2ebc8 --- /dev/null +++ b/modules/nf-core/gunzip/meta.yml @@ -0,0 +1,34 @@ +name: gunzip +description: Compresses and decompresses files. +keywords: + - gunzip + - compression +tools: + - gunzip: + description: | + gzip is a file format and a software application used for file compression and decompression. + documentation: https://www.gnu.org/software/gzip/manual/gzip.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Optional groovy Map containing meta information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be compressed/uncompressed + pattern: "*.*" +output: + - gunzip: + type: file + description: Compressed/uncompressed file + pattern: "*.*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/kaiju/kaiju/main.nf b/modules/nf-core/kaiju/kaiju/main.nf new file mode 100644 index 0000000..ae8f99e --- /dev/null +++ b/modules/nf-core/kaiju/kaiju/main.nf @@ -0,0 +1,41 @@ +process KAIJU_KAIJU { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::kaiju=1.8.2" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kaiju:1.8.2--h5b5514e_1': + 'quay.io/biocontainers/kaiju:1.8.2--h5b5514e_1' }" + + input: + tuple val(meta), path(reads) + path(db) + + output: + tuple val(meta), path('*.tsv'), emit: results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input = meta.single_end ? "-i ${reads}" : "-i ${reads[0]} -j ${reads[1]}" + """ + dbnodes=`find -L ${db} -name "*nodes.dmp"` + dbname=`find -L ${db} -name "*.fmi" -not -name "._*"` + kaiju \\ + $args \\ + -z $task.cpus \\ + -t \$dbnodes \\ + -f \$dbname \\ + -o ${prefix}.tsv \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kaiju/kaiju/meta.yml b/modules/nf-core/kaiju/kaiju/meta.yml new file mode 100644 index 0000000..e24c8ef --- /dev/null +++ b/modules/nf-core/kaiju/kaiju/meta.yml @@ -0,0 +1,53 @@ +name: kaiju_kaiju +description: Taxonomic classification of metagenomic sequence data using a protein reference database +keywords: + - classify + - metagenomics + - fastq + - taxonomic profiling +tools: + - kaiju: + description: Fast and sensitive taxonomic classification for metagenomics + homepage: https://kaiju.binf.ku.dk/ + documentation: https://github.com/bioinformatics-centre/kaiju/blob/master/README.md + tool_dev_url: https://github.com/bioinformatics-centre/kaiju + doi: "10.1038/ncomms11257" + licence: ["GNU GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input fastq/fasta files of size 1 and 2 for single-end and paired-end data, + respectively. + pattern: "*.{fastq,fq,fasta,fa,fsa,fas,fna,fastq.gz,fq.gz,fasta.gz,fa.gz,fsa.gz,fas.gz,fna.gz}" + - db: + type: files + description: | + List containing the database and nodes files for Kaiju + e.g. [ 'database.fmi', 'nodes.dmp' ] + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - results: + type: file + description: Results with taxonomic classification of each read + pattern: "*.tsv" + +authors: + - "@talnor" + - "@sofstam" + - "@jfy133" diff --git a/modules/nf-core/kaiju/kaiju2krona/main.nf b/modules/nf-core/kaiju/kaiju2krona/main.nf new file mode 100644 index 0000000..c95d5a7 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2krona/main.nf @@ -0,0 +1,39 @@ +process KAIJU_KAIJU2KRONA { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::kaiju=1.8.2" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kaiju:1.8.2--h5b5514e_1': + 'quay.io/biocontainers/kaiju:1.8.2--h5b5514e_1' }" + + input: + tuple val(meta), path(tsv) + path(db) + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + dbnodes=`find -L ${db} -name "*nodes.dmp"` + dbnames=`find -L ${db} -name "*names.dmp"` + kaiju2krona \\ + $args \\ + -t \$dbnodes \\ + -n \$dbnames \\ + -i ${tsv} \\ + -o ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kaiju/kaiju2krona/meta.yml b/modules/nf-core/kaiju/kaiju2krona/meta.yml new file mode 100644 index 0000000..a0dc2fd --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2krona/meta.yml @@ -0,0 +1,44 @@ +name: kaiju_kaiju2krona +description: Convert Kaiju's tab-separated output file into a tab-separated text file which can be imported into Krona. +keywords: + - taxonomy + - visualisation + - krona chart + - metagenomics +tools: + - "kaiju": + description: Fast and sensitive taxonomic classification for metagenomics + homepage: https://kaiju.binf.ku.dk/ + documentation: https://github.com/bioinformatics-centre/kaiju/blob/master/README.md + tool_dev_url: https://github.com/bioinformatics-centre/kaiju + doi: "10.1038/ncomms11257" + licence: ["GNU GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tsv: + type: file + description: Kaiju tab-separated output file + pattern: "*.{tsv,txt}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: Krona text-based input file converted from Kaiju report + pattern: "*.{txt,krona}" + +authors: + - "@MillironX" diff --git a/modules/nf-core/kaiju/kaiju2table/main.nf b/modules/nf-core/kaiju/kaiju2table/main.nf new file mode 100644 index 0000000..00739d1 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2table/main.nf @@ -0,0 +1,40 @@ +process KAIJU_KAIJU2TABLE { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::kaiju=1.8.2" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kaiju:1.8.2--h5b5514e_1': + 'quay.io/biocontainers/kaiju:1.8.2--h2e03b76_0' }" + + input: + tuple val(meta), path(results) + path db + val taxon_rank + + output: + tuple val(meta), path('*.txt'), emit: summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + dbnodes=`find -L ${db} -name "*nodes.dmp"` + dbname=`find -L ${db} -name "*.fmi" -not -name "._*"` + kaiju2table $args \\ + -t \$dbnodes \\ + -n \$dbname \\ + -r ${taxon_rank} \\ + -o ${prefix}.txt \\ + ${results} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kaiju/kaiju2table/meta.yml b/modules/nf-core/kaiju/kaiju2table/meta.yml new file mode 100644 index 0000000..bc3e85d --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2table/meta.yml @@ -0,0 +1,50 @@ +name: "kaiju_kaiju2table" +description: write your description here +keywords: + - classify + - metagenomics +tools: + - kaiju: + description: Fast and sensitive taxonomic classification for metagenomics + homepage: https://kaiju.binf.ku.dk/ + documentation: https://github.com/bioinformatics-centre/kaiju/blob/master/README.md + tool_dev_url: https://github.com/bioinformatics-centre/kaiju + doi: "10.1038/ncomms11257" + licence: ["GNU GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - results: + type: file + description: File containing the kaiju classification results + pattern: "*.{txt}" + - taxon_rank: + type: string + description: | + Taxonomic rank to display in report + pattern: "phylum|class|order|family|genus|species" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - results: + type: file + description: | + Summary table for a given taxonomic rank + pattern: "*.{tsv}" + +authors: + - "@sofstam" + - "@talnor" + - "@jfy133" diff --git a/modules/nf-core/kraken2/kraken2/main.nf b/modules/nf-core/kraken2/kraken2/main.nf new file mode 100644 index 0000000..43a1679 --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/main.nf @@ -0,0 +1,58 @@ +process KRAKEN2_KRAKEN2 { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? 'bioconda::kraken2=2.1.2 conda-forge::pigz=2.6' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' : + 'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }" + + input: + tuple val(meta), path(reads) + path db + val save_output_fastqs + val save_reads_assignment + + output: + tuple val(meta), path('*.classified{.,_}*') , optional:true, emit: classified_reads_fastq + tuple val(meta), path('*.unclassified{.,_}*') , optional:true, emit: unclassified_reads_fastq + tuple val(meta), path('*classifiedreads.txt') , optional:true, emit: classified_reads_assignment + tuple val(meta), path('*report.txt') , emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "" : "--paired" + def classified = meta.single_end ? "${prefix}.classified.fastq" : "${prefix}.classified#.fastq" + def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq" + def classified_option = save_output_fastqs ? "--classified-out ${classified}" : "" + def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : "" + def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "" + def compress_reads_command = save_output_fastqs ? "pigz -p $task.cpus *.fastq" : "" + + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --report ${prefix}.kraken2.report.txt \\ + --gzip-compressed \\ + $unclassified_option \\ + $classified_option \\ + $readclassification_option \\ + $paired \\ + $args \\ + $reads + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kraken2/kraken2/meta.yml b/modules/nf-core/kraken2/kraken2/meta.yml new file mode 100644 index 0000000..7129fe3 --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/meta.yml @@ -0,0 +1,75 @@ +name: kraken2_kraken2 +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - kraken2: + description: | + Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads + homepage: https://ccb.jhu.edu/software/kraken2/ + documentation: https://github.com/DerrickWood/kraken2/wiki/Manual + doi: 10.1186/s13059-019-1891-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Kraken2 database + - save_output_fastqs: + type: boolean + description: | + If true, optional commands are added to save classified and unclassified reads + as fastq files + - save_reads_assignment: + type: boolean + description: | + If true, an optional command is added to save a file reporting the taxonomic + classification of each input read +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified_reads_fastq: + type: file + description: | + Reads classified as belonging to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - unclassified_reads_fastq: + type: file + description: | + Reads not classified to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - classified_reads_assignment: + type: file + description: | + Kraken2 output file indicating the taxonomic assignment of + each input read + - report: + type: file + description: | + Kraken2 report containing stats about classified + and not classifed reads. + pattern: "*.{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/krakentools/combinekreports/main.nf b/modules/nf-core/krakentools/combinekreports/main.nf new file mode 100644 index 0000000..849f39c --- /dev/null +++ b/modules/nf-core/krakentools/combinekreports/main.nf @@ -0,0 +1,34 @@ +process KRAKENTOOLS_COMBINEKREPORTS { + label 'process_low' + + conda (params.enable_conda ? "bioconda::krakentools=1.2" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krakentools:1.2--pyh5e36f6f_0': + 'quay.io/biocontainers/krakentools:1.2--pyh5e36f6f_0' }" + + input: + tuple val(meta), path(kreports) + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + combine_kreports.py \\ + -r ${kreports} \\ + -o ${prefix}.txt \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + combine_kreports.py: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/krakentools/combinekreports/meta.yml b/modules/nf-core/krakentools/combinekreports/meta.yml new file mode 100644 index 0000000..213fc8c --- /dev/null +++ b/modules/nf-core/krakentools/combinekreports/meta.yml @@ -0,0 +1,43 @@ +name: krakentools_combinekreports +description: Takes a Kraken report file and prints out a krona-compatible TEXT file +keywords: + - kraken + - krakentools + - metagenomics + - table + - combining + - merging +tools: + - krakentools: + description: KrakenTools is a suite of scripts to be used for post-analysis of Kraken/KrakenUniq/Kraken2/Bracken results. Please cite the relevant paper if using KrakenTools with any of the listed programs. + homepage: https://github.com/jenniferlu717/KrakenTools + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - kreports: + type: file + description: List of kraken-style report files + pattern: "*.{txt,kreport}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: Combined kreport file of all input files + pattern: "*.txt" + +authors: + - "@jfy133" diff --git a/modules/nf-core/krakentools/kreport2krona/main.nf b/modules/nf-core/krakentools/kreport2krona/main.nf new file mode 100644 index 0000000..1fcb1e4 --- /dev/null +++ b/modules/nf-core/krakentools/kreport2krona/main.nf @@ -0,0 +1,36 @@ +process KRAKENTOOLS_KREPORT2KRONA { + tag "$meta.id" + label 'process_low' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. + conda (params.enable_conda ? "bioconda::krakentools=1.2" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krakentools:1.2--pyh5e36f6f_0': + 'quay.io/biocontainers/krakentools:1.2--pyh5e36f6f_0' }" + + input: + tuple val(meta), path(kreport) + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + kreport2krona.py \\ + -r ${kreport} \\ + -o ${prefix}.txt \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kreport2krona.py: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/krakentools/kreport2krona/meta.yml b/modules/nf-core/krakentools/kreport2krona/meta.yml new file mode 100644 index 0000000..2f8a163 --- /dev/null +++ b/modules/nf-core/krakentools/kreport2krona/meta.yml @@ -0,0 +1,41 @@ +name: krakentools_kreport2krona +description: Takes a Kraken report file and prints out a krona-compatible TEXT file +keywords: + - kraken + - krona + - metagenomics + - visualization +tools: + - krakentools: + description: KrakenTools is a suite of scripts to be used for post-analysis of Kraken/KrakenUniq/Kraken2/Bracken results. Please cite the relevant paper if using KrakenTools with any of the listed programs. + homepage: https://github.com/jenniferlu717/KrakenTools + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - kreport: + type: file + description: Kraken report + pattern: "*.{txt,kreport}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - krona: + type: file + description: Krona text-based input file converted from Kraken report + pattern: "*.{txt,krona}" + +authors: + - "@MillironX" diff --git a/modules/nf-core/krona/ktimporttaxonomy/main.nf b/modules/nf-core/krona/ktimporttaxonomy/main.nf new file mode 100644 index 0000000..9b03462 --- /dev/null +++ b/modules/nf-core/krona/ktimporttaxonomy/main.nf @@ -0,0 +1,41 @@ +process KRONA_KTIMPORTTAXONOMY { + tag "${meta.id}" + label 'process_high' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. + conda (params.enable_conda ? "bioconda::krona=2.8" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krona:2.8--pl5262hdfd78af_2' : + 'quay.io/biocontainers/krona:2.8--pl5262hdfd78af_2' }" + + input: + tuple val(meta), path(report) + path taxonomy, stageAs: 'taxonomy.tab' + + output: + tuple val(meta), path ('*.html'), emit: html + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '2.8' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + TAXONOMY=\$(find -L . -name '*.tab' -exec dirname {} \\;) + echo \$TAXONOMY + + ktImportTaxonomy \\ + $args \\ + -o ${prefix}.html \\ + -tax \$TAXONOMY/ \\ + $report + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krona: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/krona/ktimporttaxonomy/meta.yml b/modules/nf-core/krona/ktimporttaxonomy/meta.yml new file mode 100644 index 0000000..0fd7d5f --- /dev/null +++ b/modules/nf-core/krona/ktimporttaxonomy/meta.yml @@ -0,0 +1,48 @@ +name: krona_ktimporttaxonomy +description: KronaTools Import Taxonomy imports taxonomy classifications and produces an interactive Krona plot. +keywords: + - plot + - taxonomy + - interactive + - html + - visualisation + - krona chart +tools: + - krona: + description: Krona Tools is a set of scripts to create Krona charts from several Bioinformatics tools as well as from text and XML files. + homepage: https://github.com/marbl/Krona/wiki/KronaTools + documentation: http://manpages.ubuntu.com/manpages/impish/man1/ktImportTaxonomy.1.html + tool_dev_url: + doi: https://doi.org/10.1186/1471-2105-12-385 + licence: + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - database: + type: file + description: | + Path to a Krona taxonomy .tab file normally downloaded and generated by + krona/ktUpdateTaxonomy. Custom taxonomy files can have any name, but + must end in `.tab`. + pattern: "*tab" + - report: + type: file + description: "A tab-delimited file with taxonomy IDs and (optionally) query IDs, magnitudes, and scores. Query IDs are taken from column 1, taxonomy IDs from column 2, and scores from column 3. Lines beginning with # will be ignored." + pattern: "*.{tsv}" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - html: + type: file + description: A html file containing an interactive krona plot. + pattern: "*.{html}" + +authors: + - "@mjakobs" diff --git a/modules/nf-core/krona/ktimporttext/main.nf b/modules/nf-core/krona/ktimporttext/main.nf new file mode 100644 index 0000000..de0cfc2 --- /dev/null +++ b/modules/nf-core/krona/ktimporttext/main.nf @@ -0,0 +1,34 @@ +process KRONA_KTIMPORTTEXT { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::krona=2.8.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krona:2.8.1--pl5321hdfd78af_1': + 'quay.io/biocontainers/krona:2.8.1--pl5321hdfd78af_1' }" + + input: + tuple val(meta), path(report) + + output: + tuple val(meta), path ('*.html'), emit: html + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + ktImportText \\ + $args \\ + -o ${prefix}.html \\ + $report + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krona: \$( echo \$(ktImportText 2>&1) | sed 's/^.*KronaTools //g; s/- ktImportText.*\$//g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/krona/ktimporttext/meta.yml b/modules/nf-core/krona/ktimporttext/meta.yml new file mode 100644 index 0000000..a7108e0 --- /dev/null +++ b/modules/nf-core/krona/ktimporttext/meta.yml @@ -0,0 +1,47 @@ +name: "krona_ktimporttext" +description: Creates a Krona chart from text files listing quantities and lineages. +keywords: + - plot + - taxonomy + - interactive + - html + - visualisation + - krona chart + - metagenomics +tools: + - krona: + description: Krona Tools is a set of scripts to create Krona charts from several Bioinformatics tools as well as from text and XML files. + homepage: https://github.com/marbl/Krona/wiki/KronaTools + documentation: http://manpages.ubuntu.com/manpages/impish/man1/ktImportTaxonomy.1.html + tool_dev_url: https://github.com/marbl/Krona + doi: 10.1186/1471-2105-12-385 + licence: https://raw.githubusercontent.com/marbl/Krona/master/KronaTools/LICENSE.txt + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - report: + type: file + description: "Tab-delimited text file. Each line should be a number followed by a list of wedges to contribute to (starting from the highest level). If no wedges are listed (and just a quantity is given), it will contribute to the top level. If the same lineage is listed more than once, the values will be added. Quantities can be omitted if -q is specified. Lines beginning with '#' will be ignored." + pattern: "*.{txt}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - html: + type: file + description: A html file containing an interactive krona plot. + pattern: "*.{html}" + +authors: + - "@jianhong" diff --git a/modules/nf-core/malt/run/main.nf b/modules/nf-core/malt/run/main.nf new file mode 100644 index 0000000..2b91d90 --- /dev/null +++ b/modules/nf-core/malt/run/main.nf @@ -0,0 +1,49 @@ +process MALT_RUN { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::malt=0.41" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/malt:0.41--1' : + 'quay.io/biocontainers/malt:0.41--1' }" + + input: + tuple val(meta), path(fastqs) + val mode + path index + + output: + tuple val(meta), path("*.rma6") , emit: rma6 + tuple val(meta), path("*.{tab,text,sam}"), optional:true, emit: alignments + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def avail_mem = 6 + if (!task.memory) { + log.info '[MALT_RUN] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.' + } else { + avail_mem = task.memory.giga + } + + """ + malt-run \\ + -t $task.cpus \\ + -v \\ + -o . \\ + $args \\ + --inFile ${fastqs.join(' ')} \\ + -m $mode \\ + --index $index/ |&tee ${prefix}-malt-run.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + malt: \$(malt-run --help 2>&1 | grep -o 'version.* ' | cut -f 1 -d ',' | cut -f2 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/malt/run/meta.yml b/modules/nf-core/malt/run/meta.yml new file mode 100644 index 0000000..66f2d7a --- /dev/null +++ b/modules/nf-core/malt/run/meta.yml @@ -0,0 +1,58 @@ +name: malt_run +description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics. +keywords: + - malt + - alignment + - metagenomics + - ancient DNA + - aDNA + - palaeogenomics + - archaeogenomics + - microbiome +tools: + - malt: + description: A tool for mapping metagenomic data + homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/ + documentation: https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf + tool_dev_url: None + doi: "10.1038/s41559-017-0446-6" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastqs: + type: file + description: Input FASTQ files + pattern: "*.{fastq.gz,fq.gz}" + - mode: + type: string + description: Program mode + pattern: "Unknown|BlastN|BlastP|BlastX|Classifier" + - index: + type: directory + description: Index/database directory from malt-build + pattern: "*/" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - rma6: + type: file + description: MEGAN6 RMA6 file + pattern: "*.rma6" + - sam: + type: file + description: Alignment files in Tab, Text or MEGAN-compatible SAM format + pattern: "*.{tab,txt,sam}" + - log: + type: file + description: Log of verbose MALT stdout + pattern: "*-malt-run.log" + +authors: + - "@jfy133" diff --git a/modules/nf-core/megan/rma2info/main.nf b/modules/nf-core/megan/rma2info/main.nf new file mode 100644 index 0000000..80d1975 --- /dev/null +++ b/modules/nf-core/megan/rma2info/main.nf @@ -0,0 +1,38 @@ +process MEGAN_RMA2INFO { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::megan=6.21.7" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/megan:6.21.7--h9ee0642_0': + 'quay.io/biocontainers/megan:6.21.7--h9ee0642_0' }" + + input: + tuple val(meta), path(rma6) + val(megan_summary) + + output: + tuple val(meta), path("*.txt.gz") , emit: txt + tuple val(meta), path("*.megan"), optional: true, emit: megan_summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def summary = megan_summary ? "-es ${prefix}.megan" : "" + """ + rma2info \\ + -i ${rma6} \\ + -o ${prefix}.txt.gz \\ + ${summary} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + megan: \$(echo \$(rma2info 2>&1) | grep version | sed 's/.*version //g;s/, built.*//g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/megan/rma2info/meta.yml b/modules/nf-core/megan/rma2info/meta.yml new file mode 100644 index 0000000..0f2d5a9 --- /dev/null +++ b/modules/nf-core/megan/rma2info/meta.yml @@ -0,0 +1,51 @@ +name: "megan_rma2info" +description: Analyses an RMA file and exports information in text format +keywords: + - megan + - rma6 + - classification + - conversion +tools: + - "megan": + description: "A tool for studying the taxonomic content of a set of DNA reads" + homepage: "https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/" + documentation: "https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html" + tool_dev_url: "https://github.com/husonlab/megan-ce" + doi: "10.1371/journal.pcbi.1004957" + licence: "['GPL >=3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - rma6: + type: file + description: RMA6 file from MEGAN or MALT + pattern: "*.rma6" + - megan_summary: + type: boolean + description: Specify whether to generate an MEGAN summary file + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: Compressed text file + pattern: "*.txt.gz" + - megan_summary: + type: file + description: Optionally generated MEGAN summary file + pattern: "*.megan" + +authors: + - "@jfy133" diff --git a/modules/nf-core/metaphlan3/mergemetaphlantables/main.nf b/modules/nf-core/metaphlan3/mergemetaphlantables/main.nf new file mode 100644 index 0000000..7c37eca --- /dev/null +++ b/modules/nf-core/metaphlan3/mergemetaphlantables/main.nf @@ -0,0 +1,33 @@ +process METAPHLAN3_MERGEMETAPHLANTABLES { + label 'process_single' + + conda (params.enable_conda ? 'bioconda::metaphlan=3.0.12' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/metaphlan:3.0.12--pyhb7b1952_0' : + 'quay.io/biocontainers/metaphlan:3.0.12--pyhb7b1952_0' }" + + input: + tuple val(meta), path(profiles) + + output: + tuple val(meta), path("${prefix}.txt") , emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + merge_metaphlan_tables.py \\ + $args \\ + -o ${prefix}.txt \\ + ${profiles} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + metaphlan3: \$(metaphlan --version 2>&1 | awk '{print \$3}') + END_VERSIONS + """ +} diff --git a/modules/nf-core/metaphlan3/mergemetaphlantables/meta.yml b/modules/nf-core/metaphlan3/mergemetaphlantables/meta.yml new file mode 100644 index 0000000..365973e --- /dev/null +++ b/modules/nf-core/metaphlan3/mergemetaphlantables/meta.yml @@ -0,0 +1,44 @@ +name: "metaphlan3_mergemetaphlantables" +description: Merges output abundance tables from MetaPhlAn3 +keywords: + - metagenomics + - classification + - merge + - table + - profiles +tools: + - metaphlan3: + description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance + homepage: https://huttenhower.sph.harvard.edu/metaphlan/ + documentation: https://github.com/biobakery/MetaPhlAn + doi: "10.7554/eLife.65088" + licence: ["MIT License"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - profiles: + type: file + description: List of per-sample MetaPhlAn3 taxonomic abundance tables + pattern: "*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: txt + description: Combined MetaPhlAn3 table + pattern: "*.txt" + +authors: + - "@jfy133" diff --git a/modules/nf-core/metaphlan3/metaphlan3/main.nf b/modules/nf-core/metaphlan3/metaphlan3/main.nf new file mode 100644 index 0000000..1453466 --- /dev/null +++ b/modules/nf-core/metaphlan3/metaphlan3/main.nf @@ -0,0 +1,48 @@ +process METAPHLAN3_METAPHLAN3 { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? 'bioconda::metaphlan=3.0.12' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/metaphlan:3.0.12--pyhb7b1952_0' : + 'quay.io/biocontainers/metaphlan:3.0.12--pyhb7b1952_0' }" + + input: + tuple val(meta), path(input) + path metaphlan_db + + output: + tuple val(meta), path("*_profile.txt") , emit: profile + tuple val(meta), path("*.biom") , emit: biom + tuple val(meta), path('*.bowtie2out.txt'), optional:true, emit: bt2out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input_type = ("$input".endsWith(".fastq.gz") || "$input".endsWith(".fq.gz")) ? "--input_type fastq" : ("$input".contains(".fasta")) ? "--input_type fasta" : ("$input".endsWith(".bowtie2out.txt")) ? "--input_type bowtie2out" : "--input_type sam" + def input_data = ("$input_type".contains("fastq")) && !meta.single_end ? "${input[0]},${input[1]}" : "$input" + def bowtie2_out = "$input_type" == "--input_type bowtie2out" || "$input_type" == "--input_type sam" ? '' : "--bowtie2out ${prefix}.bowtie2out.txt" + + """ + BT2_DB=`find -L "${metaphlan_db}" -name "*rev.1.bt2" -exec dirname {} \\;` + + metaphlan \\ + --nproc $task.cpus \\ + $input_type \\ + $input_data \\ + $args \\ + $bowtie2_out \\ + --bowtie2db \$BT2_DB \\ + --biom ${prefix}.biom \\ + --output_file ${prefix}_profile.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + metaphlan3: \$(metaphlan --version 2>&1 | awk '{print \$3}') + END_VERSIONS + """ +} diff --git a/modules/nf-core/metaphlan3/metaphlan3/meta.yml b/modules/nf-core/metaphlan3/metaphlan3/meta.yml new file mode 100644 index 0000000..659d83a --- /dev/null +++ b/modules/nf-core/metaphlan3/metaphlan3/meta.yml @@ -0,0 +1,58 @@ +name: metaphlan3_metaphlan3 +description: MetaPhlAn is a tool for profiling the composition of microbial communities from metagenomic shotgun sequencing data. +keywords: + - metagenomics + - classification + - fastq + - bam + - fasta +tools: + - metaphlan3: + description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance + homepage: https://huttenhower.sph.harvard.edu/metaphlan/ + documentation: https://github.com/biobakery/MetaPhlAn + doi: "10.7554/eLife.65088" + licence: ["MIT License"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: Metaphlan 3.0 can classify the metagenome from a variety of input data types, including FASTQ files (single-end and paired-end), FASTA, bowtie2-produced SAM files (produced from alignments to the MetaPHlAn marker database) and intermediate bowtie2 alignment files (bowtie2out) + pattern: "*.{fastq.gz, fasta, fasta.gz, sam, bowtie2out.txt}" + - metaphlan_db: + type: file + description: | + Directory containing pre-downloaded and uncompressed MetaPhlAn3 database downloaded from: http://cmprod1.cibio.unitn.it/biobakery3/metaphlan_databases/. + Note that you will also need to specify `--index` and the database version name (e.g. 'mpa_v31_CHOCOPhlAn_201901') in your module.conf ext.args for METAPHLAN3_METAPHLAN3! + pattern: "*/" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - profile: + type: file + description: Tab-separated output file of the predicted taxon relative abundances + pattern: "*.{txt}" + - biom: + type: file + description: General-use format for representing biological sample by observation contingency tables + pattern: "*.{biom}" + - bowtie2out: + type: file + description: Intermediate Bowtie2 output produced from mapping the metagenome against the MetaPHlAn marker database ( not compatible with `bowtie2out` files generated with MetaPhlAn versions below 3 ) + pattern: "*.{bowtie2out.txt}" + +authors: + - "@MGordon09" diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf new file mode 100644 index 0000000..08ac6ee --- /dev/null +++ b/modules/nf-core/minimap2/align/main.nf @@ -0,0 +1,48 @@ +process MINIMAP2_ALIGN { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? 'bioconda::minimap2=2.21 bioconda::samtools=1.12' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : + 'quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" + + input: + tuple val(meta), path(reads) + path reference + val bam_format + val cigar_paf_format + val cigar_bam + + output: + tuple val(meta), path("*.paf"), optional: true, emit: paf + tuple val(meta), path("*.bam"), optional: true, emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input_reads = meta.single_end ? "$reads" : "${reads[0]} ${reads[1]}" + def bam_output = bam_format ? "-a | samtools sort | samtools view -@ ${task.cpus} -b -h -o ${prefix}.bam" : "-o ${prefix}.paf" + def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' + def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' + """ + minimap2 \\ + $args \\ + -t $task.cpus \\ + $reference \\ + $input_reads \\ + $cigar_paf \\ + $set_cigar_bam \\ + $bam_output + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimap2/align/meta.yml b/modules/nf-core/minimap2/align/meta.yml new file mode 100644 index 0000000..991b39a --- /dev/null +++ b/modules/nf-core/minimap2/align/meta.yml @@ -0,0 +1,65 @@ +name: minimap2_align +description: A versatile pairwise aligner for genomic and spliced nucleotide sequences +keywords: + - align + - fasta + - fastq + - genome + - paf + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FASTA or FASTQ files of size 1 and 2 for single-end + and paired-end data, respectively. + - reference: + type: file + description: | + Reference database in FASTA format. + - bam_format: + type: boolean + description: Specify that output should be in BAM format + - cigar_paf_format: + type: boolean + description: Specify that output CIGAR should be in PAF format + - cigar_bam: + type: boolean + description: | + Write CIGAR with >65535 ops at the CG tag. This is recommended when + doing XYZ (https://github.com/lh3/minimap2#working-with-65535-cigar-operations) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - paf: + type: file + description: Alignment in PAF format + pattern: "*.paf" + - bam: + type: file + description: Alignment in BAM format + pattern: "*.bam" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" diff --git a/modules/nf-core/minimap2/index/main.nf b/modules/nf-core/minimap2/index/main.nf new file mode 100644 index 0000000..3dfeb86 --- /dev/null +++ b/modules/nf-core/minimap2/index/main.nf @@ -0,0 +1,33 @@ +process MINIMAP2_INDEX { + label 'process_medium' + + conda (params.enable_conda ? 'bioconda::minimap2=2.21' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/minimap2:2.21--h5bf99c6_0' : + 'quay.io/biocontainers/minimap2:2.21--h5bf99c6_0' }" + + input: + path fasta + + output: + path "*.mmi" , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + minimap2 \\ + -t $task.cpus \\ + -d ${fasta.baseName}.mmi \\ + $args \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimap2/index/meta.yml b/modules/nf-core/minimap2/index/meta.yml new file mode 100644 index 0000000..3bf9f04 --- /dev/null +++ b/modules/nf-core/minimap2/index/meta.yml @@ -0,0 +1,30 @@ +name: minimap2_index +description: Provides fasta index required by minimap2 alignment. +keywords: + - index + - fasta + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - fasta: + type: file + description: | + Reference database in FASTA format. +output: + - mmi: + type: file + description: Minimap2 fasta index. + pattern: "*.mmi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@yuukiiwa" + - "@drpatelh" diff --git a/modules/nf-core/motus/merge/main.nf b/modules/nf-core/motus/merge/main.nf new file mode 100644 index 0000000..a050c7a --- /dev/null +++ b/modules/nf-core/motus/merge/main.nf @@ -0,0 +1,47 @@ +VERSION = '3.0.1' + +process MOTUS_MERGE { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::motus=3.0.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/motus:3.0.1--pyhdfd78af_0': + 'quay.io/biocontainers/motus:3.0.1--pyhdfd78af_0' }" + + input: + tuple val(meta), path(input) + path db // to stop docker saying it can't find it... would have to have the module in upstream steps anyway + path profile_version_yml, stageAs: 'profile_version.yml' + + output: + tuple val(meta), path("*.txt") , optional: true, emit: txt + tuple val(meta), path("*.biom"), optional: true, emit: biom + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def cmd_input = input.size() > 1 ? "-i ${input.join(',')}" : input.isDirectory() ? "-d ${input}" : "-i ${input}" + def suffix = task.ext.args?.contains("-B") ? "biom" : "txt" + """ + motus \\ + merge \\ + -db $db \\ + ${cmd_input} \\ + $args \\ + -o ${prefix}.${suffix} + + ## Take version from the mOTUs/profile module output, as cannot reconstruct + ## version without having database staged in this directory. + VERSION=\$(cat ${profile_version_yml} | grep '/*motus:.*' | sed 's/.*otus: //g') + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + motus: \$VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/motus/merge/meta.yml b/modules/nf-core/motus/merge/meta.yml new file mode 100644 index 0000000..02d11d0 --- /dev/null +++ b/modules/nf-core/motus/merge/meta.yml @@ -0,0 +1,54 @@ +name: "motus_merge" +description: Taxonomic meta-omics profiling using universal marker genes +keywords: + - classify + - metagenomics + - fastq + - taxonomic profiling + - merging + - merge + - otu table +tools: + - "motus": + description: "Marker gene-based OTU (mOTU) profiling" + homepage: "https://motu-tool.org/" + documentation: "https://github.com/motu-tool/mOTUs/wiki" + tool_dev_url: "https://github.com/motu-tool/mOTUs" + doi: "10.1038/s41467-019-08844-4" + licence: "['GPL v3']" + +input: + - input: + type: file + description: | + List of output files (more than one) from motus profile, + or a single directory containing motus output files. + - db: + type: directory + description: | + mOTUs database downloaded by `motus downloadDB` + pattern: "db_mOTU/" + - profile_version_yml: + type: file + description: | + A single versions.yml file output from motus/profile. motus/merge cannot reconstruct + this itself without having the motus database present and configured with the tool + so here we take it from what is already reported by the upstream module. + pattern: "versions.yml" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: OTU table in txt format, if BIOM format not requested + pattern: "*.txt" + - biom: + type: file + description: OTU table in biom format, if BIOM format requested + pattern: "*.biom" + +authors: + - "@jfy133" diff --git a/modules/nf-core/motus/profile/main.nf b/modules/nf-core/motus/profile/main.nf new file mode 100644 index 0000000..bd7a127 --- /dev/null +++ b/modules/nf-core/motus/profile/main.nf @@ -0,0 +1,56 @@ +process MOTUS_PROFILE { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? "bioconda::motus=3.0.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/motus:3.0.1--pyhdfd78af_0': + 'quay.io/biocontainers/motus:3.0.1--pyhdfd78af_0' }" + + input: + tuple val(meta), path(reads) + path db + + output: + tuple val(meta), path("*.out"), emit: out + tuple val(meta), path("*.bam"), optional: true, emit: bam + tuple val(meta), path("*.mgc"), optional: true, emit: mgc + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def inputs = reads[0].getExtension() == 'bam' ? + "-i ${reads}" : + reads[0].getExtension() == 'mgc' ? "-m $reads" : + meta.single_end ? + "-s $reads" : "-f ${reads[0]} -r ${reads[1]}" + def refdb = db ? "-db ${db}" : "" + """ + motus profile \\ + $args \\ + $inputs \\ + $refdb \\ + -t $task.cpus \\ + -n $prefix \\ + -o ${prefix}.out \\ + 2> ${prefix}.log + + ## mOTUs version number is not available from command line. + ## mOTUs save the version number in index database folder. + ## mOTUs will check the database version is same version as exec version. + if [ "$db" == "" ]; then + VERSION=\$(echo \$(motus -h 2>&1) | sed 's/^.*Version: //; s/References.*\$//') + else + VERSION=\$(grep motus $db/db_mOTU_versions | sed 's/motus\\t//g') + fi + cat <<-END_VERSIONS > versions.yml + "${task.process}": + motus: \$VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/motus/profile/meta.yml b/modules/nf-core/motus/profile/meta.yml new file mode 100644 index 0000000..3c3b660 --- /dev/null +++ b/modules/nf-core/motus/profile/meta.yml @@ -0,0 +1,65 @@ +name: "motus_profile" +description: Taxonomic meta-omics profiling using universal marker genes +keywords: + - classify + - metagenomics + - fastq + - taxonomic profiling +tools: + - "motus": + description: "Marker gene-based OTU (mOTU) profiling" + homepage: "https://motu-tool.org/" + documentation: "https://github.com/motu-tool/mOTUs/wiki" + tool_dev_url: "https://github.com/motu-tool/mOTUs" + doi: "10.1038/s41467-019-08844-4" + licence: "['GPL v3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input fastq/fasta files of size 1 and 2 for single-end and paired-end data, + respectively. + Or the intermediate bam file mapped by bwa to the mOTUs database or + the output bam file from motus profile. + Or the intermediate mgc read counts table. + pattern: "*.{fastq,fq,fasta,fa,fastq.gz,fq.gz,fasta.gz,fa.gz,.bam,.mgc}" + - db: + type: directory + description: | + mOTUs database downloaded by `motus downloadDB` + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - out: + type: file + description: Results with taxonomic classification of each read + pattern: "*.out" + - bam: + type: file + description: Optional intermediate sorted BAM file from BWA + pattern: "*.{bam}" + - mgc: + type: file + description: Optional intermediate mgc read count table file saved with `-M`. + pattern: "*.{mgc}" + - log: + type: file + description: Standard error logging file containing summary statistics + pattern: "*.log" + +authors: + - "@jianhong" diff --git a/modules/nf-core/porechop/main.nf b/modules/nf-core/porechop/main.nf new file mode 100644 index 0000000..77050bc --- /dev/null +++ b/modules/nf-core/porechop/main.nf @@ -0,0 +1,37 @@ +process PORECHOP { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? "bioconda::porechop=0.2.4" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/porechop:0.2.4--py39h7cff6ad_2' : + 'quay.io/biocontainers/porechop:0.2.4--py39h7cff6ad_2' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + porechop \\ + -i $reads \\ + -t $task.cpus \\ + $args \\ + -o ${prefix}.fastq.gz \\ + > ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + porechop: \$( porechop --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/porechop/meta.yml b/modules/nf-core/porechop/meta.yml new file mode 100644 index 0000000..e526317 --- /dev/null +++ b/modules/nf-core/porechop/meta.yml @@ -0,0 +1,55 @@ +name: porechop +description: Adapter removal and demultiplexing of Oxford Nanopore reads +keywords: + - adapter + - nanopore + - demultiplexing +tools: + - porechop: + description: Adapter removal and demultiplexing of Oxford Nanopore reads + homepage: "https://github.com/rrwick/Porechop" + documentation: "https://github.com/rrwick/Porechop" + tool_dev_url: "https://github.com/rrwick/Porechop" + doi: "10.1099/mgen.0.000132" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: fastq/fastq.gz file + pattern: "*.{fastq,fastq.gz,fq,fq.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: Demultiplexed and/or adapter-trimmed fastq.gz file + pattern: "*.{fastq.gz}" + - log: + type: file + description: Log file containing stdout information + pattern: "*.log" + +authors: + - "@ggabernet" + - "@jasmezz" + - "@d4straub" + - "@LaurenceKuhl" + - "@SusiJo" + - "@jonasscheid" + - "@jonoave" + - "@GokceOGUZ" + - "@jfy133" diff --git a/modules/nf-core/prinseqplusplus/main.nf b/modules/nf-core/prinseqplusplus/main.nf new file mode 100644 index 0000000..ebd8c58 --- /dev/null +++ b/modules/nf-core/prinseqplusplus/main.nf @@ -0,0 +1,61 @@ +process PRINSEQPLUSPLUS { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::prinseq-plus-plus=1.2.3" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/prinseq-plus-plus:1.2.3--hc90279e_1': + 'quay.io/biocontainers/prinseq-plus-plus:1.2.3--hc90279e_1' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*_good_out*.fastq.gz") , emit: good_reads + tuple val(meta), path("*_single_out*.fastq.gz"), optional: true, emit: single_reads + tuple val(meta), path("*_bad_out*.fastq.gz") , optional: true, emit: bad_reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (meta.single_end) { + """ + prinseq++ \\ + -threads $task.cpus \\ + -fastq ${reads} \\ + -out_name ${prefix} \\ + -out_gz \\ + -VERBOSE 1 \\ + $args \\ + | tee ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prinseqplusplus: \$(echo \$(prinseq++ --version | cut -f 2 -d ' ' )) + END_VERSIONS + """ + } else { + """ + prinseq++ \\ + -threads $task.cpus \\ + -fastq ${reads[0]} \\ + -fastq2 ${reads[1]} \\ + -out_name ${prefix} \\ + -out_gz \\ + -VERBOSE 1 \\ + $args \\ + | tee ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prinseqplusplus: \$(echo \$(prinseq++ --version | cut -f 2 -d ' ' )) + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/prinseqplusplus/meta.yml b/modules/nf-core/prinseqplusplus/meta.yml new file mode 100644 index 0000000..8155df9 --- /dev/null +++ b/modules/nf-core/prinseqplusplus/meta.yml @@ -0,0 +1,60 @@ +name: "prinseqplusplus" +description: PRINSEQ++ is a C++ implementation of the prinseq-lite.pl program. It can be used to filter, reformat or trim genomic and metagenomic sequence data +keywords: + - fastq + - fasta + - filter + - trim +tools: + - "prinseqplusplus": + description: "PRINSEQ++ - Multi-threaded C++ sequence cleaning" + homepage: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus" + documentation: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus" + tool_dev_url: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus" + doi: "10.7287/peerj.preprints.27553v1" + licence: "['GPL v2']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end + data, respectively. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - good_reads: + type: file + description: Reads passing filter(s) in gzipped FASTQ format + pattern: "*_good_out_{R1,R2}.fastq.gz" + - single_reads: + type: file + description: | + Single reads without the pair passing filter(s) in gzipped FASTQ format + pattern: "*_single_out_{R1,R2}.fastq.gz" + - bad_reads: + type: file + description: | + Reads without not passing filter(s) in gzipped FASTQ format + pattern: "*_bad_out_{R1,R2}.fastq.gz" + - log: + type: file + description: | + Verbose level 2 STDOUT information in a log file + pattern: "*.log" + +authors: + - "@jfy133" diff --git a/modules/nf-core/samtools/bam2fq/main.nf b/modules/nf-core/samtools/bam2fq/main.nf new file mode 100644 index 0000000..9301d1d --- /dev/null +++ b/modules/nf-core/samtools/bam2fq/main.nf @@ -0,0 +1,56 @@ +process SAMTOOLS_BAM2FQ { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : + 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + + input: + tuple val(meta), path(inputbam) + val split + + output: + tuple val(meta), path("*.fq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (split){ + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + -1 ${prefix}_1.fq.gz \\ + -2 ${prefix}_2.fq.gz \\ + -0 ${prefix}_other.fq.gz \\ + -s ${prefix}_singleton.fq.gz \\ + $inputbam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } else { + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + $inputbam | gzip --no-name > ${prefix}_interleaved.fq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/samtools/bam2fq/meta.yml b/modules/nf-core/samtools/bam2fq/meta.yml new file mode 100644 index 0000000..319a60c --- /dev/null +++ b/modules/nf-core/samtools/bam2fq/meta.yml @@ -0,0 +1,55 @@ +name: samtools_bam2fq +description: | + The module uses bam2fq method from samtools to + convert a SAM, BAM or CRAM file to FASTQ format +keywords: + - bam2fq + - samtools + - fastq +tools: + - samtools: + description: Tools for dealing with SAM, BAM and CRAM files + homepage: None + documentation: http://www.htslib.org/doc/1.1/samtools.html + tool_dev_url: None + doi: "" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - inputbam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - split: + type: boolean + description: | + TRUE/FALSE value to indicate if reads should be separated into + /1, /2 and if present other, or singleton. + Note: choosing TRUE will generate 4 different files. + Choosing FALSE will produce a single file, which will be interleaved in case + the input contains paired reads. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: | + FASTQ files, which will be either a group of 4 files (read_1, read_2, other and singleton) + or a single interleaved .fq.gz file if the user chooses not to split the reads. + pattern: "*.fq.gz" + +authors: + - "@lescai" diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf new file mode 100644 index 0000000..59ded5c --- /dev/null +++ b/modules/nf-core/samtools/view/main.nf @@ -0,0 +1,56 @@ +process SAMTOOLS_VIEW { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : + 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + + input: + tuple val(meta), path(input), path(index) + path fasta + + output: + tuple val(meta), path("*.bam") , emit: bam , optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta} -C" : "" + def file_type = input.getExtension() + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools \\ + view \\ + --threads ${task.cpus-1} \\ + ${reference} \\ + $args \\ + $input \\ + $args2 \\ + > ${prefix}.${file_type} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.cram + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml new file mode 100644 index 0000000..a8b43ec --- /dev/null +++ b/modules/nf-core/samtools/view/meta.yml @@ -0,0 +1,57 @@ +name: samtools_view +description: filter/convert SAM/BAM/CRAM file +keywords: + - view + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: hhttp://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - index: + type: optional file + description: BAM.BAI/CRAM.CRAI file + pattern: "*.{.bai,.crai}" + - fasta: + type: optional file + description: Reference file the CRAM was created with + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: filtered/converted BAM/SAM file + pattern: "*.{bam,sam}" + - cram: + type: file + description: filtered/converted CRAM file + pattern: "*.cram" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf new file mode 100644 index 0000000..007871b --- /dev/null +++ b/modules/nf-core/untar/main.nf @@ -0,0 +1,64 @@ +process UNTAR { + tag "$archive" + label 'process_single' + + conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$untar"), emit: untar + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + untar = archive.toString() - '.tar.gz' + + """ + mkdir output + + ## Ensures --strip-components only applied when top level of tar contents is a directory + ## If just files or multiple directories, place all in output + if [[ \$(tar -tzf ${archive} | grep "/\$" | wc -l) -eq 1 ]]; then + tar \\ + -C output --strip-components 1 \\ + -xzvf \\ + $args \\ + $archive \\ + $args2 + else + tar \\ + -C output \\ + -xzvf \\ + $args \\ + $archive \\ + $args2 + fi + + mv output ${untar} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + untar = archive.toString() - '.tar.gz' + """ + touch $untar + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 0000000..ea7a3f3 --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,40 @@ +name: untar +description: Extract files. +keywords: + - untar + - uncompress +tools: + - untar: + description: | + Extract tar.gz files. + documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be untar + pattern: "*.{tar}.{gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - untar: + type: directory + description: Directory containing contents of archive + pattern: "*/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 4afe2f5..efb5aff 100644 --- a/nextflow.config +++ b/nextflow.config @@ -37,7 +37,7 @@ params { help = false validate_params = true show_hidden_params = false - schema_ignore_params = 'genomes' + schema_ignore_params = 'genomes,fasta' enable_conda = false @@ -56,6 +56,91 @@ params { max_cpus = 16 max_time = '240.h' + // Databases + databases = null + + // FASTQ preprocessing + perform_shortread_qc = false + shortread_qc_tool = 'fastp' + shortread_qc_skipadaptertrim = false + shortread_qc_mergepairs = true + shortread_qc_excludeunmerged = false + shortread_qc_adapter1 = null + shortread_qc_adapter2 = null + shortread_qc_minlength = 15 + + perform_longread_qc = false + longread_qc_skipadaptertrim = false + longread_qc_skipqualityfilter = false + longread_qc_qualityfilter_minlength = 1000 + longread_qc_qualityfilter_keeppercent = 90 + longread_qc_qualityfilter_targetbases = 500000000 + + save_preprocessed_reads = false + + // Complexity filtering + perform_shortread_complexityfilter = false + shortread_complexityfilter_tool = 'bbduk' + shortread_complexityfilter_entropy = 0.3 + shortread_complexityfilter_bbduk_windowsize = 50 + shortread_complexityfilter_bbduk_mask = false + shortread_complexityfilter_prinseqplusplus_mode = 'entropy' + shortread_complexityfilter_prinseqplusplus_dustscore = 0.5 + shortread_complexityfilter_fastp_threshold = 30 + save_complexityfiltered_reads = false + + // run merging + perform_runmerging = false + save_runmerged_reads = false + + // Host Removal + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + hostremoval_reference = null + shortread_hostremoval_index = null + longread_hostremoval_index = null + save_hostremoval_index = false + save_hostremoval_mapped = false + save_hostremoval_unmapped = false + + + // MALT + run_malt = false + malt_mode = 'BlastN' + malt_generate_megansummary = false + malt_save_reads = false // added via map + database args extension in profiling.nf + + // kraken2 + run_kraken2 = false + kraken2_save_reads = false // added directly to module in profiling.nf + kraken2_save_readclassification = false // added directly to module in profiling.nf + + // centrifuge + run_centrifuge = false + centrifuge_save_reads = false // added directly to module in profiling.nf + + // metaphlan3 + run_metaphlan3 = false + + // kaiju + run_kaiju = false + kaiju_taxon_rank = 'species' + + // diamond + run_diamond = false + diamond_output_format = 'tsv' // TSV is only format with taxonomic information apparently + diamond_save_reads = false // this will override default diamond output format so no taxonomic profile is generated! added directly to module in profiling.nf + + // mOTUs + run_motus = false + + // krona + run_krona = false + krona_taxonomy_directory = null + + // profile standardisation + run_profile_standardisation = false + generate_biom_output = false } // Load base.config by default for all pipelines @@ -70,11 +155,11 @@ try { // Load nf-core/taxprofiler custom profiles from different institutions. // Warning: Uncomment only if a pipeline-specific instititutional config already exists on nf-core/configs! -// try { -// includeConfig "${params.custom_config_base}/pipeline/taxprofiler.config" -// } catch (Exception e) { -// System.err.println("WARNING: Could not load nf-core/config/taxprofiler profiles: ${params.custom_config_base}/pipeline/taxprofiler.config") -// } +try { + includeConfig "${params.custom_config_base}/pipeline/taxprofiler.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/config/taxprofiler profiles: ${params.custom_config_base}/pipeline/taxprofiler.config") +} profiles { @@ -138,12 +223,18 @@ profiles { executor.cpus = 16 executor.memory = 60.GB } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } + test_noprofiling { includeConfig 'conf/test_noprofiling.config' } + test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' } + test_nothing { includeConfig 'conf/test_nothing.config' } + test_motus { includeConfig 'conf/test_motus.config' } + test_pep { includeConfig 'conf/test_pep.config' } } // Load igenomes.config if required + if (!params.igenomes_ignore) { includeConfig 'conf/igenomes.config' } else { @@ -156,7 +247,7 @@ if (!params.igenomes_ignore) { // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. env { - PYTHONNOUSERSITE = 1 + PYTHONNOUSERSITE = '1' R_PROFILE_USER = "/.Rprofile" R_ENVIRON_USER = "/.Renviron" JULIA_DEPOT_PATH = "/usr/local/share/julia" diff --git a/nextflow_schema.json b/nextflow_schema.json index da76c3f..f88443f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,18 +10,26 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": ["input", "outdir", "databases"], "properties": { "input": { "type": "string", "format": "file-path", "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", + "pattern": "^\\S+\\.(csv|yaml|yml)$", "schema": "assets/schema_input.json", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).", + "description": "Path to comma-separated file containing information about the samples and libraries/runs.", + "help_text": "You will need to create a design file with information about the samples and libraries/runs you want to running in your pipeline run. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, + "databases": { + "type": "string", + "mimetype": "text/csv", + "format": "file-path", + "fa_icon": "fas fa-database", + "description": "Path to comma-separated file containing information about databases and profiling parameters for each taxonomic profiler", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/dev/usage#full-database-sheet).\n\nProfilers will only be executed if a corresponding database are supplied. \n\nWe recommend storing this database sheet somewhere centrally and accessible by others members of your lab/institutions, as this file will likely be regularly reused." + }, "outdir": { "type": "string", "format": "directory-path", @@ -42,43 +50,408 @@ } } }, - "reference_genome_options": { - "title": "Reference genome options", + "preprocessing_general_qc_options": { + "title": "Preprocessing general QC options", "type": "object", - "fa_icon": "fas fa-dna", - "description": "Reference genome related files and options required for the workflow.", + "description": "Common options across both long and short read preprocessing QC steps", + "default": "", "properties": { - "genome": { - "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." - }, - "fasta": { - "type": "string", - "format": "file-path", - "mimetype": "text/plain", - "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", - "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", - "fa_icon": "far fa-file-code" - }, - "igenomes_base": { - "type": "string", - "format": "directory-path", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true - }, - "igenomes_ignore": { + "save_preprocessed_reads": { "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + "fa_icon": "fas fa-save", + "description": "Save reads from adapter clipping/pair-merging, length filtering for both short and long reads", + "help_text": "This saves the FASTQ output from the following tools:\n\n- fastp\n- AdapterRemoval\n- Porechop\n- Filtlong\n\nThese reads will be a mixture of: adapter clipped, quality trimmed, pair-merged, and length filtered, depending on the parameters you set." } - } + }, + "fa_icon": "fas fa-users-cog" + }, + "preprocessing_short_read_qc_options": { + "title": "Preprocessing short-read QC options", + "type": "object", + "description": "Options for adapter clipping, quality trimming, pair-merging, and complexity filtering", + "default": "", + "properties": { + "perform_shortread_qc": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turns on short read quality control steps (adapter clipping, complexity filtering etc.)", + "help_text": "Turns on short read quality control steps (adapter clipping, complexity filtering etc.)\n\nThis subworkflow can perform:\n\n- Adapter removal\n- Read quality trimming\n- Read pair merging\n- Length filtering\n- Complexity filtering\n\nEither with fastp or AdapterRemoval.\n\nRemoving adapters (if present) is recommend to reduce false-postive hits that may occur from 'dirty' or 'contaminated' reference genomes in a profiling database that contain accidentially incorporated adapter sequences. Note that some, but not all, tools support paired-end alignment (utilising information about the insert covered by the pairs). However read pair merging in some cases can be recommend to increase read length (such as in aDNA). Length filtering, and/or complexity can speed up alignment by reducing the number of short unspecific reads that need to be aligned." + }, + "shortread_qc_tool": { + "type": "string", + "default": "fastp", + "enum": ["fastp", "adapterremoval"], + "fa_icon": "fas fa-tools", + "description": "Specify which tool to use for short-read QC" + }, + "shortread_qc_skipadaptertrim": { + "type": "boolean", + "fa_icon": "fas fa-forward", + "description": "Skip adapter trimming", + "help_text": "Skip the removal of sequencing adapters. \n\nThis often can be useful to speed up run-time of the pipeline when analysing data downloaded from public databases such as the ENA or SRA, as adapters should already be removed (however we recommend to check FastQC results to ensure this is the case)." + }, + "shortread_qc_adapter1": { + "type": "string", + "default": "None", + "fa_icon": "fas fa-grip-lines", + "description": "Specify adapter 1 nucleotide sequence", + "help_text": "Specify a custom forward or R1 adapter sequence to be removed from reads. \n\nIf not set, the selected short-read QC tool's defaults will be used.\n\n> Modifies tool parameter(s):\n> - fastp: `--adapter_sequence`. fastp default: `AGATCGGAAGAGCACACGTCTGAACTCCAGTCA`\n> - AdapterRemoval: `--adapter1`. AdapteRemoval2 default: `AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG`" + }, + "shortread_qc_adapter2": { + "type": "string", + "default": "None", + "fa_icon": "fas fa-grip-lines", + "description": "Specify adapter 2 nucleotide sequence", + "help_text": "Specify a custom reverse or R2 adapter sequence to be removed from reads. \n\nIf not set, the selected short-read QC tool's defaults will be used.\n\n> Modifies tool parameter(s):\n> - fastp: `--adapter_sequence`. fastp default: `AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT`\n> - AdapterRemoval: `--adapter1`. AdapteRemoval2 default: `AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT`" + }, + "shortread_qc_mergepairs": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on merging of read pairs for paired-end data", + "default": true, + "help_text": "Turn on the merging of read-pairs of paired-end short read sequencing data for AdapterRemoval (this is performed automatically with fastp).\n\n> Modifies tool parameter(s):\n> - AdapterRemoval: `--collapse`\n" + }, + "shortread_qc_excludeunmerged": { + "type": "boolean", + "fa_icon": "far fa-times-circle", + "description": "Discard unmerged reads from paired-end merging", + "help_text": "Turns off the inclusion of unmerged reads in resulting processing FASTQ file of paired-end sequencing data when using `fastp`.\n\nThis can be useful in cases where you prefer to have very short reads (e.g. aDNA), thus excluding longer-reads or possibly faulty reads where one of the pair was discarded.\n\n> Modifies tool parameter(s):\n> - removed from reads `--include_unmerged`\n" + }, + "shortread_qc_minlength": { + "type": "integer", + "default": 15, + "fa_icon": "fas fa-ruler-horizontal", + "description": "Specify the minimum length of reads to be retained", + "help_text": "Specifying a mimum read length filtering can speed up profiling by reducing the number of short unspecific reads that need to be match/aligned to the database.\n\n> Modifies tool parameter(s):\n> - removed from reads `--length_required`\n> - AdapterRemoval: `--minlength`" + }, + "perform_shortread_complexityfilter": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turns on nucleotide sequence complexity filtering", + "help_text": "Turns on sequencing complexity filtering. Complexity filtering can be useful to increase run-time by removing unspecific read sequences that do not provide any informative taxon ID." + }, + "shortread_complexityfilter_tool": { + "type": "string", + "default": "bbduk", + "enum": ["bbduk", "prinseqplusplus", "fastp"], + "fa_icon": "fas fa-hammer", + "description": "Specify which tool to use for complexity filtering" + }, + "shortread_complexityfilter_entropy": { + "type": "number", + "default": 0.3, + "fa_icon": "fas fa-random", + "description": "Specify the minimum sequence entropy level for complexity filtering", + "help_text": "Specify the minimum 'entropy' value for complexity filtering for BBDuk or PRINSEQ++.\n\nNote that this value will only be used for PRINSEQ++ if `--shortread_complexityfilter_prinseqplusplus_mode` is set to `entropy`.\n\nEntropy here corresponds to the amount of sequence variation exists within the read. Higher values correspond to more variety, and thus will likely reslut in more specific matching to a taxon's reference genome. The trade off here is fewer reads (or abundance information) available for having a confident identification.\n\n\n> Modifies tool parameter(s):\n> - BBDuk: `entropy=`\n> - PRINSEQ++: `-lc_entropy`\n\n" + }, + "shortread_complexityfilter_bbduk_windowsize": { + "type": "integer", + "default": 50, + "fa_icon": "far fa-window-maximize", + "description": "Specify the window size for BBDuk complexity filtering", + "help_text": "Specify the window size to calculate the level entropy within for BBDuk.\n\n> Modifies tool parameter(s):\n> - BBDuk: `entropywindow=`" + }, + "shortread_complexityfilter_bbduk_mask": { + "type": "boolean", + "fa_icon": "fas fa-mask", + "description": "Turn on masking rather than discarding of low complexity reads for BBduk", + "help_text": "Turn on masking of low-complexity reads (i.e., replacement with `N`) rather than removal.\n\n> Modifies tool parameter(s)\n> - BBDuk: `entropymask=`" + }, + "shortread_complexityfilter_fastp_threshold": { + "type": "integer", + "default": 30, + "fa_icon": "fas fa-sort-numeric-down", + "description": "Specify the minimum complexity filter threshold of fastp", + "help_text": "Specify the minimum sequence complexity value for fastp. This value corresponds to the percentage of bases that is different from it's adjacent bases.\n\n> Modifies tool parameter(s):\n> - removed from reads `--complexity_threshold`" + }, + "shortread_complexityfilter_prinseqplusplus_mode": { + "type": "string", + "default": "entropy", + "enum": ["entropy", "dust"], + "fa_icon": "fas fa-check-square", + "description": "Specify the complexity filter mode for PRINSEQ++" + }, + "shortread_complexityfilter_prinseqplusplus_dustscore": { + "type": "number", + "default": 0.5, + "fa_icon": "fas fa-head-side-mask", + "description": "Specify the minimum dust score for PRINTSEQ++ complexity filtering", + "help_text": "Specify the minimum dust score below which low-complexity reads will be removed. A DUST score is based on how often different tri-nucleotides occur along a read.\n\n> Modifies tool parameter(s):\n> - PRINSEQ++: `--lc_dust`" + }, + "save_complexityfiltered_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Save complexity filtered short-reads", + "help_text": "Specify whether to save the final complexity filtered reads in your results directory (`--outdir`)." + } + }, + "fa_icon": "fas fa-compress-alt" + }, + "preprocessing_long_read_qc_options": { + "title": "Preprocessing long-read QC options", + "type": "object", + "description": "Options for adapter clipping, quality trimming, and length filtering", + "default": "", + "properties": { + "perform_longread_qc": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turns on long read quality control steps (adapter clipping, length filtering etc.)", + "help_text": "Turns on long read quality control steps (adapter clipping, length and/or quality filtering.)\n\nRemoving adapters (if present) is recommend to reduce false-postive hits that may occur from 'dirty' or 'contaminated' reference genomes in a profiling database that contain accidentially incorporated adapter sequences.\n\nLength filtering, and quality filtering can speed up alignment by reducing the number of unspecific reads that need to be aligned." + }, + "longread_qc_skipadaptertrim": { + "type": "boolean", + "description": "Skip long-read trimming", + "fa_icon": "fas fa-forward", + "help_text": "Skip removal of adapters by Porechop. This can be useful in some cases to speed up run time - particularly when you are running data downloading from public databases such as the ENA/SRA that should already have adapters removed. We recommend that you check your FastQC results this is indeed the case." + }, + "longread_qc_skipqualityfilter": { + "type": "boolean", + "description": "Skip long-read length and quality filtering", + "fa_icon": "fas fa-forward", + "help_text": "Skip removal of quality filtering with Filtlong. This will skip length, percent reads, and target bases filtering (see other `--longread_qc_qualityfilter_*` parameters)." + }, + "longread_qc_qualityfilter_minlength": { + "type": "integer", + "default": 1000, + "description": "Specify the minimum length of reads to be retained", + "fa_icon": "fas fa-ruler-horizontal", + "help_text": "Specify the minimum of length of reads to be kept for downstream analysis.\n\n> Modifies tool parameter(s):\n> - Filtlong: `--min_length`" + }, + "longread_qc_qualityfilter_keeppercent": { + "type": "integer", + "default": 90, + "description": "Specify the percent of high-quality bases to be retained", + "fa_icon": "fas fa-percentage", + "help_text": "Throw out the remaining percentage of reads outside the value. This is measured by bp, not by read count. So this option throws out the worst e.g. 10% of read bases if the parameter is set to `90`. _Modified from [Filtlong documentation](https://github.com/rrwick/Filtlong)_\n\n> Modifies tool parameter(s):\n> - Filtlong: `--keep_percent`" + }, + "longread_qc_qualityfilter_targetbases": { + "type": "integer", + "default": 500000000, + "description": "Specify the number of high-quality bases in the library to be retained", + "fa_icon": "fas fa-bullseye", + "help_text": "Removes the worst reads until only the specified value of bases remain, useful for very large read sets. If the input read set is less than the specified value, this setting will have no effect. _Modified from [Filtlong documentation](https://github.com/rrwick/Filtlong)_\n\n> Modifies tool parameter(s):\n> - Filtlong: `--keep_percent`" + } + }, + "fa_icon": "fas fa-expand-alt" + }, + "preprocessing_host_removal_options": { + "title": "Preprocessing host removal options", + "type": "object", + "description": "Options for pre-profiling host read removal", + "default": "", + "properties": { + "perform_shortread_hostremoval": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on short-read host removal", + "help_text": "Turns on the ability to remove short-reads from the that derived from a known organism, using Bowtie2 and samtools\n\nThis subworkflow is useful to remove reads that may come from a host, or a known contamination like the human reference genome. Human DNA contamination of (microbial) reference genomes is well known, so removal of these prior profiling both reduces the risks of false positives, and in _some cases_ a faster runtime (as less reads need to be profiled).\n\nAlternatively, you can include the reference genome within your profiling databases and can turn off this subworkflow, with the trade off of a larger taxonomic profiling database." + }, + "perform_longread_hostremoval": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on long-read host removal", + "help_text": "Turns on the ability to remove long-reads from the that derived from a known organism, using minimap2 and samtools\n\nThis subworkflow is useful to remove reads that may come from a host, or a known contamination like the human reference genome. Human DNA contamination of (microbial) reference genomes is well known, so removal of these prior profiling both reduces the risks of false positives, and in _some cases_ a faster runtime (as less reads need to be profiled).\n\nAlternatively, you can include the reference genome within your profiling databases and can turn off this subworkflow, with the trade off of a larger taxonomic profiling database." + }, + "hostremoval_reference": { + "type": "string", + "default": "None", + "fa_icon": "fas fa-file-alt", + "description": "Specify path to single reference FASTA of host(s) genome(s)", + "help_text": "Specify a path to the FASTA file (optionally gzipped) of the reference genome of the organism to be removed.\n\nIf you have two or more host organisms or contaminants you wish to remove, you can concatenate the FASTAs of the different taxa into a single one to provide to the pipeline." + }, + "shortread_hostremoval_index": { + "type": "string", + "default": "None", + "fa_icon": "fas fa-address-book", + "description": "Specify path to the directory containing pre-made BowTie2 indexes of the host removal reference", + "help_text": "Specify the path to a _directory_ containing pre-made Bowtie2 reference index files (i.e. the directory containing `.bt1`, `.bt2` files etc.). These should sit in the same directory alongside the the reference file specified in `--hostremoval_reference`.\n\nSpecifying premade indices can speed up runtime of the host-removal step, however if not supplied the pipeline will generate the indices for you." + }, + "longread_hostremoval_index": { + "type": "string", + "default": "None", + "fa_icon": "fas fa-address-book", + "description": "Specify path to a pre-made Minimap2 index file (.mmi) of the host removal reference", + "help_text": "Specify path to a pre-made Minimap2 index file (.mmi) of the host removal reference file given to `--hostremoval_reference`.\n\nSpecifying a premade index file can speed up runtime of the host-removal step, however if not supplied the pipeline will generate the indices for you." + }, + "save_hostremoval_index": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Save mapping index of input reference when not already supplied by user", + "help_text": "Save the output files of the in-built indexing of the host genome.\n\nThis is recommend to be turned on if you plan to use the same reference genome multiple times, as supplying the directory or file to `--shortread_hostremoval_index` or `--longread_hostremoval_index` respectively can speed up runtime of future runs. Once generated, we recommend you place this file _outside_ of your run results directory in a central 'cache' directory you and others using your machine can access and supply to the pipeline." + }, + "save_hostremoval_mapped": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Save mapped reads in BAM format from host removal", + "help_text": "Save the reads mapped to the reference genome in BAM format as output by the respective hostremoval alignment tool.\n\nThis can be useful if you wish to perform other analyses on the host organism (such as host-microbe interaction), however, you should consider whether the default mapping parameters of Bowtie2 (short-read) or minimap2 (long-read) are optimised to your context. " + }, + "save_hostremoval_unmapped": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Save unmapped reads in FASTQ format from host removal", + "help_text": "Save the unreads mapped to the reference genome in FASTQ format (as exported from `samtools view`).\n\nThis can be useful if you wish to perform other analyses on the off-target reads from the host mapping, such as manual profiling or _de novo_ assembly." + } + }, + "fa_icon": "fas fa-user-times" + }, + "preprocessing_run_merging_options": { + "title": "Preprocessing run merging options", + "type": "object", + "description": "Options for per-sample run-merging", + "default": "", + "properties": { + "perform_runmerging": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on run merging", + "help_text": "Turns on the concatenation of sequencing runs or libraries with the same sample name.\n\nThis can be useful to ensure you get a single profile per sample, rather than one profile per run or library. Note that in some cases comparing profiles of independent _libraries_ may be useful, so this parameter may not always be suitable. " + }, + "save_runmerged_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Save run-concatenated input FASTQ files for each sample", + "help_text": "Save the run- and library-concatenated reads of a given sample in FASTQ format." + } + }, + "fa_icon": "fas fa-clipboard-check" + }, + "profiling_options": { + "title": "Profiling options", + "type": "object", + "description": "", + "default": "", + "properties": { + "run_centrifuge": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with Centrifuge. Requires database to be present CSV file passed to --databases" + }, + "centrifuge_save_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of Centrifuge-aligned reads", + "help_text": "Save mapped (SAM, FASTQ) and unmapped (FASTQ) reads from alignment step of centrifuge in your output results directory.\n\n> Modifies tool parameter(s):\n> - centrifuge: `--un-gz`, `--al-gz`, `--un-conc-gz`, `--al-conc-gz`, `--out-fmt`" + }, + "run_diamond": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with DIAMOND. Requires database to be present CSV file passed to --databases" + }, + "diamond_output_format": { + "type": "string", + "default": "tsv", + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], + "fa_icon": "fas fa-file", + "description": "Specify output format from DIAMOND profiling.", + "help_text": "DIAMOND can produce output in a number of different formats, you can specify here which to produce.\n\nNote that DIAMOND can only produce one format at a time, and depending on which you pick, some downstream steps may not be executed. For example, selecting `daa` or `sam` will mean you will not get a tabular taxonomic profile as with the other tools.\n\nWill be overriden by `--diamond_save_reads.`\n\n> Modifies tool parameter(s):\n> - diamond blastx: `--outfmt`" + }, + "diamond_save_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of DIAMOND-aligned reads. Will override --diamond_output_format and no taxon tables will be generated", + "help_text": "Save aligned reads in SAM format from alignment step of DIAMOND in your output results directory.\n\nNote this explicitly overrides `--diamond_output_format` to produce the SAM file, and no taxon table will be generated.\n\n> Modifies tool parameter(s):\n> - DIAMOND: `--outfmt`" + }, + "run_kaiju": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with Kaiju. Requires database to be present CSV file passed to --databases" + }, + "kaiju_taxon_rank": { + "type": "string", + "default": "species", + "enum": ["phylum", "class", "order", "family", "genus", "species"], + "fa_icon": "fas fa-tag", + "description": "Specify taxonomic rank to be displayed in Kaiju taxon table", + "help_text": "Specify the taxonomic level(s) to be displayed in the resulting Kaiju taxon table, as generated by the kaiju2table helper tool.\n\nThis can be either a single level (e.g. `species`), or a comma separated list to display the full taxonomic path (e.g. `superkingdom,phylum,class,order,family,genus,species.`).\n\n> Modifies tool parameter(s):\n> - kaiju2table: `-l`" + }, + "run_kraken2": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with Kraken2. Requires database to be present CSV file passed to --databases" + }, + "kraken2_save_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of Kraken2-aligned reads", + "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - kraken2: `--classified-out` and `--unclassified-out`" + }, + "kraken2_save_readclassification": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of Kraken2 per-read taxonomic assignment file", + "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - kraken2: `--output`" + }, + "run_malt": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with MALT. Requires database to be present CSV file passed to --databases" + }, + "malt_mode": { + "type": "string", + "default": "BlastN", + "fa_icon": "fas fa-check-square", + "description": "Specify which MALT alignment mode to use", + "help_text": "Specify which version of MALT alignment to use.\n\nBlastN is generally recommended (nucleotide-nucleotide alignment), but particularly for very short reads (such as aDNA), whereas BlastX mode is similar to DIAMOND and will translate the nucleotide to amino acid sequences. Note each type of alignment mode requires different parameters during database construction. Refer to the MALT manual for more information.\n\n> Modifies tool parameter(s):\n> - malt-run: `-mode` " + }, + "malt_save_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of MALT-aligned reads", + "help_text": "Turns on saving of MALT aligned reads in SAM format.\n\nNote that the SAM format produce by MALT is not completely valid, and may not work with downstream tools.\n\n> Modifies tool parameter(s):\n> - malt-run: `--alignments`, `-za`" + }, + "malt_generate_megansummary": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on generation of MEGAN summary file from MALT results", + "help_text": "Turns on saving of MALT output in an additional MEGAN summary file (`.megan`) that can be loaded into the MEGAN metagenomic exploration tool.\n\nNote: this file is generated not directly from MALT but rather then MEGAN utility script `rma2info`.\n\n> Modifies tool parameter(s):\n> - rma2info: `-es`" + }, + "run_metaphlan3": { + "type": "boolean", + "description": "Turn on profiling with MetaPhlAn3. Requires database to be present CSV file passed to --databases", + "fa_icon": "fas fa-toggle-on" + }, + "run_motus": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with mOTUs. Requires database to be present CSV file passed to --databases" + } + }, + "fa_icon": "fas fa-align-center" + }, + "postprocessing_and_visualisation_options": { + "title": "Postprocessing and visualisation options", + "type": "object", + "description": "", + "default": "", + "properties": { + "run_profile_standardisation": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on standardisation of taxon tables across profilers", + "help_text": "Turns on standardisation of output OTU tables across all tools; each into a TSV format following the following scheme:\n\n|TAXON | SAMPLE_A | SAMPLE_B |\n|-------------|----------------|-----------------|\n| taxon_a | 32 | 123 |\n| taxon_b | 1 | 5 |\n\nThis currently only is generated for mOTUs." + }, + "generate_biom_output": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on generation of BIOM output (currently only applies to mOTUs)", + "help_text": "Turn on the saving of the taxonomic output in BIOM format (`.biom`) in the results directory of your pipeline run, instead of the default TSV format.\n\nNote this file is from the output of the `motus merge` command.\n\n> Modifies tool parameter(s):\n> - `-B -o`" + }, + "run_krona": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on generation of Krona plots for supported profilers", + "help_text": "Turn on the generation of Krona interactive pie-chart HTMLs for a selection of profilers.\n\nThe tools currently supported are:\n\n- centrifuge\n- kraken2\n- kaiju\n- MALT" + }, + "krona_taxonomy_directory": { + "type": "string", + "default": "None", + "fa_icon": "fas fa-folder-open", + "description": "Specify path to krona taxonomy directories (required for MALT krona plots)", + "help_text": "Specify a path to a Krona taxonomy database directory (i.e. a directory containing a krona generated `.tab` file).\n\nThis is only required for generating Krona plots of MALT output.\n\nNote this taxonomy database must be downloaded and generated with the `updateTaxonomy.sh` script from the krona-tools package." + } + }, + "fa_icon": "fas fa-chart-line" }, "institutional_config_options": { "title": "Institutional config options", @@ -265,6 +638,36 @@ "fa_icon": "fas fa-bacon" } } + }, + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Reference genome related files and options required for the workflow.", + "properties": { + "genome": { + "type": "string", + "description": "Name of iGenomes reference.", + "fa_icon": "fas fa-book", + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details.", + "hidden": true + }, + "igenomes_base": { + "type": "string", + "format": "directory-path", + "description": "Directory / URL base for iGenomes references.", + "default": "s3://ngi-igenomes/igenomes", + "fa_icon": "fas fa-cloud-download-alt", + "hidden": true + }, + "igenomes_ignore": { + "type": "boolean", + "description": "Do not load the iGenomes reference config.", + "fa_icon": "fas fa-ban", + "hidden": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + } + } } }, "allOf": [ @@ -272,7 +675,25 @@ "$ref": "#/definitions/input_output_options" }, { - "$ref": "#/definitions/reference_genome_options" + "$ref": "#/definitions/preprocessing_general_qc_options" + }, + { + "$ref": "#/definitions/preprocessing_short_read_qc_options" + }, + { + "$ref": "#/definitions/preprocessing_long_read_qc_options" + }, + { + "$ref": "#/definitions/preprocessing_host_removal_options" + }, + { + "$ref": "#/definitions/preprocessing_run_merging_options" + }, + { + "$ref": "#/definitions/profiling_options" + }, + { + "$ref": "#/definitions/postprocessing_and_visualisation_options" }, { "$ref": "#/definitions/institutional_config_options" @@ -282,6 +703,9 @@ }, { "$ref": "#/definitions/generic_options" + }, + { + "$ref": "#/definitions/reference_genome_options" } ] } diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf new file mode 100644 index 0000000..7b440c6 --- /dev/null +++ b/subworkflows/local/db_check.nf @@ -0,0 +1,53 @@ +// +// Check input samplesheet and get read channels +// + +include { DATABASE_CHECK } from '../../modules/local/database_check' +include { UNTAR } from '../../modules/nf-core/modules/untar/main' + +workflow DB_CHECK { + take: + dbsheet // file: /path/to/dbsheet.csv + + main: + + // TODO: make database sheet check + // Checks: + // 1) no duplicates, + // 2) args do not have quotes, e.g. just `,,` and NOT `,"",` + parsed_samplesheet = DATABASE_CHECK ( dbsheet ) + .csv + .splitCsv ( header:true, sep:',' ) + .map { create_db_channels(it) } + + ch_dbs_for_untar = parsed_samplesheet + .branch { + untar: it[1].toString().endsWith(".tar.gz") + skip: true + } + + // TODO Filter to only run UNTAR on DBs of tools actually using? + // TODO make optional whether to save + UNTAR ( ch_dbs_for_untar.untar ) + + ch_final_dbs = ch_dbs_for_untar.skip.mix( UNTAR.out.untar ) + + emit: + dbs = ch_final_dbs // channel: [ val(meta), [ db ] ] + versions = DATABASE_CHECK.out.versions.mix(UNTAR.out.versions.first()) // channel: [ versions.yml ] +} + +def create_db_channels(LinkedHashMap row) { + def meta = [:] + meta.tool = row.tool + meta.db_name = row.db_name + meta.db_params = row.db_params + + def array = [] + if (!file(row.db_path, type: 'dir').exists()) { + exit 1, "ERROR: Please check input samplesheet -> database could not be found!\n${row.db_path}" + } + array = [ meta, file(row.db_path) ] + + return array +} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 0aecf87..e8d5e7a 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -2,30 +2,73 @@ // Check input samplesheet and get read channels // -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' +include { EIDO_VALIDATE } from '../../modules/nf-core/modules/eido/validate/main' +include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main' workflow INPUT_CHECK { take: - samplesheet // file: /path/to/samplesheet.csv + samplesheet_or_pep_config // file: /path/to/samplesheet.csv or /path/to/pep/config.yaml + pep_input_base_dir main: - SAMPLESHEET_CHECK ( samplesheet ) - .csv + ch_versions = Channel.empty() + + EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir ) + ch_versions = ch_versions.mix(EIDO_VALIDATE.out.versions) + + EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir ) + ch_versions = ch_versions.mix(EIDO_CONVERT.out.versions) + + ch_parsed_samplesheet = EIDO_CONVERT.out.samplesheet_converted .splitCsv ( header:true, sep:',' ) + .map { check_missing_and_singleend_autodetect(it) } + .branch { + fasta: it['fasta'] != '' + nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE' + fastq: true + } + + ch_parsed_samplesheet.fastq .map { create_fastq_channel(it) } - .set { reads } + .set { fastq } + + ch_parsed_samplesheet.nanopore + .map { create_fastq_channel(it) } + .set { nanopore } + + ch_parsed_samplesheet.fasta + .map { create_fasta_channel(it) } + .set { fasta } emit: - reads // channel: [ val(meta), [ reads ] ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] + fastq = fastq ?: [] // channel: [ val(meta), [ reads ] ] + nanopore = nanopore ?: [] // channel: [ val(meta), [ reads ] ] + fasta = fasta ?: [] // channel: [ val(meta), fasta ] + versions = ch_versions // channel: [ versions.yml ] +} + +// Function to validate input sheet and auto-detect R1/R2 +def check_missing_and_singleend_autodetect(LinkedHashMap row) { + + // Checks not supported by EIDO(?) + if ( ( row['fastq_1'] != "" || row['fastq_2'] != "" ) && row['fasta'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: FastQ and FastA files cannot be specified together in the same library. Check input samplesheet! Check sample: ${row['sample']}" } + if ( row['fastq_1'] == "" && row['fastq_2'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: Input samplesheet has a missing fastq_1 when fastq_2 is specified. Check sample: ${row['sample']}" } + + single_end = row['fastq_2'] == "" ? true : false + row['single_end'] = single_end + + return row } // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] def create_fastq_channel(LinkedHashMap row) { // create meta map def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() + meta.id = row.sample + meta.run_accession = row.run_accession + meta.instrument_platform = row.instrument_platform + meta.single_end = row.single_end.toBoolean() + meta.is_fasta = false // add path(s) of the fastq file(s) to the meta map def fastq_meta = [] @@ -35,10 +78,35 @@ def create_fastq_channel(LinkedHashMap row) { if (meta.single_end) { fastq_meta = [ meta, [ file(row.fastq_1) ] ] } else { - if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" + if (meta.instrument_platform == 'OXFORD_NANOPORE') { + if (row.fastq_2 != '') { + exit 1, "ERROR: Please check input samplesheet -> For Oxford Nanopore reads Read 2 FastQ should be empty!\n${row.fastq_2}" + } + fastq_meta = [ meta, [ file(row.fastq_1) ] ] + } else { + if (!file(row.fastq_2).exists()) { + exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" + } + fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] + } return fastq_meta + +}// Function to get list of [ meta, fasta ] +def create_fasta_channel(LinkedHashMap row) { + def meta = [:] + meta.id = row.sample + meta.run_accession = row.run_accession + meta.instrument_platform = row.instrument_platform + meta.single_end = true + meta.is_fasta = true + + def array = [] + if (!file(row.fasta).exists()) { + exit 1, "ERROR: Please check input samplesheet -> FastA file does not exist!\n${row.fasta}" + } + array = [ meta, [ file(row.fasta) ] ] + + return array } diff --git a/subworkflows/local/longread_hostremoval.nf b/subworkflows/local/longread_hostremoval.nf new file mode 100644 index 0000000..7db020b --- /dev/null +++ b/subworkflows/local/longread_hostremoval.nf @@ -0,0 +1,47 @@ +// +// Remove host reads via alignment and export off-target reads +// + +include { MINIMAP2_INDEX } from '../../modules/nf-core/modules/minimap2/index/main' +include { MINIMAP2_ALIGN } from '../../modules/nf-core/modules/minimap2/align/main' +include { SAMTOOLS_VIEW } from '../../modules/nf-core/modules/samtools/view/main' +include { SAMTOOLS_BAM2FQ } from '../../modules/nf-core/modules/samtools/bam2fq/main' + +workflow LONGREAD_HOSTREMOVAL { + take: + reads // [ [ meta ], [ reads ] ] + reference // /path/to/fasta + index // /path/to/index + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( !params.longread_hostremoval_index ) { + ch_minimap2_index = MINIMAP2_INDEX ( reference ).index + ch_versions = ch_versions.mix( MINIMAP2_INDEX.out.versions ) + } else { + ch_minimap2_index = index + } + + MINIMAP2_ALIGN ( reads, ch_minimap2_index, true, false, false ) + ch_versions = ch_versions.mix( MINIMAP2_ALIGN.out.versions.first() ) + ch_minimap2_mapped = MINIMAP2_ALIGN.out.bam + .map { + meta, reads -> + [ meta, reads, [] ] + } + + + SAMTOOLS_VIEW ( ch_minimap2_mapped , [] ) + ch_versions = ch_versions.mix( SAMTOOLS_VIEW.out.versions.first() ) + + SAMTOOLS_BAM2FQ ( SAMTOOLS_VIEW.out.bam, false ) + ch_versions = ch_versions.mix( SAMTOOLS_BAM2FQ.out.versions.first() ) + + + emit: + reads = SAMTOOLS_BAM2FQ.out.reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] +} + diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf new file mode 100644 index 0000000..c04207e --- /dev/null +++ b/subworkflows/local/longread_preprocessing.nf @@ -0,0 +1,63 @@ +// +// Process long raw reads with porechop +// + +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main' +include { PORECHOP } from '../../modules/nf-core/modules/porechop/main' +include { FILTLONG } from '../../modules/nf-core/modules/filtlong/main' + +workflow LONGREAD_PREPROCESSING { + take: + reads + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( !params.longread_qc_skipadaptertrim && params.longread_qc_skipqualityfilter) { + PORECHOP ( reads ) + + ch_processed_reads = PORECHOP.out.reads + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + + ch_versions = ch_versions.mix(PORECHOP.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( PORECHOP.out.log ) + + } else if ( params.longread_qc_skipadaptertrim && !params.longread_qc_skipqualityfilter) { + + ch_processed_reads = FILTLONG ( reads.map{ meta, reads -> [meta, [], reads ]} ) + ch_versions = ch_versions.mix(FILTLONG.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log ) + + } else { + PORECHOP ( reads ) + ch_clipped_reads = PORECHOP.out.reads + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + + ch_processed_reads = FILTLONG ( ch_clipped_reads.map{ meta, reads -> [meta, [], reads ]} ).reads + + ch_versions = ch_versions.mix(PORECHOP.out.versions.first()) + ch_versions = ch_versions.mix(FILTLONG.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( PORECHOP.out.log ) + ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log ) + } + + FASTQC_PROCESSED ( ch_processed_reads ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf new file mode 100644 index 0000000..c2ef508 --- /dev/null +++ b/subworkflows/local/profiling.nf @@ -0,0 +1,237 @@ +// +// Run profiling +// + +include { MALT_RUN } from '../../modules/nf-core/modules/malt/run/main' +include { MEGAN_RMA2INFO as MEGAN_RMA2INFO_TSV } from '../../modules/nf-core/modules/megan/rma2info/main' +include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/modules/kraken2/kraken2/main' +include { CENTRIFUGE_CENTRIFUGE } from '../../modules/nf-core/modules/centrifuge/centrifuge/main' +include { CENTRIFUGE_KREPORT } from '../../modules/nf-core/modules/centrifuge/kreport/main' +include { METAPHLAN3_METAPHLAN3 } from '../../modules/nf-core/modules/metaphlan3/metaphlan3/main' +include { KAIJU_KAIJU } from '../../modules/nf-core/modules/kaiju/kaiju/main' +include { DIAMOND_BLASTX } from '../../modules/nf-core/modules/diamond/blastx/main' +include { MOTUS_PROFILE } from '../../modules/nf-core/modules/motus/profile/main' + +workflow PROFILING { + take: + reads // [ [ meta ], [ reads ] ] + databases // [ [ meta ], path ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + ch_raw_classifications = Channel.empty() + ch_raw_profiles = Channel.empty() + +/* + COMBINE READS WITH POSSIBLE DATABASES + */ + + // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] + ch_input_for_profiling = reads + .map { + meta, reads -> + def meta_new = meta.clone() + pairtype = meta_new['single_end'] ? '_se' : '_pe' + meta_new['id'] = meta_new['id'] + pairtype + [meta_new, reads] + } + .combine(databases) + .branch { + malt: it[2]['tool'] == 'malt' + kraken2: it[2]['tool'] == 'kraken2' + metaphlan3: it[2]['tool'] == 'metaphlan3' + centrifuge: it[2]['tool'] == 'centrifuge' + kaiju: it[2]['tool'] == 'kaiju' + diamond: it[2]['tool'] == 'diamond' + motus: it[2]['tool'] == 'motus' + unknown: true + } + + /* + PREPARE PROFILER INPUT CHANNELS & RUN PROFILING + */ + + // Each tool as a slightly different input structure and generally separate + // input channels for reads vs databases. We restructure the channel tuple + // for each tool and make liberal use of multiMap to keep reads/databases + // channel element order in sync with each other + + if ( params.run_malt ) { + + + // MALT: We groupTuple to have all samples in one channel for MALT as database + // loading takes a long time, so we only want to run it once per database + ch_input_for_malt = ch_input_for_profiling.malt + .filter { it[0]['instrument_platform'] == 'ILLUMINA' } + .map { + meta, reads, db_meta, db -> + + // Reset entire input meta for MALT to just database name, + // as we don't run run on a per-sample basis due to huge datbaases + // so all samples are in one run and so sample-specific metadata + // unnecessary. Set as database name to prevent `null` job ID and prefix. + def temp_meta = [ id: meta['db_name'] ] + + // Extend database parameters to specify whether to save alignments or not + def new_db_meta = db_meta.clone() + def sam_format = params.malt_save_reads ? ' --alignments ./ -za false' : "" + new_db_meta['db_params'] = db_meta['db_params'] + sam_format + + // Combine reduced sample metadata with updated database parameters metadata, + // make sure id is db_name for publishing purposes. + def new_meta = temp_meta + new_db_meta + new_meta['id'] = new_meta['db_name'] + + [ new_meta, reads, db ] + + } + .groupTuple(by: [0,2]) + .multiMap { + it -> + reads: [ it[0], it[1].flatten() ] + db: it[2] + } + + MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) + + ch_maltrun_for_megan = MALT_RUN.out.rma6 + .transpose() + .map{ + meta, rma -> + // re-extract meta from file names, use filename without rma to + // ensure we keep paired-end information in downstream filenames + // when no pair-merging + def meta_new = meta.clone() + meta_new['db_name'] = meta.id + meta_new['id'] = rma.baseName + [ meta_new, rma ] + } + + MEGAN_RMA2INFO_TSV (ch_maltrun_for_megan, params.malt_generate_megansummary ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log ) + ch_versions = ch_versions.mix( MALT_RUN.out.versions.first(), MEGAN_RMA2INFO_TSV.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( ch_maltrun_for_megan ) + ch_raw_profiles = ch_raw_profiles.mix( MEGAN_RMA2INFO_TSV.out.txt ) + + } + + if ( params.run_kraken2 ) { + + ch_input_for_kraken2 = ch_input_for_profiling.kraken2 + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db, params.kraken2_save_reads, params.kraken2_save_readclassification ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report ) + ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment ) + ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.report ) + + } + + if ( params.run_centrifuge ) { + + ch_input_for_centrifuge = ch_input_for_profiling.centrifuge + .filter{ + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge currently does not accept FASTA files as input. Skipping Centrifuge for sample ${it[0].id}." + !it[0].is_fasta + } + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_reads, params.centrifuge_save_reads, params.centrifuge_save_reads ) + CENTRIFUGE_KREPORT (CENTRIFUGE_CENTRIFUGE.out.report, ch_input_for_centrifuge.db) + ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( CENTRIFUGE_CENTRIFUGE.out.results ) + ch_raw_profiles = ch_raw_profiles.mix( CENTRIFUGE_KREPORT.out.kreport ) + ch_multiqc_files = ch_multiqc_files.mix( CENTRIFUGE_KREPORT.out.kreport ) + + } + + if ( params.run_metaphlan3 ) { + + ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 + .filter{ + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 currently does not accept FASTA files as input. Skipping MetaPhlAn3 for sample ${it[0].id}." + !it[0].is_fasta + } + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + + METAPHLAN3_METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) + ch_versions = ch_versions.mix( METAPHLAN3_METAPHLAN3.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN3_METAPHLAN3.out.profile ) + + } + + if ( params.run_kaiju ) { + + ch_input_for_kaiju = ch_input_for_profiling.kaiju + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + + KAIJU_KAIJU ( ch_input_for_kaiju.reads, ch_input_for_kaiju.db) + ch_versions = ch_versions.mix( KAIJU_KAIJU.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( KAIJU_KAIJU.out.results ) + + } + + if ( params.run_diamond ) { + + ch_input_for_diamond = ch_input_for_profiling.diamond + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + + // diamond only accepts single output file specification, therefore + // this will replace output file! + ch_diamond_reads_format = params.diamond_save_reads ? 'sam' : params.diamond_output_format + + DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, ch_diamond_reads_format , [] ) + ch_versions = ch_versions.mix( DIAMOND_BLASTX.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( DIAMOND_BLASTX.out.tsv ) + ch_multiqc_files = ch_multiqc_files.mix( DIAMOND_BLASTX.out.log ) + + } + + if ( params.run_motus ) { + + ch_input_for_motus = ch_input_for_profiling.motus + .filter{ + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] mOTUs currently does not accept FASTA files as input. Skipping mOTUs for sample ${it[0].id}." + !it[0].is_fasta + } + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + + MOTUS_PROFILE ( ch_input_for_motus.reads, ch_input_for_motus.db ) + ch_versions = ch_versions.mix( MOTUS_PROFILE.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( MOTUS_PROFILE.out.out ) + ch_multiqc_files = ch_multiqc_files.mix( MOTUS_PROFILE.out.log ) + } + + emit: + classifications = ch_raw_classifications + profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom + versions = ch_versions // channel: [ versions.yml ] + motus_version = params.run_motus ? MOTUS_PROFILE.out.versions.first() : Channel.empty() + mqc = ch_multiqc_files +} diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf new file mode 100644 index 0000000..e491423 --- /dev/null +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -0,0 +1,98 @@ +// +// Process short raw reads with AdapterRemoval +// + +include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE } from '../../modules/nf-core/modules/adapterremoval/main' +include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED } from '../../modules/nf-core/modules/adapterremoval/main' +include { CAT_FASTQ } from '../../modules/nf-core/modules/cat/fastq/main' + +workflow SHORTREAD_ADAPTERREMOVAL { + + take: + reads // [[meta], [reads]] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + ch_input_for_adapterremoval = reads + .branch{ + single: it[0].single_end + paired: !it[0].single_end + } + + ADAPTERREMOVAL_SINGLE ( ch_input_for_adapterremoval.single, [] ) + ADAPTERREMOVAL_PAIRED ( ch_input_for_adapterremoval.paired, [] ) + + /* + * Due to the ~slightly~ very ugly output implementation of the current AdapterRemoval2 version, each file + * has to be exported in a separate channel and we must manually recombine when necessary. + */ + + if ( params.shortread_qc_mergepairs && !params.shortread_qc_excludeunmerged ) { + + ch_concat_fastq = Channel.empty() + .mix( + ADAPTERREMOVAL_PAIRED.out.collapsed, + ADAPTERREMOVAL_PAIRED.out.collapsed_truncated, + ADAPTERREMOVAL_PAIRED.out.singles_truncated, + ADAPTERREMOVAL_PAIRED.out.paired_truncated + ) + .map { meta, reads -> + def meta_new = meta.clone() + meta_new.single_end = true + [meta_new, reads] + } + .groupTuple() + // Paired-end reads cause a nested tuple during grouping. + // We want to present a flat list of files to `CAT_FASTQ`. + .map { meta, fastq -> [meta, fastq.flatten()] } + + + CAT_FASTQ(ch_concat_fastq) + + ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads + .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated) + + } else if ( params.shortread_qc_mergepairs && params.shortread_qc_excludeunmerged ) { + + ch_concat_fastq = Channel.empty() + .mix( + ADAPTERREMOVAL_PAIRED.out.collapsed, + ADAPTERREMOVAL_PAIRED.out.collapsed_truncated + ) + .map { meta, reads -> + def meta_new = meta.clone() + meta_new.single_end = true + [meta_new, reads] + } + .groupTuple() + .map { meta, fastq -> [meta, fastq.flatten()] } + + + CAT_FASTQ(ch_concat_fastq) + + ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads + .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated) + + } else { + + ch_adapterremoval_reads_prepped = ADAPTERREMOVAL_PAIRED.out.paired_truncated + .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated) + + } + + ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() ) + ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() ) + + ch_multiqc_files = ch_multiqc_files.mix( + ADAPTERREMOVAL_PAIRED.out.settings, + ADAPTERREMOVAL_SINGLE.out.settings + ) + + emit: + reads = ch_adapterremoval_reads_prepped // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/shortread_complexityfiltering.nf b/subworkflows/local/shortread_complexityfiltering.nf new file mode 100644 index 0000000..a34440d --- /dev/null +++ b/subworkflows/local/shortread_complexityfiltering.nf @@ -0,0 +1,33 @@ +// +// Check input samplesheet and get read channels +// + +include { BBMAP_BBDUK } from '../../modules/nf-core/modules/bbmap/bbduk/main' +include { PRINSEQPLUSPLUS } from '../../modules/nf-core/modules/prinseqplusplus/main' + +workflow SHORTREAD_COMPLEXITYFILTERING { + take: + reads // [ [ meta ], [ reads ] ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + // fastp complexity filtering is activated via modules.conf in shortread_preprocessing + if ( params.shortread_complexityfilter_tool == 'bbduk' ) { + ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads + ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( BBMAP_BBDUK.out.log ) + } else if ( params.shortread_complexityfilter_tool == 'prinseqplusplus' ) { + ch_filtered_reads = PRINSEQPLUSPLUS ( reads ).good_reads + ch_versions = ch_versions.mix( PRINSEQPLUSPLUS.out.versions.first() ) + } else { + ch_filtered_reads = reads + } + + emit: + reads = ch_filtered_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf new file mode 100644 index 0000000..05e0f3d --- /dev/null +++ b/subworkflows/local/shortread_fastp.nf @@ -0,0 +1,55 @@ +// +// Process short raw reads with FastP +// + +include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' +include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' + +workflow SHORTREAD_FASTP { + take: + reads // [[meta], [reads]] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + ch_input_for_fastp = reads + .branch{ + single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + // Last parameter here turns on merging of PE data + FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_qc_mergepairs ) + + if ( params.shortread_qc_mergepairs ) { + ch_fastp_reads_prepped_pe = FASTP_PAIRED.out.reads_merged + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = true + [ meta_new, [ reads ].flatten() ] + } + + ch_fastp_reads_prepped = ch_fastp_reads_prepped_pe.mix( FASTP_SINGLE.out.reads ) + + } else { + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads + .mix( FASTP_SINGLE.out.reads ) + } + + ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) + ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) + + ch_processed_reads = ch_fastp_reads_prepped + + ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json ) + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/shortread_hostremoval.nf b/subworkflows/local/shortread_hostremoval.nf new file mode 100644 index 0000000..c5f15c7 --- /dev/null +++ b/subworkflows/local/shortread_hostremoval.nf @@ -0,0 +1,34 @@ +// +// Remove host reads via alignment and export off-target reads +// + +include { BOWTIE2_BUILD } from '../../modules/nf-core/modules/bowtie2/build/main' +include { BOWTIE2_ALIGN } from '../../modules/nf-core/modules/bowtie2/align/main' + +workflow SHORTREAD_HOSTREMOVAL { + take: + reads // [ [ meta ], [ reads ] ] + reference // /path/to/fasta + index // /path/to/index + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( !params.shortread_hostremoval_index ) { + ch_bowtie2_index = BOWTIE2_BUILD ( reference ).index + ch_versions = ch_versions.mix( BOWTIE2_BUILD.out.versions ) + } else { + ch_bowtie2_index = index.first() + } + + BOWTIE2_ALIGN ( reads, ch_bowtie2_index, true, false ) + ch_versions = ch_versions.mix( BOWTIE2_ALIGN.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( BOWTIE2_ALIGN.out.log ) + + emit: + reads = BOWTIE2_ALIGN.out.fastq // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf new file mode 100644 index 0000000..977a317 --- /dev/null +++ b/subworkflows/local/shortread_preprocessing.nf @@ -0,0 +1,39 @@ +// +// Perform read trimming and merging +// + + +include { SHORTREAD_FASTP } from './shortread_fastp' +include { SHORTREAD_ADAPTERREMOVAL } from './shortread_adapterremoval' +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main' + +workflow SHORTREAD_PREPROCESSING { + take: + reads // [ [ meta ], [ reads ] ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( params.shortread_qc_tool == "fastp" ) { + ch_processed_reads = SHORTREAD_FASTP ( reads ).reads + ch_versions = ch_versions.mix( SHORTREAD_FASTP.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc ) + } else if ( params.shortread_qc_tool == "adapterremoval" ) { + ch_processed_reads = SHORTREAD_ADAPTERREMOVAL ( reads ).reads + ch_versions = ch_versions.mix( SHORTREAD_ADAPTERREMOVAL.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_ADAPTERREMOVAL.out.mqc ) + } else { + ch_processed_reads = reads + } + + FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf new file mode 100644 index 0000000..7a441ff --- /dev/null +++ b/subworkflows/local/standardisation_profiles.nf @@ -0,0 +1,141 @@ +// +// Standardise output files e.g. aggregation +// + +include { KAIJU_KAIJU2TABLE } from '../../modules/nf-core/modules/kaiju/kaiju2table/main' +include { KRAKENTOOLS_COMBINEKREPORTS } from '../../modules/nf-core/modules/krakentools/combinekreports/main' +include { KRAKENTOOLS_COMBINEKREPORTS as KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE } from '../../modules/nf-core/modules/krakentools/combinekreports/main' +include { METAPHLAN3_MERGEMETAPHLANTABLES } from '../../modules/nf-core/modules/metaphlan3/mergemetaphlantables/main' +include { MOTUS_MERGE } from '../../modules/nf-core/modules/motus/merge/main' + +workflow STANDARDISATION_PROFILES { + take: + classifications + profiles + databases + motu_version + + main: + ch_standardised_tables = Channel.empty() + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + /* + Split profile results based on tool they come from + */ + ch_input_profiles = profiles + .branch { + motus: it[0]['tool'] == 'motus' + kraken2: it[0]['tool'] == 'kraken2' + centrifuge: it[0]['tool'] == 'centrifuge' + metaphlan3: it[0]['tool'] == 'metaphlan3' + unknown: true + } + + ch_input_classifications = classifications + .branch { + kaiju: it[0]['tool'] == 'kaiju' + unknown: true + } + + ch_input_databases = databases + .branch { + motus: it[0]['tool'] == 'motus' + kaiju: it[0]['tool'] == 'kaiju' + unknown: true + } + + /* + Standardise and aggregate + */ + + // CENTRIFUGE + + // Collect and replace id for db_name for prefix + // Have to sort by size to ensure first file actually has hits otherwise + // the script fails + ch_profiles_for_centrifuge = ch_input_profiles.centrifuge + .map { [it[0]['db_name'], it[1]] } + .groupTuple(sort: {-it.size()} ) + .map { + [[id:it[0]], it[1]] + } + + KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE ( ch_profiles_for_centrifuge ) + ch_standardised_tables = ch_standardised_tables.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.txt ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.txt ) + ch_versions = ch_versions.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.versions ) + + // Kaiju + + // Collect and replace id for db_name for prefix + ch_profiles_for_kaiju = ch_input_classifications.kaiju + .map { [it[0]['db_name'], it[1]] } + .groupTuple() + .map { + [[id:it[0]], it[1]] + } + + KAIJU_KAIJU2TABLE ( ch_profiles_for_kaiju, ch_input_databases.kaiju.map{it[1]}, params.kaiju_taxon_rank) + ch_standardised_tables = ch_standardised_tables.mix( KAIJU_KAIJU2TABLE.out.summary ) + ch_multiqc_files = ch_multiqc_files.mix( KAIJU_KAIJU2TABLE.out.summary ) + ch_versions = ch_versions.mix( KAIJU_KAIJU2TABLE.out.versions ) + + // Kraken2 + + // Collect and replace id for db_name for prefix + // Have to sort by size to ensure first file actually has hits otherwise + // the script fails + ch_profiles_for_kraken2 = ch_input_profiles.kraken2 + .map { [it[0]['db_name'], it[1]] } + .groupTuple(sort: {-it.size()} ) + .map { + [[id:it[0]], it[1]] + } + + KRAKENTOOLS_COMBINEKREPORTS ( ch_profiles_for_kraken2 ) + ch_standardised_tables = ch_standardised_tables.mix( KRAKENTOOLS_COMBINEKREPORTS.out.txt ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS.out.txt ) + ch_versions = ch_versions.mix( KRAKENTOOLS_COMBINEKREPORTS.out.versions ) + + // MetaPhlAn3 + + ch_profiles_for_metaphlan3 = ch_input_profiles.metaphlan3 + .map { [it[0]['db_name'], it[1]] } + .groupTuple() + .map { + [[id:it[0]], it[1]] + } + + METAPHLAN3_MERGEMETAPHLANTABLES ( ch_profiles_for_metaphlan3 ) + ch_standardised_tables = ch_standardised_tables.mix( METAPHLAN3_MERGEMETAPHLANTABLES.out.txt ) + ch_multiqc_files = ch_multiqc_files.mix( METAPHLAN3_MERGEMETAPHLANTABLES.out.txt ) + ch_versions = ch_versions.mix( METAPHLAN3_MERGEMETAPHLANTABLES.out.versions ) + + // mOTUs + + // mOTUs has a 'single' database, and cannot create custom ones. + // Therefore removing db info here, and publish merged at root mOTUs results + // directory + + ch_profiles_for_motus = ch_input_profiles.motus + .map { [it[0]['db_name'], it[1]] } + .groupTuple() + .map { + [[id:it[0]], it[1]] + } + + MOTUS_MERGE ( ch_profiles_for_motus, ch_input_databases.motus.map{it[1]}, motu_version ) + + if ( params.generate_biom_output ) { + ch_standardised_tables = ch_standardised_tables.mix ( MOTUS_MERGE.out.biom ) + } else { + ch_standardised_tables = ch_standardised_tables.mix ( MOTUS_MERGE.out.txt ) + } + ch_versions = ch_versions.mix( MOTUS_MERGE.out.versions ) + + emit: + tables = ch_standardised_tables + versions = ch_versions + mqc = ch_multiqc_files +} diff --git a/subworkflows/local/visualization_krona.nf b/subworkflows/local/visualization_krona.nf new file mode 100644 index 0000000..f06768a --- /dev/null +++ b/subworkflows/local/visualization_krona.nf @@ -0,0 +1,105 @@ +// +// Create Krona visualizations +// + +include { MEGAN_RMA2INFO as MEGAN_RMA2INFO_KRONA } from '../../modules/nf-core/modules/megan/rma2info/main' +include { KAIJU_KAIJU2KRONA } from '../../modules/nf-core/modules/kaiju/kaiju2krona/main' +include { KRAKENTOOLS_KREPORT2KRONA } from '../../modules/nf-core/modules/krakentools/kreport2krona/main' +include { KRONA_CLEANUP } from '../../modules/local/krona_cleanup' +include { KRONA_KTIMPORTTEXT } from '../../modules/nf-core/modules/krona/ktimporttext/main' +include { KRONA_KTIMPORTTAXONOMY } from '../../modules/nf-core/modules/krona/ktimporttaxonomy/main' +include { GUNZIP } from '../../modules/nf-core/modules/gunzip/main' + +workflow VISUALIZATION_KRONA { + take: + classifications + profiles + databases + + main: + ch_krona_text = Channel.empty() + ch_krona_html = Channel.empty() + ch_versions = Channel.empty() + + /* + Split profile results based on tool they come from + */ + ch_input_profiles = profiles + .branch { + centrifuge: it[0]['tool'] == 'centrifuge' + kraken2: it[0]['tool'] == 'kraken2' + unknown: true + } + ch_input_classifications = classifications + .branch { + kaiju: it[0]['tool'] == 'kaiju' + malt: it[0]['tool'] == 'malt' + unknown: true + } + + /* + Convert Kraken2 formatted reports into Krona text files + */ + ch_kraken_reports = ch_input_profiles.kraken2 + .mix( ch_input_profiles.centrifuge ) + KRAKENTOOLS_KREPORT2KRONA ( ch_kraken_reports ) + ch_krona_text = ch_krona_text.mix( KRAKENTOOLS_KREPORT2KRONA.out.txt ) + ch_versions = ch_versions.mix( KRAKENTOOLS_KREPORT2KRONA.out.versions.first() ) + + /* + Combine Kaiju profiles with their databases + */ + ch_input_for_kaiju2krona = ch_input_classifications.kaiju + .map{ [it[0]['db_name'], it[0], it[1]] } + .combine( databases.map{ [it[0]['db_name'], it[1]] }, by: 0 ) + .multiMap{ + it -> + profiles: [it[1], it[2]] + db: it[3] + } + + /* + Convert Kaiju formatted reports into Krona text files + */ + KAIJU_KAIJU2KRONA( ch_input_for_kaiju2krona.profiles, ch_input_for_kaiju2krona.db ) + ch_krona_text = ch_krona_text.mix( KAIJU_KAIJU2KRONA.out.txt ) + ch_versions = ch_versions.mix( KAIJU_KAIJU2KRONA.out.versions.first() ) + + /* + Remove taxonomy level annotations from the Krona text files + */ + KRONA_CLEANUP( ch_krona_text ) + ch_cleaned_krona_text = KRONA_CLEANUP.out.txt + ch_versions = ch_versions.mix( KRONA_CLEANUP.out.versions.first() ) + + /* + Convert Krona text files into html Krona visualizations + */ + ch_krona_text_for_import = ch_cleaned_krona_text + .map{[[id: it[0]['db_name'], tool: it[0]['tool']], it[1]]} + .groupTuple() + + KRONA_KTIMPORTTEXT( ch_krona_text_for_import ) + ch_krona_html = ch_krona_html.mix( KRONA_KTIMPORTTEXT.out.html ) + ch_versions = ch_versions.mix( KRONA_KTIMPORTTEXT.out.versions.first() ) + + /* + Convert MALT/MEGAN RMA2INFO files into html Krona visualisations + */ + if ( params.krona_taxonomy_directory ) { + MEGAN_RMA2INFO_KRONA ( ch_input_classifications.malt, false ) + GUNZIP ( MEGAN_RMA2INFO_KRONA.out.txt ) + ch_krona_taxonomy_for_input = GUNZIP.out.gunzip + .map{[[id: it[0]['db_name'], tool: it[0]['tool']], it[1]]} + .groupTuple() + + KRONA_KTIMPORTTAXONOMY ( ch_krona_taxonomy_for_input, file(params.krona_taxonomy_directory, checkExists: true) ) + ch_krona_html.mix( KRONA_KTIMPORTTAXONOMY.out.html ) + ch_versions = ch_versions.mix( MEGAN_RMA2INFO_KRONA.out.versions.first() ) + ch_versions = ch_versions.mix( KRONA_KTIMPORTTAXONOMY.out.versions.first() ) + } + + emit: + html = ch_krona_html + versions = ch_versions +} diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 1167cea..8b9edb7 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -11,11 +11,36 @@ WorkflowTaxprofiler.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ] +def checkPathParamList = [ params.input, params.databases, params.hostremoval_reference, + params.shortread_hostremoval_index, params.multiqc_config + ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters -if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if ( params.input ) { + ch_input = file(params.input, checkIfExists: true) + pep_input_base_dir = file(params.input).extension.matches("yaml|yml") ? file(file(params.input).getParent(), checkIfExists: true) : [] +} else { + exit 1, "Input samplesheet, or PEP config and base directory not specified" +} + +if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } + +if (params.shortread_qc_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." +if (params.shortread_qc_excludeunmerged && !params.shortread_qc_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_qc_mergepairs" + +if (params.shortread_complexityfilter_tool == 'fastp' && ( params.perform_shortread_qc == false || params.shortread_qc_tool != 'fastp' )) exit 1, "ERROR: [nf-core/taxprofiler] cannot use fastp complexity filtering if preprocessing not turned on and/or tool is not fastp. Please specify --perform_shortread_qc and/or --shortread_qc_tool 'fastp'" + +if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." } +if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." } + +if (params.hostremoval_reference ) { ch_reference = file(params.hostremoval_reference) } +if (params.shortread_hostremoval_index ) { ch_shortread_reference_index = file(params.shortread_hostremoval_index ) } else { ch_shortread_reference_index = [] } +if (params.longread_hostremoval_index ) { ch_longread_reference_index = file(params.longread_hostremoval_index ) } else { ch_longread_reference_index = [] } + +if (params.diamond_save_reads ) log.warn "[nf-core/taxprofiler] DIAMOND only allows output of a single format. As --diamond_save_reads supplied, only aligned reads in SAM format will be produced, no taxonomic profiles will be available." + +if (params.run_malt && params.run_krona && !params.krona_taxonomy_directory) log.warn "[nf-core/taxprofiler] Krona can only be run on MALT output if path to Krona taxonomy database supplied to --krona_taxonomy_directory. Krona will not be executed in this run for MALT." /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -37,7 +62,17 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { INPUT_CHECK } from '../subworkflows/local/input_check' + +include { DB_CHECK } from '../subworkflows/local/db_check' +include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' +include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' +include { SHORTREAD_HOSTREMOVAL } from '../subworkflows/local/shortread_hostremoval' +include { LONGREAD_HOSTREMOVAL } from '../subworkflows/local/longread_hostremoval' +include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering' +include { PROFILING } from '../subworkflows/local/profiling' +include { VISUALIZATION_KRONA } from '../subworkflows/local/visualization_krona' +include { STANDARDISATION_PROFILES } from '../subworkflows/local/standardisation_profiles' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -51,6 +86,7 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -64,30 +100,150 @@ def multiqc_report = [] workflow TAXPROFILER { ch_versions = Channel.empty() + ch_multiqc_logo= Channel.fromPath("$projectDir/docs/images/nf-core-taxprofiler_logo_custom_light.png") - // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files - // + /* + SUBWORKFLOW: Read in samplesheet, validate and stage input files + */ INPUT_CHECK ( - ch_input + ch_input, pep_input_base_dir ) ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) - // - // MODULE: Run FastQC - // - FASTQC ( - INPUT_CHECK.out.reads + DB_CHECK ( + ch_databases ) + ch_versions = ch_versions.mix(DB_CHECK.out.versions) + + /* + MODULE: Run FastQC + */ + ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ) + + FASTQC ( + ch_input_for_fastqc + ) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + /* + SUBWORKFLOW: PERFORM PREPROCESSING + */ + if ( params.perform_shortread_qc ) { + ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads + ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) + } else { + ch_shortreads_preprocessed = INPUT_CHECK.out.fastq + } + + if ( params.perform_longread_qc ) { + ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads + .map { it -> [ it[0], [it[1]] ] } + ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions ) + } else { + ch_longreads_preprocessed = INPUT_CHECK.out.nanopore + } + + /* + SUBWORKFLOW: COMPLEXITY FILTERING + */ + + // fastp complexity filtering is activated via modules.conf in shortread_preprocessing + if ( params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp' ) { + ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads + ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) + } else { + ch_shortreads_filtered = ch_shortreads_preprocessed + } + + /* + SUBWORKFLOW: HOST REMOVAL + */ + + if ( params.perform_shortread_hostremoval ) { + ch_shortreads_hostremoved = SHORTREAD_HOSTREMOVAL ( ch_shortreads_filtered, ch_reference, ch_shortread_reference_index ).reads + ch_versions = ch_versions.mix(SHORTREAD_HOSTREMOVAL.out.versions) + } else { + ch_shortreads_hostremoved = ch_shortreads_filtered + } + + if ( params.perform_longread_hostremoval ) { + ch_longreads_hostremoved = LONGREAD_HOSTREMOVAL ( ch_longreads_preprocessed, ch_reference, ch_longread_reference_index ).reads + ch_versions = ch_versions.mix(LONGREAD_HOSTREMOVAL.out.versions) + } else { + ch_longreads_hostremoved = ch_longreads_preprocessed + } + + if ( params.perform_runmerging ) { + + ch_reads_for_cat_branch = ch_shortreads_hostremoved + .mix( ch_longreads_hostremoved ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new.remove('run_accession') + [ meta_new, reads ] + } + .groupTuple() + .map { + meta, reads -> + [ meta, reads.flatten() ] + } + .branch { + meta, reads -> + // we can't concatenate files if there is not a second run, we branch + // here to separate them out, and mix back in after for efficiency + cat: ( meta.single_end && reads.size() > 1 ) || ( !meta.single_end && reads.size() > 2 ) + skip: true + } + + ch_reads_runmerged = CAT_FASTQ ( ch_reads_for_cat_branch.cat ).reads + .mix( ch_reads_for_cat_branch.skip ) + .map { + meta, reads -> + [ meta, [ reads ].flatten() ] + } + .mix( INPUT_CHECK.out.fasta ) + + ch_versions = ch_versions.mix(CAT_FASTQ.out.versions) + + } else { + ch_reads_runmerged = ch_shortreads_hostremoved + .mix( ch_longreads_hostremoved, INPUT_CHECK.out.fasta ) + } + + /* + SUBWORKFLOW: PROFILING + */ + + PROFILING ( ch_reads_runmerged, DB_CHECK.out.dbs ) + ch_versions = ch_versions.mix( PROFILING.out.versions ) + + /* + SUBWORKFLOW: VISUALIZATION_KRONA + */ + if ( params.run_krona ) { + VISUALIZATION_KRONA ( PROFILING.out.classifications, PROFILING.out.profiles, DB_CHECK.out.dbs ) + ch_versions = ch_versions.mix( VISUALIZATION_KRONA.out.versions ) + } + + /* + SUBWORKFLOW: PROFILING STANDARDISATION + */ + if ( params.run_profile_standardisation ) { + STANDARDISATION_PROFILES ( PROFILING.out.classifications, PROFILING.out.profiles, DB_CHECK.out.dbs, PROFILING.out.motus_version ) + ch_versions = ch_versions.mix( STANDARDISATION_PROFILES.out.versions ) + } + + /* + MODULE: MultiQC + */ + CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) - // - // MODULE: MultiQC - // + workflow_summary = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) @@ -100,6 +256,29 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + if (params.perform_shortread_qc) { + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) + } + + if (params.perform_longread_qc) { + ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) + } + + if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp'){ + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) + } + + if (params.perform_shortread_hostremoval) { + ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_HOSTREMOVAL.out.mqc.collect{it[1]}.ifEmpty([])) + } + + ch_multiqc_files = ch_multiqc_files.mix( PROFILING.out.mqc.collect{it[1]}.ifEmpty([]) ) + + if ( params.run_profile_standardisation ) { + ch_multiqc_files = ch_multiqc_files.mix( STANDARDISATION_PROFILES.out.mqc.collect{it[1]}.ifEmpty([]) ) + } + + // TODO create multiQC module for metaphlan MULTIQC ( ch_multiqc_files.collect(), ch_multiqc_config.collect().ifEmpty([]),