diff --git a/README.md b/README.md index b94fa02..60baf67 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ ## Introduction -**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for Taxonomic profiling of shotgun metagenomic data. +**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling against multiple profiling tools and databases and produces standardised output tables. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! @@ -29,7 +29,23 @@ On release, automated continuous integration tests run the pipeline on a full-si 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +2. Performs optional read pre-processing + - Adapter clipping and merging + - Low complexity filtering + - Host read removal + - Run merging +3. Performs taxonomic profiling a choice of: + - Kraken2 + - MetaPhlAn3 + - MALT + - DIAMOND + - Centrifuge + - Kaiju + - mOTUs +4. Perform optional post-processing with: + - bracken +5. Standardises output tables +6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) ## Quick Start diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 5473b62..16e668b 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -1,249 +1,227 @@ #!/usr/bin/env python - -"""Provide a command line tool to validate and transform tabular samplesheets.""" - - -import argparse -import csv -import logging +from distutils import extension +import os import sys -from collections import Counter -from pathlib import Path +import errno +import argparse -logger = logging.getLogger() - - -class RowChecker: - """ - Define a service that can validate and transform each given row. - - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. The order of rows is maintained. - - """ - - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", +def parse_args(args=None): + Description = ( + "Reformat nf-core/taxprofiler samplesheet file and check its contents." ) + Epilog = "Example usage: python check_samplesheet.py " - def __init__( - self, - sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - single_col="single_end", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input samplesheet file.") + parser.add_argument("FILE_OUT", help="Output file.") + return parser.parse_args(args) - Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). - first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). 
- second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - single_col (str): The name of the new column that will be inserted and - records whether the sample contains single- or paired-end sequencing - reads (default "single_end"). - """ - super().__init__(**kwargs) - self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._single_col = single_col - self._seen = set() - self.modified = [] +def make_dir(path): + if len(path) > 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise exception - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) - self.modified.append(row) - - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - assert len(row[self._sample_col]) > 0, "Sample input is required." - # Sanitize samples slightly. - row[self._sample_col] = row[self._sample_col].replace(" ", "_") - - def _validate_first(self, row): - """Assert that the first FASTQ entry is non-empty and has the right format.""" - assert len(row[self._first_col]) > 0, "At least the first FASTQ file is required." - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. Report pair status.""" - if row[self._first_col] and row[self._second_col]: - row[self._single_col] = False - assert ( - Path(row[self._first_col]).suffixes == Path(row[self._second_col]).suffixes - ), "FASTQ pairs must have the same file extensions." - else: - row[self._single_col] = True - - def _validate_fastq_format(self, filename): - """Assert that a given filename has one of the expected FASTQ extensions.""" - assert any(filename.endswith(extension) for extension in self.VALID_FORMATS), ( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check samplesheet -> {}".format(error) + if context != "" and context_str != "": + error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() ) - - def validate_unique_samples(self): - """ - Assert that the combination of sample name and FASTQ filename is unique. - - In addition to the validation, also rename the sample if more than one sample, - FASTQ file combination exists. - - """ - assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique." - if len({pair[0] for pair in self._seen}) < len(self._seen): - counts = Counter(pair[0] for pair in self._seen) - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - if counts[sample] > 1: - row[self._sample_col] = f"{sample}_T{seen[sample]}" - - -def sniff_format(handle): - """ - Detect the tabular format. 
- - Args: - handle (text file): A handle to a `text file`_ object. The read position is - expected to be at the beginning (index 0). - - Returns: - csv.Dialect: The detected tabular format. - - .. _text file: - https://docs.python.org/3/glossary.html#term-text-file - - """ - peek = handle.read(2048) - sniffer = csv.Sniffer() - if not sniffer.has_header(peek): - logger.critical(f"The given sample sheet does not appear to contain a header.") - sys.exit(1) - dialect = sniffer.sniff(peek) - handle.seek(0) - return dialect - + print(error_str) + sys.exit(1) def check_samplesheet(file_in, file_out): """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. - - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. - - Args: - file_in (pathlib.Path): The given tabular samplesheet. The format can be either - CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. - file_out (pathlib.Path): Where the validated and transformed samplesheet should - be created; always in CSV format. - - Example: - This function checks that the samplesheet follows the following structure, - see also the `viral recon samplesheet`_:: - - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, - - .. _viral recon samplesheet: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv + This function checks that the samplesheet follows the following structure: + sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta + 2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz + 2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz, + 2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,, + 2613,ERR5766181,ILLUMINA,ERX5474937_ERR5766181_1.fastq.gz,ERX5474937_ERR5766181_2.fastq.gz, """ - required_columns = {"sample", "fastq_1", "fastq_2"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Validate the existence of the expected header columns. - if not required_columns.issubset(reader.fieldnames): - logger.critical(f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}.") + + FQ_EXTENSIONS = (".fq", ".fq.gz", ".fastq", ".fastq.gz") + FA_EXTENSIONS = ( + ".fa", + ".fa.gz", + ".fasta", + ".fasta.gz", + ".fna", + ".fna.gz", + ".fas", + ".fas.gz", + ) + INSTRUMENT_PLATFORMS = [ + "ABI_SOLID", + "BGISEQ", + "CAPILLARY", + "COMPLETE_GENOMICS", + "DNBSEQ", + "HELICOS", + "ILLUMINA", + "ION_TORRENT", + "LS454", + "OXFORD_NANOPORE", + "PACBIO_SMRT", + ] + + sample_mapping_dict = {} + with open(file_in, "r") as fin: + + ## Check header + MIN_COLS = 4 + HEADER = [ + "sample", + "run_accession", + "instrument_platform", + "fastq_1", + "fastq_2", + "fasta", + ] + header = [x.strip('"') for x in fin.readline().strip().split(",")] + if header[: len(HEADER)] != HEADER: + print( + "ERROR: Please check samplesheet header -> {} != {}".format( + ",".join(header), ",".join(HEADER) + ) + ) sys.exit(1) - # Validate each row. 
- checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - header.insert(1, "single_end") - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) + + ## Check sample entries + for line in fin: + lspl = [x.strip().strip('"') for x in line.strip().split(",")] + + # Check valid number of columns per row + if len(lspl) < len(HEADER): + print_error( + "Invalid number of columns (minimum = {})!".format(len(HEADER)), + "Line", + line, + ) + num_cols = len([x for x in lspl if x]) + if num_cols < MIN_COLS: + print_error( + "Invalid number of populated columns (minimum = {})!".format( + MIN_COLS + ), + "Line", + line, + ) + + ## Check sample name entries + ( + sample, + run_accession, + instrument_platform, + fastq_1, + fastq_2, + fasta, + ) = lspl[: len(HEADER)] + sample = sample.replace(" ", "_") + if not sample: + print_error("Sample entry has not been specified!", "Line", line) + + ## Check FastQ file extension + for fastq in [fastq_1, fastq_2]: + if fastq: + if fastq.find(" ") != -1: + print_error("FastQ file contains spaces!", "Line", line) + if not fastq.endswith(FQ_EXTENSIONS): + print_error( + f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !", + "Line", + line, + ) + if fasta: + if fasta.find(" ") != -1: + print_error("FastA file contains spaces!", "Line", line) + if not fasta.endswith(FA_EXTENSIONS): + print_error( + f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!", + "Line", + line, + ) + sample_info = [] + + # Check run_accession + if not run_accession: + print_error("Run accession has not been specified!", "Line", line) + else: + sample_info.append(run_accession) + + # Check instrument_platform + if not instrument_platform: + print_error("Instrument platform has not been specified!", "Line", line) + else: + if instrument_platform not in INSTRUMENT_PLATFORMS: + print_error( + f"Instrument platform {instrument_platform} is not supported!", + f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}", + "Line", + line, + ) + sample_info.append(instrument_platform) + + ## Auto-detect paired-end/single-end + if sample and fastq_1 and fastq_2: ## Paired-end short reads + sample_info.extend(["0", fastq_1, fastq_2, fasta]) + elif sample and fastq_1 and not fastq_2: ## Single-end short reads + sample_info.extend(["1", fastq_1, fastq_2, fasta]) + elif ( + sample and fasta and not fastq_1 and not fastq_2 + ): ## Single-end long reads + sample_info.extend(["1", fastq_1, fastq_2, fasta]) + elif fasta and (fastq_1 or fastq_2): + print_error( + "FastQ and FastA files cannot be specified together in the same library!", + "Line", + line, + ) + else: + print_error("Invalid combination of columns provided!", "Line", line) + + ## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, single_end, fastq_1, fastq_2 , fasta ] } + if sample not in sample_mapping_dict: + sample_mapping_dict[sample] = [sample_info] + else: + if sample_info in sample_mapping_dict[sample]: + print_error("Samplesheet contains duplicate rows!", "Line", line) + else: + sample_mapping_dict[sample].append(sample_info) 
+ + ## Write validated samplesheet with appropriate columns + HEADER_OUT = [ + "sample", + "run_accession", + "instrument_platform", + "single_end", + "fastq_1", + "fastq_2", + "fasta", + ] + if len(sample_mapping_dict) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(HEADER_OUT) + "\n") + for sample in sorted(sample_mapping_dict.keys()): + for idx, val in enumerate(sample_mapping_dict[sample]): + fout.write(f"{sample},{','.join(val)}\n") + else: + print_error("No entries to process!", "Samplesheet: {}".format(file_in)) -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Transformed output samplesheet in CSV format.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(argv) - - -def main(argv=None): - """Coordinate argument parsing and program execution.""" - args = parse_args(argv) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - if not args.file_in.is_file(): - logger.error(f"The given input file {args.file_in} was not found!") - sys.exit(2) - args.file_out.parent.mkdir(parents=True, exist_ok=True) - check_samplesheet(args.file_in, args.file_out) +def main(args=None): + args = parse_args(args) + check_samplesheet(args.FILE_IN, args.FILE_OUT) if __name__ == "__main__": diff --git a/conf/modules.config b/conf/modules.config index da58a5d..9e334bc 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -28,6 +28,65 @@ process { withName: FASTQC { ext.args = '--quiet' + ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } + publishDir = [ + path: { "${params.outdir}/fastqc/raw" }, + mode: 'copy', + pattern: '*.html' + ] + } + + withName: FASTP { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + // TODO also include option to NOT merge + ext.args = [ + { ${meta.single_end} } == 0 ? "-m" : '', + params.fastp_exclude_unmerged ? 
'' : "--include_unmerged" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/fastp" }, + mode: 'copy', + pattern: '*.fastq.gz' + ] + } + + withName: FASTQC_POST { + ext.args = '--quiet' + ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } + publishDir = [ + path: { "${params.outdir}/fastqc/processed" }, + mode: 'copy', + pattern: '*.html' + ] + } + + withName: CAT_FASTQ { + publishDir = [ + path: { "${params.outdir}/prepared_sequences" }, + mode: 'copy', + pattern: '*.fastq.gz' + ] + } + + withName: MALT_RUN { + publishDir = [ + path: { "${params.outdir}/malt/${meta.db_name}" }, + mode: 'copy', + pattern: '*.{rma6,tab,text,sam,log}' + ] + ext.args = { "${meta.db_params}" } + ext.when = params.run_malt + } + + withName: KRAKEN2_KRAKEN2 { + publishDir = [ + path: { "${params.outdir}/kraken2/${meta.db_name}" }, + mode: 'copy', + pattern: '.{fastq.gz,txt}' + ] + ext.args = { "${meta.db_params}" } + ext.when = params.run_kraken2 + ext.prefix = { "${meta.id}-${meta.db_name}" } } withName: CUSTOM_DUMPSOFTWAREVERSIONS { diff --git a/conf/test.config b/conf/test.config index 42772cf..51f3bb6 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,8 +22,6 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - // Genome references - genome = 'R64-1-1' } diff --git a/lib/WorkflowTaxprofiler.groovy b/lib/WorkflowTaxprofiler.groovy index a9b80a9..93f0467 100755 --- a/lib/WorkflowTaxprofiler.groovy +++ b/lib/WorkflowTaxprofiler.groovy @@ -10,10 +10,11 @@ class WorkflowTaxprofiler { public static void initialise(params, log) { genomeExistsError(params, log) - if (!params.fasta) { - log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." - System.exit(1) - } + // TODO update as necessary + //if (!params.fasta) { + // log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." + // System.exit(1) + //} } // diff --git a/modules.json b/modules.json index 939b85f..6a785b8 100644 --- a/modules.json +++ b/modules.json @@ -3,12 +3,24 @@ "homePage": "https://github.com/nf-core/taxprofiler", "repos": { "nf-core/modules": { + "cat/fastq": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + }, "custom/dumpsoftwareversions": { "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" }, + "fastp": { + "git_sha": "d0a1cbb703a130c19f6796c3fce24fbe7dfce789" + }, "fastqc": { "git_sha": "9d0cad583b9a71a6509b754fdf589cbfbed08961" }, + "kraken2/kraken2": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + }, + "malt/run": { + "git_sha": "72b96f4e504eef673f2b5c13560a9d90b669129b" + }, "multiqc": { "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" } diff --git a/modules/local/database_check.nf b/modules/local/database_check.nf new file mode 100644 index 0000000..4da4313 --- /dev/null +++ b/modules/local/database_check.nf @@ -0,0 +1,25 @@ +process DATABASE_CHECK { + tag "$databasesheet" + + conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'quay.io/biocontainers/python:3.8.3' }" + + input: + path databasesheet + + output: + path '*.csv' , emit: csv + path "versions.yml", emit: versions + + script: // This script is bundled with the pipeline, in nf-core/taxprofiler/bin/ + """ + cat $databasesheet >> database_sheet.valid.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/cat/fastq/main.nf b/modules/nf-core/modules/cat/fastq/main.nf new file mode 100644 index 0000000..bf0877c --- /dev/null +++ b/modules/nf-core/modules/cat/fastq/main.nf @@ -0,0 +1,51 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' : + 'biocontainers/biocontainers:v1.2.0_cv1' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads.collect{ it.toString() } + if (meta.single_end) { + if (readList.size > 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } +} diff --git a/modules/nf-core/modules/cat/fastq/meta.yml b/modules/nf-core/modules/cat/fastq/meta.yml new file mode 100644 index 0000000..c836598 --- /dev/null +++ b/modules/nf-core/modules/cat/fastq/meta.yml @@ -0,0 +1,39 @@ +name: cat_fastq +description: Concatenates fastq files +keywords: + - fastq + - concatenate +tools: + - cat: + description: | + The cat utility reads files sequentially, writing them to the standard output. + documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: list + description: | + List of input FastQ files to be concatenated. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/fastp/main.nf b/modules/nf-core/modules/fastp/main.nf new file mode 100644 index 0000000..5c9e3b8 --- /dev/null +++ b/modules/nf-core/modules/fastp/main.nf @@ -0,0 +1,75 @@ +process FASTP { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? 'bioconda::fastp=0.23.2' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fastp:0.23.2--h79da9fb_0' : + 'quay.io/biocontainers/fastp:0.23.2--h79da9fb_0' }" + + input: + tuple val(meta), path(reads) + val save_trimmed_fail + val save_merged + + output: + tuple val(meta), path('*.trim.fastq.gz') , optional:true, emit: reads + tuple val(meta), path('*.json') , emit: json + tuple val(meta), path('*.html') , emit: html + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail + tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + // Added soft-links to original fastqs for consistent naming in MultiQC + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + def fail_fastq = save_trimmed_fail ? "--failed_out ${prefix}.fail.fastq.gz" : '' + """ + [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz + fastp \\ + --in1 ${prefix}.fastq.gz \\ + --out1 ${prefix}.trim.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else { + def fail_fastq = save_trimmed_fail ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : '' + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + fastp \\ + --in1 ${prefix}_1.fastq.gz \\ + --in2 ${prefix}_2.fastq.gz \\ + --out1 ${prefix}_1.trim.fastq.gz \\ + --out2 ${prefix}_2.trim.fastq.gz \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $fail_fastq \\ + $merge_fastq \\ + --thread $task.cpus \\ + --detect_adapter_for_pe \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/modules/fastp/meta.yml b/modules/nf-core/modules/fastp/meta.yml new file mode 100644 index 0000000..3274e41 --- /dev/null +++ b/modules/nf-core/modules/fastp/meta.yml @@ -0,0 +1,68 @@ +name: fastp +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - quality control + - fastq +tools: + - fastp: + description: | + A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance. 
+ documentation: https://github.com/OpenGene/fastp + doi: https://doi.org/10.1093/bioinformatics/bty560 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - save_trimmed_fail: + type: boolean + description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz` + - save_merged: + type: boolean + description: Specify true to save all merged reads to the a file ending in `*.merged.fastq.gz` + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified/unmerged fastq reads + pattern: "*trim.fastq.gz" + - json: + type: file + description: Results in JSON format + pattern: "*.json" + - html: + type: file + description: Results in HTML format + pattern: "*.html" + - log: + type: file + description: fastq log file + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads_fail: + type: file + description: Reads the failed the preprocessing + pattern: "*fail.fastq.gz" + - reads_merged: + type: file + description: Reads that were successfully merged + pattern: "*.{merged.fastq.gz}" +authors: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/modules/kraken2/kraken2/main.nf b/modules/nf-core/modules/kraken2/kraken2/main.nf new file mode 100644 index 0000000..3ec5df5 --- /dev/null +++ b/modules/nf-core/modules/kraken2/kraken2/main.nf @@ -0,0 +1,49 @@ +process KRAKEN2_KRAKEN2 { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? 'bioconda::kraken2=2.1.2 conda-forge::pigz=2.6' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' : + 'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }" + + input: + tuple val(meta), path(reads) + path db + + output: + tuple val(meta), path('*classified*') , emit: classified + tuple val(meta), path('*unclassified*'), emit: unclassified + tuple val(meta), path('*report.txt') , emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "" : "--paired" + def classified = meta.single_end ? "${prefix}.classified.fastq" : "${prefix}.classified#.fastq" + def unclassified = meta.single_end ? 
"${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq" + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --unclassified-out $unclassified \\ + --classified-out $classified \\ + --report ${prefix}.kraken2.report.txt \\ + --gzip-compressed \\ + $paired \\ + $args \\ + $reads + + pigz -p $task.cpus *.fastq + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/kraken2/kraken2/meta.yml b/modules/nf-core/modules/kraken2/kraken2/meta.yml new file mode 100644 index 0000000..9d6a385 --- /dev/null +++ b/modules/nf-core/modules/kraken2/kraken2/meta.yml @@ -0,0 +1,60 @@ +name: kraken2_kraken2 +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - kraken2: + description: | + Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads + homepage: https://ccb.jhu.edu/software/kraken2/ + documentation: https://github.com/DerrickWood/kraken2/wiki/Manual + doi: 10.1186/s13059-019-1891-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Kraken2 database +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified: + type: file + description: | + Reads classified to belong to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - unclassified: + type: file + description: | + Reads not classified to belong to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - txt: + type: file + description: | + Kraken2 report containing stats about classified + and not classifed reads. + pattern: "*.{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/malt/run/main.nf b/modules/nf-core/modules/malt/run/main.nf new file mode 100644 index 0000000..4e2e50c --- /dev/null +++ b/modules/nf-core/modules/malt/run/main.nf @@ -0,0 +1,50 @@ +process MALT_RUN { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::malt=0.53" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/malt:0.53--hdfd78af_0' : + 'quay.io/biocontainers/malt:0.53--hdfd78af_0' }" + + input: + tuple val(meta), path(fastqs) + val mode + path index + + output: + tuple val(meta), path("*.rma6") , emit: rma6 + tuple val(meta), path("*.{tab,text,sam}"), optional:true, emit: alignments + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def avail_mem = 6 + if (!task.memory) { + log.info '[MALT_RUN] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.' 
+ } else { + avail_mem = task.memory.giga + } + + """ + malt-run \\ + -J-Xmx${avail_mem}g \\ + -t $task.cpus \\ + -v \\ + -o . \\ + $args \\ + --inFile ${fastqs.join(' ')} \\ + -m $mode \\ + --index $index/ |&tee ${prefix}-malt-run.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + malt: \$(malt-run --help 2>&1 | grep -o 'version.* ' | cut -f 1 -d ',' | cut -f2 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/malt/run/meta.yml b/modules/nf-core/modules/malt/run/meta.yml new file mode 100644 index 0000000..66f2d7a --- /dev/null +++ b/modules/nf-core/modules/malt/run/meta.yml @@ -0,0 +1,58 @@ +name: malt_run +description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics. +keywords: + - malt + - alignment + - metagenomics + - ancient DNA + - aDNA + - palaeogenomics + - archaeogenomics + - microbiome +tools: + - malt: + description: A tool for mapping metagenomic data + homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/ + documentation: https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf + tool_dev_url: None + doi: "10.1038/s41559-017-0446-6" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastqs: + type: file + description: Input FASTQ files + pattern: "*.{fastq.gz,fq.gz}" + - mode: + type: string + description: Program mode + pattern: "Unknown|BlastN|BlastP|BlastX|Classifier" + - index: + type: directory + description: Index/database directory from malt-build + pattern: "*/" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - rma6: + type: file + description: MEGAN6 RMA6 file + pattern: "*.rma6" + - sam: + type: file + description: Alignment files in Tab, Text or MEGAN-compatible SAM format + pattern: "*.{tab,txt,sam}" + - log: + type: file + description: Log of verbose MALT stdout + pattern: "*-malt-run.log" + +authors: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 97ce0a8..7b897ab 100644 --- a/nextflow.config +++ b/nextflow.config @@ -34,7 +34,7 @@ params { help = false validate_params = true show_hidden_params = false - schema_ignore_params = 'genomes' + schema_ignore_params = 'genomes,fasta' enable_conda = false // Config options @@ -51,6 +51,19 @@ params { max_cpus = 16 max_time = '240.h' + // Databaess + databases = null + + // FASTQ preprocessing + fastp_clip_merge = false + fastp_exclude_unmerged = true + + // MALT + run_malt = false + malt_mode = 'BlastN' + + // kraken2 + run_kraken2 = false } // Load base.config by default for all pipelines diff --git a/nextflow_schema.json b/nextflow_schema.json index fa4588a..4ab5c06 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -57,15 +57,6 @@ "fa_icon": "fas fa-book", "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." 
}, - "fasta": { - "type": "string", - "format": "file-path", - "mimetype": "text/plain", - "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", - "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", - "fa_icon": "far fa-file-code" - }, "igenomes_base": { "type": "string", "format": "directory-path", diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf new file mode 100644 index 0000000..909d98f --- /dev/null +++ b/subworkflows/local/db_check.nf @@ -0,0 +1,40 @@ +// +// Check input samplesheet and get read channels +// + +include { DATABASE_CHECK } from '../../modules/local/database_check' + +workflow DB_CHECK { + take: + dbsheet // file: /path/to/dbsheet.csv + + main: + + // TODO: make database sheet check + parsed_samplesheet = DATABASE_CHECK ( dbsheet ) + .csv + .splitCsv ( header:true, sep:',' ) + .dump(tag: "db_split_csv_out") + .map { create_db_channels(it) } + .dump(tag: "db_channel_prepped") + .set{ dbs } + + emit: + dbs // channel: [ val(meta), [ db ] ] + versions = DATABASE_CHECK.out.versions // channel: [ versions.yml ] +} + +def create_db_channels(LinkedHashMap row) { + def meta = [:] + meta.tool = row.tool + meta.db_name = row.db_name + meta.db_params = row.db_params + + def array = [] + if (!file(row.db_path, type: 'dir').exists()) { + exit 1, "ERROR: Please check input samplesheet -> database could not be found!\n${row.db_path}" + } + array = [ meta, file(row.db_path) ] + + return array +} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 0aecf87..d66fb3a 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -9,14 +9,28 @@ workflow INPUT_CHECK { samplesheet // file: /path/to/samplesheet.csv main: - SAMPLESHEET_CHECK ( samplesheet ) + parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channel(it) } - .set { reads } + .dump(tag: "input_split_csv_out") + .branch { + fasta: it['fasta'] != '' + fastq: true + } + + parsed_samplesheet.fastq + .map { create_fastq_channels(it) } + .dump(tag: "fastq_channel_init") + .set { fastq } + + parsed_samplesheet.fasta + .map { create_fasta_channels(it) } + .dump(tag: "fasta_channel_init") + .set { fasta } emit: - reads // channel: [ val(meta), [ reads ] ] + fastq // channel: [ val(meta), [ reads ] ] + fasta // channel: [ val(meta), fasta ] versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] } @@ -24,8 +38,10 @@ workflow INPUT_CHECK { def create_fastq_channel(LinkedHashMap row) { // create meta map def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() + meta.id = row.sample + meta.run_accession = row.run_accession + meta.instrument_platform = row.instrument_platform + meta.single_end = row.single_end.toBoolean() // add path(s) of the fastq file(s) to the meta map def fastq_meta = [] @@ -42,3 +58,20 @@ def create_fastq_channel(LinkedHashMap row) { } return fastq_meta } + +// Function to get list of [ meta, fasta ] +def create_fasta_channels(LinkedHashMap row) { + def meta = [:] + meta.id = row.sample + meta.run_accession = row.run_accession + meta.instrument_platform = row.instrument_platform + meta.single_end = true + + def array = [] + if (!file(row.fasta).exists()) { + exit 1, "ERROR: Please check input samplesheet -> 
FastA file does not exist!\n${row.fasta}" + } + array = [ meta, [ file(row.fasta) ] ] + + return array +} diff --git a/subworkflows/local/preprocessing.nf b/subworkflows/local/preprocessing.nf new file mode 100644 index 0000000..5832824 --- /dev/null +++ b/subworkflows/local/preprocessing.nf @@ -0,0 +1,73 @@ +// +// Check input samplesheet and get read channels +// + + +include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' +include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' +include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main' + +workflow FASTQ_PREPROCESSING { + take: + reads // file: /path/to/samplesheet.csv + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + // + // STEP: Read clipping and merging + // + // TODO give option to clip only and retain pairs + // TODO give option to retain singletons (probably fastp option likely) + // TODO move to subworkflow + + + if ( params.fastp_clip_merge ) { + + ch_input_for_fastp = reads + .dump(tag: "pre-fastp_branch") + .branch{ + single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + ch_input_for_fastp.single.dump(tag: "input_fastp_single") + ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") + + FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + FASTP_PAIRED ( ch_input_for_fastp.paired, false, true ) + + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged + .mix( FASTP_SINGLE.out.reads ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + + FASTQC_POST ( ch_fastp_reads_prepped ) + + ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) + ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) + + ch_processed_reads = ch_fastp_reads_prepped + + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) + + ch_multiqc_files.dump(tag: "preprocessing_mqc_final") + + } else { + ch_processed_reads = reads + } + + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index e8cb60a..f740324 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -11,11 +11,12 @@ WorkflowTaxprofiler.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ] +def checkPathParamList = [ params.input, params.databases, params.multiqc_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters -if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -35,7 +36,11 @@ ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath(params.multi // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { INPUT_CHECK } from '../subworkflows/local/input_check' + +include { DB_CHECK } from '../subworkflows/local/db_check' +include { FASTQ_PREPROCESSING } from '../subworkflows/local/preprocessing' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -50,6 +55,11 @@ include { FASTQC } from '../modules/nf-core/modules/fastqc/ include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' +include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' +include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' +include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' + + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -71,11 +81,15 @@ workflow TAXPROFILER { ) ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) + DB_CHECK ( + ch_databases + ) + // // MODULE: Run FastQC // FASTQC ( - INPUT_CHECK.out.reads + INPUT_CHECK.out.fastq ) ch_versions = ch_versions.mix(FASTQC.out.versions.first()) @@ -83,6 +97,88 @@ workflow TAXPROFILER { ch_versions.unique().collectFile(name: 'collated_versions.yml') ) + // + // PERFORM PREPROCESSING + // + if ( params.fastp_clip_merge ) { + FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq ) + } + + // + // PERFORM RUN MERGING + // + ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads + .dump(tag: "prep_for_combine_grouping") + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['run_accession'] = 'combined' + [ meta_new, reads ] + } + .groupTuple ( by: 0 ) + .branch{ + combine: it[1].size() >= 2 + skip: it[1].size() < 2 + } + + CAT_FASTQ ( ch_processed_for_combine.combine ) + + ch_reads_for_profiling = ch_processed_for_combine.skip + .dump(tag: "skip_combine") + .mix( CAT_FASTQ.out.reads ) + .dump(tag: "files_for_profiling") + + // + // COMBINE READS WITH POSSIBLE DATABASES + // + + // output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] + ch_input_for_profiling = ch_reads_for_profiling + .combine(DB_CHECK.out.dbs) + .dump(tag: "reads_plus_db") + .branch { + malt: it[2]['tool'] == 'malt' + kraken2: it[2]['tool'] == 'kraken2' + unknown: true + } + + // + // PREP PROFILER INPUT CHANNELS ON PER TOOL BASIS + // + + // We groupTuple to have all samples in one channel for MALT as database + // loading takes a long time, so we only want to run it once per database + ch_input_for_malt = ch_input_for_profiling.malt + .map { + it -> + def temp_meta = [ id: it[2]['db_name']] + it[2] + def db = it[3] + [ temp_meta, it[1], db ] + } + .groupTuple(by: [0,2]) + .dump(tag: "input for malt") + .multiMap { + it -> + reads: [ it[0], it[1].flatten() ] + db: it[2] + } + + // We can run Kraken2 one-by-one sample-wise + ch_input_for_kraken2 = ch_input_for_profiling.kraken2 + .dump(tag: "input for kraken") + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + // + // RUN PROFILING + // + MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) + + // // MODULE: MultiQC // 
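// Illustrative sketch only (not part of the patch): given the dump tags and the multiMap
// blocks in the profiling section above, and assuming one MALT and one Kraken2 entry in the
// database sheet, the per-tool channels are expected to look roughly like:
//
//   ch_input_for_malt.reads    // [ [ id:'malt95', tool:'malt', db_name:'malt95', db_params:'"-id 90"' ], [ 2611.merged.fastq.gz, 2612.merged.fastq.gz ] ]
//   ch_input_for_malt.db       // /path/to/malt95
//   ch_input_for_kraken2.reads // [ [ id:'2612', run_accession:'combined', single_end:1, tool:'kraken2', db_name:'k2_db', db_params:'' ], 2612.merged.fastq.gz ]
//   ch_input_for_kraken2.db    // /path/to/k2_db
//
// i.e. MALT receives all samples grouped per database (one task per database, since index
// loading is expensive), while Kraken2 is launched once per sample/database pair. The
// database names and paths shown here are hypothetical placeholders.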
@@ -95,7 +191,20 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + if (params.fastp_clip_merge) { + ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc) + } + if (params.run_kraken2) { + ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])) + ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first()) + } + if (params.run_malt) { + ch_multiqc_files = ch_multiqc_files.mix(MALT_RUN.out.log.collect{it[1]}.ifEmpty([])) + ch_versions = ch_versions.mix(MALT_RUN.out.versions.first()) + } + // TODO MALT results overwriting per database? + // TODO Versions for Karken/MALT not report? MULTIQC ( ch_multiqc_files.collect() )
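// A minimal usage sketch (illustrative only, not part of the patch). Based on
// create_db_channels() in subworkflows/local/db_check.nf, the --databases sheet is read with
// splitCsv(header:true) and must expose at least the columns accessed there (tool, db_name,
// db_params, db_path); the file names, database names, and paths below are hypothetical:
//
//   tool,db_name,db_params,db_path
//   kraken2,k2_db,,/path/to/kraken2_db
//   malt,malt95,"-id 90",/path/to/malt_index
//
// With such a sheet, a run enabling the new preprocessing and profiling steps might look like:
//
//   nextflow run nf-core/taxprofiler -profile docker \
//     --input samplesheet.csv --databases databases.csv \
//     --fastp_clip_merge --run_kraken2 --run_malt --malt_mode BlastN
//
// The MALT db_params value "-id 90" mirrors the reads_plus_db dump example shown in the
// workflow above; db_params is passed through to the profiler via ext.args in conf/modules.config.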