From 7321daf4f6f141efae218a34b2b69c482d1dfeed Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Fri, 18 Feb 2022 08:41:34 +0100
Subject: [PATCH 01/10] Update README with approximate pipeline outline

---
 README.md | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8f33f2c..f57666d 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@

 ## Introduction

-**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for Taxonomic profiling of shotgun metagenomic data.
+**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling against multiple profiling tools and databases, and produces standardised output tables.

 The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!

@@ -29,7 +29,23 @@ On release, automated continuous integration tests run the pipeline on a full-si

 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
-2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
+2. Performs optional read pre-processing
+   - Adapter clipping and merging
+   - Low complexity filtering
+   - Host read removal
+   - Run merging
+3. Performs taxonomic profiling using a choice of:
+   - Kraken2
+   - MetaPhlAn3
+   - MALT
+   - DIAMOND
+   - Centrifuge
+   - Kaiju
+   - mOTUs
+4. Perform optional post-processing with:
+   - bracken
+5. Standardises output tables
+6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))

 ## Quick Start

From c68afbbf90c98e87716dc526d6c5a9631a341201 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Fri, 18 Feb 2022 08:45:06 +0100
Subject: [PATCH 02/10] Update nextflow.config

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index c2d51b5..186979c 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -121,7 +121,7 @@ if (!params.igenomes_ignore) {
 }

 // Export these variables to prevent local Python/R libraries from conflicting with those in the container
-// The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container.
+// The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container.
 // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable.
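 // For reference, in the standard nf-core template this `env` block typically pins (an
 // assumption from the template, not shown in this patch): PYTHONNOUSERSITE = 1,
 // R_PROFILE_USER = "/.Rprofile", R_ENVIRON_USER = "/.Renviron", and
 // JULIA_DEPOT_PATH = "/usr/local/share/julia".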
env { From f867c057a4e1f0c7f4fc23db44356ee120de261f Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 18 Feb 2022 11:53:13 +0100 Subject: [PATCH 03/10] add more columns to samplesheet --- bin/check_samplesheet.py | 138 +++++++++++++++++++++++++++++++++------ 1 file changed, 118 insertions(+), 20 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 41dd9aa..eb6a7dc 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -3,6 +3,7 @@ # TODO nf-core: Update the script to check the samplesheet # This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv +from distutils import extension import os import sys import errno @@ -10,7 +11,9 @@ import argparse def parse_args(args=None): - Description = "Reformat nf-core/taxprofiler samplesheet file and check its contents." + Description = ( + "Reformat nf-core/taxprofiler samplesheet file and check its contents." + ) Epilog = "Example usage: python check_samplesheet.py " parser = argparse.ArgumentParser(description=Description, epilog=Epilog) @@ -43,25 +46,62 @@ def check_samplesheet(file_in, file_out): """ This function checks that the samplesheet follows the following structure: - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, + sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta + 2611,ERR5766174,ILLUMINA,NA,NA,FA_EXTENSIONSERX5474930_ERR5766174_1.fa.gz + 2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,NA + 2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,NA,NA + 2613,ERR5766181,ILLUMINA,ERX5474930_ERR5766174_1.fa.gz,ERX5474930_ERR5766174_2.fa.gz,NA For an example see: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv """ + FQ_EXTENSIONS = (".fq", ".fq.gz", ".fastq", ".fastq.gz") + FA_EXTENSIONS = ( + ".fa", + ".fa.gz", + ".fasta", + ".fasta.gz", + ".fna", + ".fna.gz", + ".fas", + ".fas.gz", + ) + INSTRUMENT_PLATFORMS = [ + "ABI_SOLID", + "BGISEQ", + "CAPILLARY", + "COMPLETE_GENOMICS", + "DNBSEQ", + "HELICOS", + "ILLUMINA", + "ION_TORRENT", + "LS454", + "OXFORD_NANOPORE", + "PACBIO_SMRT", + ] + sample_mapping_dict = {} with open(file_in, "r") as fin: ## Check header - MIN_COLS = 2 + MIN_COLS = 4 # TODO nf-core: Update the column names for the input samplesheet - HEADER = ["sample", "fastq_1", "fastq_2"] + HEADER = [ + "sample", + "run_accession", + "instrument_platform", + "fastq_1", + "fastq_2", + "fasta", + ] header = [x.strip('"') for x in fin.readline().strip().split(",")] if header[: len(HEADER)] != HEADER: - print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER))) + print( + "ERROR: Please check samplesheet header -> {} != {}".format( + ",".join(header), ",".join(HEADER) + ) + ) sys.exit(1) ## Check sample entries @@ -78,13 +118,22 @@ def check_samplesheet(file_in, file_out): num_cols = len([x for x in lspl if x]) if num_cols < MIN_COLS: print_error( - "Invalid number of populated columns (minimum = {})!".format(MIN_COLS), + "Invalid number of populated columns (minimum = {})!".format( + MIN_COLS + ), "Line", line, ) ## Check sample name entries - sample, fastq_1, fastq_2 = lspl[: len(HEADER)] + ( + sample, + run_accession, + instrument_platform, + fastq_1, + fastq_2, + fasta, + ) = 
lspl[: len(HEADER)] sample = sample.replace(" ", "_") if not sample: print_error("Sample entry has not been specified!", "Line", line) @@ -94,23 +143,55 @@ def check_samplesheet(file_in, file_out): if fastq: if fastq.find(" ") != -1: print_error("FastQ file contains spaces!", "Line", line) - if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"): + if not fastq.endswith(FQ_EXTENSIONS): print_error( - "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", + f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !", "Line", line, ) + if fasta: + if fasta.find(" ") != -1: + print_error("FastA file contains spaces!", "Line", line) + if not fasta.endswith(FA_EXTENSIONS): + print_error( + f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!", + "Line", + line, + ) + sample_info = [] + + # Check run_accession + if not run_accession: + print_error("Run accession has not been specified!", "Line", line) + else: + sample_info.append(run_accession) + + # Check instrument_platform + if not instrument_platform: + print_error("Instrument platform has not been specified!", "Line", line) + else: + if instrument_platform not in INSTRUMENT_PLATFORMS: + print_error( + f"Instrument platform {instrument_platform} is not supported!", + f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}", + "Line", + line, + ) + sample_info.append(instrument_platform) ## Auto-detect paired-end/single-end - sample_info = [] ## [single_end, fastq_1, fastq_2] if sample and fastq_1 and fastq_2: ## Paired-end short reads - sample_info = ["0", fastq_1, fastq_2] + sample_info.extend(["0", fastq_1, fastq_2, fasta]) elif sample and fastq_1 and not fastq_2: ## Single-end short reads - sample_info = ["1", fastq_1, fastq_2] + sample_info.extend(["1", fastq_1, fastq_2, fasta]) + elif ( + sample and fasta and not fastq_1 and not fastq_2 + ): ## Single-end long reads + sample_info.extend(["1", fastq_1, fastq_2, fasta]) else: print_error("Invalid combination of columns provided!", "Line", line) - ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] } + ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 , fasta, run_accession, instrument_platform] } if sample not in sample_mapping_dict: sample_mapping_dict[sample] = [sample_info] else: @@ -120,19 +201,36 @@ def check_samplesheet(file_in, file_out): sample_mapping_dict[sample].append(sample_info) ## Write validated samplesheet with appropriate columns + HEADER_OUT = [ + "sample", + "run_accession", + "instrument_platform", + "single_end", + "fastq_1", + "fastq_2", + "fasta", + ] if len(sample_mapping_dict) > 0: out_dir = os.path.dirname(file_out) make_dir(out_dir) with open(file_out, "w") as fout: - fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n") + fout.write(",".join(HEADER_OUT) + "\n") for sample in sorted(sample_mapping_dict.keys()): ## Check that multiple runs of the same sample are of the same datatype - if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]): - print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample)) + if not all( + x[0] == sample_mapping_dict[sample][0][0] + for x in sample_mapping_dict[sample] + ): + print_error( + "Multiple runs of a sample must be of the same datatype!", + "Sample: {}".format(sample), + ) for idx, val in enumerate(sample_mapping_dict[sample]): - fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n") + 
fout.write( + ",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n" + ) else: print_error("No entries to process!", "Samplesheet: {}".format(file_in)) From 54a1a4fd459afa7e9987c72e5e1d4b7ac67683d0 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 18 Feb 2022 13:11:18 +0100 Subject: [PATCH 04/10] update samplesheet specs --- bin/check_samplesheet.py | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index eb6a7dc..77a5107 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -47,13 +47,10 @@ def check_samplesheet(file_in, file_out): This function checks that the samplesheet follows the following structure: sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta - 2611,ERR5766174,ILLUMINA,NA,NA,FA_EXTENSIONSERX5474930_ERR5766174_1.fa.gz - 2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,NA - 2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,NA,NA - 2613,ERR5766181,ILLUMINA,ERX5474930_ERR5766174_1.fa.gz,ERX5474930_ERR5766174_2.fa.gz,NA - - For an example see: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv + 2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz + 2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz, + 2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,, + 2613,ERR5766181,ILLUMINA,ERX5474937_ERR5766181_1.fastq.gz,ERX5474937_ERR5766181_2.fastq.gz, """ FQ_EXTENSIONS = (".fq", ".fq.gz", ".fastq", ".fastq.gz") @@ -216,21 +213,9 @@ def check_samplesheet(file_in, file_out): with open(file_out, "w") as fout: fout.write(",".join(HEADER_OUT) + "\n") for sample in sorted(sample_mapping_dict.keys()): - - ## Check that multiple runs of the same sample are of the same datatype - if not all( - x[0] == sample_mapping_dict[sample][0][0] - for x in sample_mapping_dict[sample] - ): - print_error( - "Multiple runs of a sample must be of the same datatype!", - "Sample: {}".format(sample), - ) - for idx, val in enumerate(sample_mapping_dict[sample]): - fout.write( - ",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n" - ) + fout.write(f"{sample},{','.join(val)}\n") + # fout.write(f",".join(["{}".format(sample)] + val) + "\n") else: print_error("No entries to process!", "Samplesheet: {}".format(file_in)) From a6cfa0a1ba3b9c0ad2263d8225bcbc05117f4bc3 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 18 Feb 2022 13:15:30 +0100 Subject: [PATCH 05/10] cleanup --- bin/check_samplesheet.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 77a5107..0f19523 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -1,8 +1,5 @@ #!/usr/bin/env python -# TODO nf-core: Update the script to check the samplesheet -# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - from distutils import extension import os import sys @@ -83,7 +80,6 @@ def check_samplesheet(file_in, file_out): ## Check header MIN_COLS = 4 - # TODO nf-core: Update the column names for the input samplesheet HEADER = [ "sample", "run_accession", @@ -188,7 +184,7 @@ def check_samplesheet(file_in, file_out): else: print_error("Invalid combination of columns provided!", "Line", line) - ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, 
fastq_2 , fasta, run_accession, instrument_platform] } + ## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, single_end, fastq_1, fastq_2 , fasta ] } if sample not in sample_mapping_dict: sample_mapping_dict[sample] = [sample_info] else: @@ -215,7 +211,6 @@ def check_samplesheet(file_in, file_out): for sample in sorted(sample_mapping_dict.keys()): for idx, val in enumerate(sample_mapping_dict[sample]): fout.write(f"{sample},{','.join(val)}\n") - # fout.write(f",".join(["{}".format(sample)] + val) + "\n") else: print_error("No entries to process!", "Samplesheet: {}".format(file_in)) From 1b893cb039fee4544cef4d716668ef62c8551d17 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 18 Feb 2022 13:27:10 +0100 Subject: [PATCH 06/10] add check for fastq with fasta --- bin/check_samplesheet.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 0f19523..d6e7123 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -181,6 +181,12 @@ def check_samplesheet(file_in, file_out): sample and fasta and not fastq_1 and not fastq_2 ): ## Single-end long reads sample_info.extend(["1", fastq_1, fastq_2, fasta]) + elif fasta and (fastq_1 or fastq_2): + print_error( + "FastQ and FastA files cannot be specified together in the same library!", + "Line", + line, + ) else: print_error("Invalid combination of columns provided!", "Line", line) From cf55cc592cf41868f8cd480f53116525ab08f960 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 18 Feb 2022 16:51:01 +0100 Subject: [PATCH 07/10] Get skeleton read processing to input for profiling --- README.md | 6 +- conf/modules.config | 39 +++++++++++ conf/test.config | 4 +- lib/WorkflowTaxprofiler.groovy | 9 +-- modules.json | 6 ++ modules/nf-core/modules/cat/fastq/main.nf | 51 ++++++++++++++ modules/nf-core/modules/cat/fastq/meta.yml | 39 +++++++++++ modules/nf-core/modules/fastp/main.nf | 75 ++++++++++++++++++++ modules/nf-core/modules/fastp/meta.yml | 68 ++++++++++++++++++ nextflow.config | 5 +- nextflow_schema.json | 9 --- subworkflows/local/input_check.nf | 43 ++++++++++-- workflows/taxprofiler.nf | 80 +++++++++++++++++++++- 13 files changed, 407 insertions(+), 27 deletions(-) create mode 100644 modules/nf-core/modules/cat/fastq/main.nf create mode 100644 modules/nf-core/modules/cat/fastq/meta.yml create mode 100644 modules/nf-core/modules/fastp/main.nf create mode 100644 modules/nf-core/modules/fastp/meta.yml diff --git a/README.md b/README.md index f57666d..5f00709 100644 --- a/README.md +++ b/README.md @@ -42,9 +42,9 @@ On release, automated continuous integration tests run the pipeline on a full-si - Centrifuge - Kaiju - mOTUs -4. Perform optional post-processing with: - - bracken -5. Standardises output tables +4. Perform optional post-processing with: + - bracken +5. Standardises output tables 6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) ## Quick Start diff --git a/conf/modules.config b/conf/modules.config index a0506a4..2d533e9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -28,8 +28,47 @@ process { withName: FASTQC { ext.args = '--quiet' + ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } + publishDir = [ + path: { "${params.outdir}/fastqc/raw" }, + mode: 'copy', + pattern: '*.html' + ] } + withName: FASTP { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + // TODO also include option to NOT merge + ext.args = [ + { ${meta.single_end} } == 0 ? "-m" : '', + params.fastp_exclude_unmerged ? 
'' : "--include_unmerged" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/fastp" }, + mode: 'copy', + pattern: '*.fastq.gz' + ] + } + + withName: FASTQC_POST { + ext.args = '--quiet' + ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } + publishDir = [ + path: { "${params.outdir}/fastqc/processed" }, + mode: 'copy', + pattern: '*.html' + ] + } + + withName: CAT_FASTQ { + publishDir = [ + path: { "${params.outdir}/prepared_sequences" }, + mode: 'copy', + pattern: '*.fastq.gz' + ] + } + + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/conf/test.config b/conf/test.config index 45f87af..5db566a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,8 +22,6 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - // Genome references - genome = 'R64-1-1' } diff --git a/lib/WorkflowTaxprofiler.groovy b/lib/WorkflowTaxprofiler.groovy index 53482ac..57a95e3 100755 --- a/lib/WorkflowTaxprofiler.groovy +++ b/lib/WorkflowTaxprofiler.groovy @@ -10,10 +10,11 @@ class WorkflowTaxprofiler { public static void initialise(params, log) { genomeExistsError(params, log) - if (!params.fasta) { - log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." - System.exit(1) - } + // TODO update as necessary + //if (!params.fasta) { + // log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." + // System.exit(1) + //} } // diff --git a/modules.json b/modules.json index 939b85f..6cf2b3e 100644 --- a/modules.json +++ b/modules.json @@ -3,9 +3,15 @@ "homePage": "https://github.com/nf-core/taxprofiler", "repos": { "nf-core/modules": { + "cat/fastq": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + }, "custom/dumpsoftwareversions": { "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" }, + "fastp": { + "git_sha": "d0a1cbb703a130c19f6796c3fce24fbe7dfce789" + }, "fastqc": { "git_sha": "9d0cad583b9a71a6509b754fdf589cbfbed08961" }, diff --git a/modules/nf-core/modules/cat/fastq/main.nf b/modules/nf-core/modules/cat/fastq/main.nf new file mode 100644 index 0000000..bf0877c --- /dev/null +++ b/modules/nf-core/modules/cat/fastq/main.nf @@ -0,0 +1,51 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' : + 'biocontainers/biocontainers:v1.2.0_cv1' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads.collect{ it.toString() } + if (meta.single_end) { + if (readList.size > 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } +} diff --git a/modules/nf-core/modules/cat/fastq/meta.yml b/modules/nf-core/modules/cat/fastq/meta.yml new file mode 100644 index 0000000..c836598 --- /dev/null +++ b/modules/nf-core/modules/cat/fastq/meta.yml @@ -0,0 +1,39 @@ +name: cat_fastq +description: Concatenates fastq files +keywords: + - fastq + - concatenate +tools: + - cat: + description: | + The cat utility reads files sequentially, writing them to the standard output. + documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: list + description: | + List of input FastQ files to be concatenated. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/fastp/main.nf b/modules/nf-core/modules/fastp/main.nf new file mode 100644 index 0000000..5c9e3b8 --- /dev/null +++ b/modules/nf-core/modules/fastp/main.nf @@ -0,0 +1,75 @@ +process FASTP { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? 'bioconda::fastp=0.23.2' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/fastp:0.23.2--h79da9fb_0' : + 'quay.io/biocontainers/fastp:0.23.2--h79da9fb_0' }" + + input: + tuple val(meta), path(reads) + val save_trimmed_fail + val save_merged + + output: + tuple val(meta), path('*.trim.fastq.gz') , optional:true, emit: reads + tuple val(meta), path('*.json') , emit: json + tuple val(meta), path('*.html') , emit: html + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail + tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + // Added soft-links to original fastqs for consistent naming in MultiQC + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + def fail_fastq = save_trimmed_fail ? "--failed_out ${prefix}.fail.fastq.gz" : '' + """ + [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz + fastp \\ + --in1 ${prefix}.fastq.gz \\ + --out1 ${prefix}.trim.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else { + def fail_fastq = save_trimmed_fail ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : '' + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + fastp \\ + --in1 ${prefix}_1.fastq.gz \\ + --in2 ${prefix}_2.fastq.gz \\ + --out1 ${prefix}_1.trim.fastq.gz \\ + --out2 ${prefix}_2.trim.fastq.gz \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $fail_fastq \\ + $merge_fastq \\ + --thread $task.cpus \\ + --detect_adapter_for_pe \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/modules/fastp/meta.yml b/modules/nf-core/modules/fastp/meta.yml new file mode 100644 index 0000000..3274e41 --- /dev/null +++ b/modules/nf-core/modules/fastp/meta.yml @@ -0,0 +1,68 @@ +name: fastp +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - quality control + - fastq +tools: + - fastp: + description: | + A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance. + documentation: https://github.com/OpenGene/fastp + doi: https://doi.org/10.1093/bioinformatics/bty560 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. 
+  - save_trimmed_fail:
+      type: boolean
+      description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz`
+  - save_merged:
+      type: boolean
+      description: Specify true to save all merged reads to a file ending in `*.merged.fastq.gz`
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: The trimmed/modified/unmerged fastq reads
+      pattern: "*trim.fastq.gz"
+  - json:
+      type: file
+      description: Results in JSON format
+      pattern: "*.json"
+  - html:
+      type: file
+      description: Results in HTML format
+      pattern: "*.html"
+  - log:
+      type: file
+      description: fastp log file
+      pattern: "*.log"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - reads_fail:
+      type: file
+      description: Reads that failed the preprocessing
+      pattern: "*fail.fastq.gz"
+  - reads_merged:
+      type: file
+      description: Reads that were successfully merged
+      pattern: "*.{merged.fastq.gz}"
+authors:
+  - "@drpatelh"
+  - "@kevinmenden"

diff --git a/nextflow.config b/nextflow.config
index 186979c..160dba7 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -33,7 +33,7 @@ params {
     help                       = false
     validate_params            = true
     show_hidden_params         = false
-    schema_ignore_params       = 'genomes'
+    schema_ignore_params       = 'genomes,fasta'
     enable_conda               = false

     // Config options
@@ -50,6 +50,9 @@ params {
     max_cpus                   = 16
     max_time                   = '240.h'

+    // FASTQ preprocessing
+    fastp_clip_merge           = false
+    fastp_exclude_unmerged     = true
 }

 // Load base.config by default for all pipelines

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 535e08d..cc43add 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -56,15 +56,6 @@
             "fa_icon": "fas fa-book",
             "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details."
         },
-        "fasta": {
-            "type": "string",
-            "format": "file-path",
-            "mimetype": "text/plain",
-            "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$",
-            "description": "Path to FASTA genome file.",
-            "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically.
Combine with `--save_reference` to save BWA index for future runs.", - "fa_icon": "far fa-file-code" - }, "igenomes_base": { "type": "string", "format": "directory-path", diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index cddcbb3..241a8a7 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -9,22 +9,38 @@ workflow INPUT_CHECK { samplesheet // file: /path/to/samplesheet.csv main: - SAMPLESHEET_CHECK ( samplesheet ) + parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) + .dump(tag: "split_csv_out") + .branch { + fasta: it['fasta'] != '' + fastq: true + } + + parsed_samplesheet.fastq .map { create_fastq_channels(it) } - .set { reads } + .dump(tag: "fastq_channel_init") + .set { fastq } + + parsed_samplesheet.fasta + .map { create_fasta_channels(it) } + .dump(tag: "fasta_channel_init") + .set { fasta } emit: - reads // channel: [ val(meta), [ reads ] ] + fastq // channel: [ val(meta), [ reads ] ] + fasta // channel: [ val(meta), fasta ] versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] } // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] def create_fastq_channels(LinkedHashMap row) { def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() + meta.id = row.sample + meta.run_accession = row.run_accession + meta.instrument_platform = row.instrument_platform + meta.single_end = row.single_end.toBoolean() def array = [] if (!file(row.fastq_1).exists()) { @@ -40,3 +56,20 @@ def create_fastq_channels(LinkedHashMap row) { } return array } + +// Function to get list of [ meta, fasta ] +def create_fasta_channels(LinkedHashMap row) { + def meta = [:] + meta.id = row.sample + meta.run_accession = row.run_accession + meta.instrument_platform = row.instrument_platform + meta.single_end = true + + def array = [] + if (!file(row.fasta).exists()) { + exit 1, "ERROR: Please check input samplesheet -> FastA file does not exist!\n${row.fasta}" + } + array = [ meta, [ file(row.fasta) ] ] + + return array +} diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 56c532b..3f10a9d 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -11,7 +11,7 @@ WorkflowTaxprofiler.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ] +def checkPathParamList = [ params.input, params.multiqc_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters @@ -50,6 +50,11 @@ include { FASTQC } from '../modules/nf-core/modules/fastqc/ include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' +include { FASTP as FASTP_SINGLE } from '../modules/nf-core/modules/fastp/main' +include { FASTP as FASTP_PAIRED } from '../modules/nf-core/modules/fastp/main' +include { FASTQC as FASTQC_POST } from '../modules/nf-core/modules/fastqc/main' +include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' + /* ======================================================================================== RUN MAIN WORKFLOW @@ -75,7 +80,7 @@ workflow TAXPROFILER { // MODULE: Run FastQC // FASTQC ( - INPUT_CHECK.out.reads + INPUT_CHECK.out.fastq ) ch_versions = 
ch_versions.mix(FASTQC.out.versions.first()) @@ -83,6 +88,71 @@ workflow TAXPROFILER { ch_versions.unique().collectFile(name: 'collated_versions.yml') ) + // + // MODULE: Run Clip/Merge/Complexity + // + // TODO give option to clip only and retain pairs + // TODO give option to retain singletons (probably fastp option likely) + // TODO move to subworkflow + if ( params.fastp_clip_merge ) { + + ch_input_for_fastp = INPUT_CHECK.out.fastq + .dump(tag: "pre-fastp_branch") + .branch{ + single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + ch_input_for_fastp.single.dump(tag: "input_fastp_single") + ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") + + FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + FASTP_PAIRED ( ch_input_for_fastp.paired, false, true ) + + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged + .mix( FASTP_SINGLE.out.reads ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + + FASTQC_POST ( ch_fastp_reads_prepped ) + + ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) + ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) + + ch_processed_reads = ch_fastp_reads_prepped + + } else { + ch_processed_reads = INPUT_CHECK.out.fastq + } + + + // MODULE: Cat merge runs of same sample + ch_processed_for_combine = ch_processed_reads + .dump(tag: "prep_for_combine_grouping") + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['run_accession'] = 'combined' + [ meta_new, reads ] + } + .groupTuple ( by: 0 ) + .branch{ + combine: it[1].size() >= 2 + skip: it[1].size() < 2 + } + + CAT_FASTQ ( ch_processed_for_combine.combine ) + + // Ready for profiling! + ch_reads_for_profiling = ch_processed_for_combine.skip + .dump(tag: "skip_combine") + .mix( CAT_FASTQ.out.reads ) + .dump(tag: "files_for_profiling") + // // MODULE: MultiQC // @@ -95,6 +165,12 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + if (params.fastp_clip_merge) { + ch_multiqc_files = ch_multiqc_files.mix(FASTP_SINGLE.out.json.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTP_PAIRED.out.json.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_POST.out.zip.collect{it[1]}.ifEmpty([])) + } + MULTIQC ( ch_multiqc_files.collect() From 278f5605ca831338384b5045e34f500f0b5ca1a2 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 19 Feb 2022 12:36:08 +0100 Subject: [PATCH 08/10] Added database preparation and final channel for profiling --- modules/local/database_check.nf | 25 ++++++++++ nextflow.config | 3 ++ subworkflows/local/db_check.nf | 40 ++++++++++++++++ subworkflows/local/input_check.nf | 2 +- subworkflows/local/preprocessing.nf | 73 +++++++++++++++++++++++++++++ workflows/taxprofiler.nf | 65 ++++++++----------------- 6 files changed, 161 insertions(+), 47 deletions(-) create mode 100644 modules/local/database_check.nf create mode 100644 subworkflows/local/db_check.nf create mode 100644 subworkflows/local/preprocessing.nf diff --git a/modules/local/database_check.nf b/modules/local/database_check.nf new file mode 100644 index 0000000..4da4313 --- /dev/null +++ b/modules/local/database_check.nf @@ -0,0 +1,25 @@ +process DATABASE_CHECK { + tag "$databasesheet" + + 
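+    // NOTE: currently only a minimal pass-through check - the script below simply copies the
+    // sheet to `database_sheet.valid.csv`; proper per-column validation of the database sheet
+    // is still TODO (see the DB_CHECK subworkflow added in this patch).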
conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/python:3.8.3' :
+        'quay.io/biocontainers/python:3.8.3' }"
+
+    input:
+    path databasesheet
+
+    output:
+    path '*.csv'       , emit: csv
+    path "versions.yml", emit: versions
+
+    script: // This script is bundled with the pipeline, in nf-core/taxprofiler/bin/
+    """
+    cat $databasesheet >> database_sheet.valid.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+}

diff --git a/nextflow.config b/nextflow.config
index 160dba7..60b5a42 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -50,6 +50,9 @@ params {
     max_cpus                   = 16
     max_time                   = '240.h'

+    // Databases
+    databases                  = null
+
     // FASTQ preprocessing
     fastp_clip_merge           = false
     fastp_exclude_unmerged     = true

diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
new file mode 100644
index 0000000..909d98f
--- /dev/null
+++ b/subworkflows/local/db_check.nf
@@ -0,0 +1,40 @@
+//
+// Check input database sheet and get database channels
+//
+
+include { DATABASE_CHECK } from '../../modules/local/database_check'
+
+workflow DB_CHECK {
+    take:
+    dbsheet // file: /path/to/dbsheet.csv
+
+    main:
+
+    // TODO: make database sheet check
+    DATABASE_CHECK ( dbsheet )
+        .csv
+        .splitCsv ( header:true, sep:',' )
+        .dump(tag: "db_split_csv_out")
+        .map { create_db_channels(it) }
+        .dump(tag: "db_channel_prepped")
+        .set{ dbs }
+
+    emit:
+    dbs                                    // channel: [ val(meta), [ db ] ]
+    versions = DATABASE_CHECK.out.versions // channel: [ versions.yml ]
+}
+
+def create_db_channels(LinkedHashMap row) {
+    def meta = [:]
+    meta.tool      = row.tool
+    meta.db_name   = row.db_name
+    meta.db_params = row.db_params
+
+    def array = []
+    if (!file(row.db_path, type: 'dir').exists()) {
+        exit 1, "ERROR: Please check database sheet -> database could not be found!\n${row.db_path}"
+    }
+    array = [ meta, file(row.db_path) ]
+
+    return array
+}

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 241a8a7..8497faa 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -12,7 +12,7 @@ workflow INPUT_CHECK {
     parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet )
         .csv
         .splitCsv ( header:true, sep:',' )
-        .dump(tag: "split_csv_out")
+        .dump(tag: "input_split_csv_out")
         .branch {
             fasta: it['fasta'] != ''
             fastq: true

diff --git a/subworkflows/local/preprocessing.nf b/subworkflows/local/preprocessing.nf
new file mode 100644
index 0000000..5832824
--- /dev/null
+++ b/subworkflows/local/preprocessing.nf
@@ -0,0 +1,73 @@
+//
+// Perform optional read preprocessing (adapter clipping and merging)
+//
+
+include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main'
+include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main'
+include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'
+
+workflow FASTQ_PREPROCESSING {
+    take:
+    reads // channel: [ val(meta), [ reads ] ]
+
+    main:
+    ch_versions      = Channel.empty()
+    ch_multiqc_files = Channel.empty()
+
+    //
+    // STEP: Read clipping and merging
+    //
+    // TODO give option to clip only and retain pairs
+    // TODO give option to retain singletons (probably a fastp option)
+
+    if ( params.fastp_clip_merge ) {
+
+        ch_input_for_fastp = reads
+            .dump(tag: "pre-fastp_branch")
+            .branch{
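+                // Split by the meta.single_end flag so single-end and paired-end
+                // libraries go to separately-configured fastp invocations
+                // (read merging only makes sense for paired-end data).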
single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + ch_input_for_fastp.single.dump(tag: "input_fastp_single") + ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") + + FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + FASTP_PAIRED ( ch_input_for_fastp.paired, false, true ) + + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged + .mix( FASTP_SINGLE.out.reads ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + + FASTQC_POST ( ch_fastp_reads_prepped ) + + ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) + ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) + + ch_processed_reads = ch_fastp_reads_prepped + + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) + + ch_multiqc_files.dump(tag: "preprocessing_mqc_final") + + } else { + ch_processed_reads = reads + } + + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 3f10a9d..4a356a5 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -11,11 +11,12 @@ WorkflowTaxprofiler.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config ] +def checkPathParamList = [ params.input, params.databases, params.multiqc_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters -if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } /* ======================================================================================== @@ -35,7 +36,11 @@ ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath(params.multi // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { INPUT_CHECK } from '../subworkflows/local/input_check' + +include { DB_CHECK } from '../subworkflows/local/db_check' +include { FASTQ_PREPROCESSING } from '../subworkflows/local/preprocessing' + /* ======================================================================================== @@ -50,9 +55,6 @@ include { FASTQC } from '../modules/nf-core/modules/fastqc/ include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' -include { FASTP as FASTP_SINGLE } from '../modules/nf-core/modules/fastp/main' -include { FASTP as FASTP_PAIRED } from '../modules/nf-core/modules/fastp/main' -include { FASTQC as FASTQC_POST } from '../modules/nf-core/modules/fastqc/main' include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' /* @@ -76,6 +78,10 @@ workflow TAXPROFILER { ) ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) + DB_CHECK ( + ch_databases + ) + // // MODULE: Run FastQC // @@ -91,47 +97,12 @@ workflow TAXPROFILER { // // MODULE: Run Clip/Merge/Complexity // - // TODO give option to clip only and retain pairs - // TODO give option to retain singletons (probably fastp option likely) - // TODO move to subworkflow if ( params.fastp_clip_merge ) { - - ch_input_for_fastp = INPUT_CHECK.out.fastq - .dump(tag: "pre-fastp_branch") - .branch{ - single: it[0]['single_end'] == true - paired: it[0]['single_end'] == false - } - - ch_input_for_fastp.single.dump(tag: "input_fastp_single") - ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") - - FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) - FASTP_PAIRED ( ch_input_for_fastp.paired, false, true ) - - ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged - .mix( FASTP_SINGLE.out.reads ) - .map { - meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = 1 - [ meta_new, reads ] - } - - FASTQC_POST ( ch_fastp_reads_prepped ) - - ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) - ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) - - ch_processed_reads = ch_fastp_reads_prepped - - } else { - ch_processed_reads = INPUT_CHECK.out.fastq + FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq ) } - // MODULE: Cat merge runs of same sample - ch_processed_for_combine = ch_processed_reads + ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads .dump(tag: "prep_for_combine_grouping") .map { meta, reads -> @@ -153,6 +124,10 @@ workflow TAXPROFILER { .mix( CAT_FASTQ.out.reads ) .dump(tag: "files_for_profiling") + // Combine reads with possible databases + + ch_reads_for_profiling.combine(DB_CHECK.out.dbs).dump(tag: "reads_plus_db") + // // MODULE: MultiQC // @@ -166,9 +141,7 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) if (params.fastp_clip_merge) { - ch_multiqc_files = ch_multiqc_files.mix(FASTP_SINGLE.out.json.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTP_PAIRED.out.json.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC_POST.out.zip.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc) } From 2c183ed2edc61ab058580cb63441917d516eb564 Mon Sep 17 00:00:00 2001 
From: James Fellows Yates
Date: Thu, 3 Mar 2022 17:42:02 +0100
Subject: [PATCH 09/10] Add Kraken2 and MALT/run as Proof of Concept (currently
 MQC issue)

---
 conf/modules.config                           | 20 ++++++
 modules.json                                  |  6 ++
 .../nf-core/modules/kraken2/kraken2/main.nf   | 49 +++++++++++++
 .../nf-core/modules/kraken2/kraken2/meta.yml  | 60 ++++++++++++++++
 modules/nf-core/modules/malt/run/main.nf      | 49 +++++++++++++
 modules/nf-core/modules/malt/run/meta.yml     | 58 ++++++++++++++++
 nextflow.config                               | 11 ++-
 workflows/taxprofiler.nf                      | 69 +++++++++++++++++--
 8 files changed, 315 insertions(+), 7 deletions(-)
 create mode 100644 modules/nf-core/modules/kraken2/kraken2/main.nf
 create mode 100644 modules/nf-core/modules/kraken2/kraken2/meta.yml
 create mode 100644 modules/nf-core/modules/malt/run/main.nf
 create mode 100644 modules/nf-core/modules/malt/run/meta.yml

diff --git a/conf/modules.config b/conf/modules.config
index 2d533e9..dbc926c 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -68,6 +68,26 @@ process {
         ]
     }

+    withName: MALT_RUN {
+        publishDir = [
+            path: { "${params.outdir}/malt/${meta.db_name}" },
+            mode: 'copy',
+            pattern: '*.{rma6,tab,text,sam,log}'
+        ]
+        ext.args = { "${meta.db_params}" }
+        ext.when = params.run_malt
+    }
+
+    withName: KRAKEN2_KRAKEN2 {
+        publishDir = [
+            path: { "${params.outdir}/kraken2/${meta.db_name}" },
+            mode: 'copy',
+            pattern: '*.{fastq.gz,txt}'
+        ]
+        ext.args = { "${meta.db_params}" }
+        ext.when = params.run_kraken2
+        ext.prefix = { "${meta.id}-${meta.db_name}" }
+    }

     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
         publishDir = [

diff --git a/modules.json b/modules.json
index 6cf2b3e..844b33a 100644
--- a/modules.json
+++ b/modules.json
@@ -15,6 +15,12 @@
         "fastqc": {
             "git_sha": "9d0cad583b9a71a6509b754fdf589cbfbed08961"
         },
+        "kraken2/kraken2": {
+            "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
+        },
+        "malt/run": {
+            "git_sha": "76cdd46f3f8a77fb5023fb5a39c4ab99925b8b56"
+        },
         "multiqc": {
             "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41"
         }

diff --git a/modules/nf-core/modules/kraken2/kraken2/main.nf b/modules/nf-core/modules/kraken2/kraken2/main.nf
new file mode 100644
index 0000000..3ec5df5
--- /dev/null
+++ b/modules/nf-core/modules/kraken2/kraken2/main.nf
@@ -0,0 +1,49 @@
+process KRAKEN2_KRAKEN2 {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda (params.enable_conda ? 'bioconda::kraken2=2.1.2 conda-forge::pigz=2.6' : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' :
+        'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }"
+
+    input:
+    tuple val(meta), path(reads)
+    path db
+
+    output:
+    tuple val(meta), path('*classified*')  , emit: classified
+    tuple val(meta), path('*unclassified*'), emit: unclassified
+    tuple val(meta), path('*report.txt')   , emit: txt
+    path "versions.yml"                    , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def paired       = meta.single_end ? "" : "--paired"
+    def classified   = meta.single_end ? "${prefix}.classified.fastq"   : "${prefix}.classified#.fastq"
+    def unclassified = meta.single_end ?
"${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq" + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --unclassified-out $unclassified \\ + --classified-out $classified \\ + --report ${prefix}.kraken2.report.txt \\ + --gzip-compressed \\ + $paired \\ + $args \\ + $reads + + pigz -p $task.cpus *.fastq + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/kraken2/kraken2/meta.yml b/modules/nf-core/modules/kraken2/kraken2/meta.yml new file mode 100644 index 0000000..9d6a385 --- /dev/null +++ b/modules/nf-core/modules/kraken2/kraken2/meta.yml @@ -0,0 +1,60 @@ +name: kraken2_kraken2 +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - kraken2: + description: | + Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads + homepage: https://ccb.jhu.edu/software/kraken2/ + documentation: https://github.com/DerrickWood/kraken2/wiki/Manual + doi: 10.1186/s13059-019-1891-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Kraken2 database +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified: + type: file + description: | + Reads classified to belong to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - unclassified: + type: file + description: | + Reads not classified to belong to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - txt: + type: file + description: | + Kraken2 report containing stats about classified + and not classifed reads. + pattern: "*.{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/malt/run/main.nf b/modules/nf-core/modules/malt/run/main.nf new file mode 100644 index 0000000..61c02ec --- /dev/null +++ b/modules/nf-core/modules/malt/run/main.nf @@ -0,0 +1,49 @@ +process MALT_RUN { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::malt=0.53" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/malt:0.53--hdfd78af_0' : + 'quay.io/biocontainers/malt:0.53--hdfd78af_0' }" + + input: + tuple val(meta), path(fastqs) + val mode + path index + + output: + tuple val(meta), path("*.rma6") , emit: rma6 + tuple val(meta), path("*.{tab,text,sam}"), optional:true, emit: alignments + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def avail_mem = 6 + if (!task.memory) { + log.info '[MALT_RUN] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.' + } else { + avail_mem = task.memory.giga + } + + """ + malt-run \\ + -J-Xmx${avail_mem}g \\ + -t $task.cpus \\ + -v \\ + -o . 
\\ + $args \\ + --inFile ${fastqs.join(' ')} \\ + -m $mode \\ + --index $index/ |&tee malt-run.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + malt: \$(malt-run --help 2>&1 | grep -o 'version.* ' | cut -f 1 -d ',' | cut -f2 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/malt/run/meta.yml b/modules/nf-core/modules/malt/run/meta.yml new file mode 100644 index 0000000..ae4277a --- /dev/null +++ b/modules/nf-core/modules/malt/run/meta.yml @@ -0,0 +1,58 @@ +name: malt_run +description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics. +keywords: + - malt + - alignment + - metagenomics + - ancient DNA + - aDNA + - palaeogenomics + - archaeogenomics + - microbiome +tools: + - malt: + description: A tool for mapping metagenomic data + homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/ + documentation: https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf + tool_dev_url: None + doi: "10.1038/s41559-017-0446-6" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastqs: + type: file + description: Input FASTQ files + pattern: "*.{fastq.gz,fq.gz}" + - mode: + type: string + description: Program mode + pattern: "Unknown|BlastN|BlastP|BlastX|Classifier" + - index: + type: directory + description: Index/database directory from malt-build + pattern: "*/" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - rma6: + type: file + description: MEGAN6 RMA6 file + pattern: "*.rma6" + - sam: + type: file + description: Alignment files in Tab, Text or MEGAN-compatible SAM format + pattern: "*.{tab,txt,sam}" + - log: + type: file + description: Log of verbose MALT stdout + pattern: "malt-run.log" + +authors: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 60b5a42..7991bdf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,8 +54,15 @@ params { databases = null // FASTQ preprocessing - fastp_clip_merge = false - fastp_exclude_unmerged = true + fastp_clip_merge = false + fastp_exclude_unmerged = true + + // MALT + run_malt = false + malt_mode = 'BlastN' + + // kraken2 + run_kraken2 = false } // Load base.config by default for all pipelines diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 4a356a5..bd25563 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -56,6 +56,9 @@ include { MULTIQC } from '../modules/nf-core/modules/multiqc include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' +include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' +include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' + /* ======================================================================================== @@ -95,13 +98,15 @@ workflow TAXPROFILER { ) // - // MODULE: Run Clip/Merge/Complexity + // PERFORM PREPROCESSING // if ( params.fastp_clip_merge ) { FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq ) } - // MODULE: Cat merge runs of same sample + // + // PERFORM RUN MERGING + // ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads .dump(tag: "prep_for_combine_grouping") .map { @@ -118,15 +123,61 @@ 
workflow TAXPROFILER { CAT_FASTQ ( ch_processed_for_combine.combine ) - // Ready for profiling! ch_reads_for_profiling = ch_processed_for_combine.skip .dump(tag: "skip_combine") .mix( CAT_FASTQ.out.reads ) .dump(tag: "files_for_profiling") - // Combine reads with possible databases + // + // COMBINE READS WITH POSSIBLE DATABASES + // + + // output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] + ch_input_for_profiling = ch_reads_for_profiling + .combine(DB_CHECK.out.dbs) + .dump(tag: "reads_plus_db") + .branch { + malt: it[2]['tool'] == 'malt' + kraken2: it[2]['tool'] == 'kraken2' + unknown: true + } + + // + // PREP PROFILER INPUT CHANNELS ON PER TOOL BASIS + // + + // We groupTuple to have all samples in one channel for MALT as database + // loading takes a long time, so we only want to run it once per database + ch_input_for_malt = ch_input_for_profiling.malt + .map { + it -> + def temp_meta = [ id: it[2]['db_name']] + it[2] + def db = it[3] + [ temp_meta, it[1], db ] + } + .groupTuple(by: [0,2]) + .dump(tag: "input for malt") + .multiMap { + it -> + reads: [ it[0], it[1].flatten() ] + db: it[2] + } + + // We can run Kraken2 one-by-one sample-wise + ch_input_for_kraken2 = ch_input_for_profiling.kraken2 + .dump(tag: "input for kraken") + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + // + // RUN PROFILING + // + MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) - ch_reads_for_profiling.combine(DB_CHECK.out.dbs).dump(tag: "reads_plus_db") // // MODULE: MultiQC @@ -143,6 +194,14 @@ workflow TAXPROFILER { if (params.fastp_clip_merge) { ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc) } + if (params.run_kraken2) { + ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])) + ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first()) + } + if (params.run_malt) { + ch_multiqc_files = ch_multiqc_files.mix(MALT_RUN.out.log.collect{it[1]}.ifEmpty([])) + ch_versions = ch_versions.mix(MALT_RUN.out.versions.first()) + } MULTIQC ( From e39c6a8ccb337b5889271df4f1e57bcbd3dd2c74 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 3 Mar 2022 18:04:03 +0100 Subject: [PATCH 10/10] Fix MALT multiqc report clash --- modules.json | 2 +- modules/nf-core/modules/malt/run/main.nf | 3 ++- modules/nf-core/modules/malt/run/meta.yml | 2 +- workflows/taxprofiler.nf | 3 ++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/modules.json b/modules.json index 844b33a..6a785b8 100644 --- a/modules.json +++ b/modules.json @@ -19,7 +19,7 @@ "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, "malt/run": { - "git_sha": "76cdd46f3f8a77fb5023fb5a39c4ab99925b8b56" + "git_sha": "72b96f4e504eef673f2b5c13560a9d90b669129b" }, "multiqc": { "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" diff --git a/modules/nf-core/modules/malt/run/main.nf b/modules/nf-core/modules/malt/run/main.nf index 61c02ec..4e2e50c 100644 --- a/modules/nf-core/modules/malt/run/main.nf +++ b/modules/nf-core/modules/malt/run/main.nf @@ -23,6 +23,7 @@ process MALT_RUN { script: def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" def avail_mem = 6 if (!task.memory) { log.info '[MALT_RUN] Available memory not known - defaulting to 6GB. 
Specify process memory requirements to change this.'
+    } else {
+        avail_mem = task.memory.giga
+    }
@@ -39,7 +40,7 @@ process MALT_RUN {
         $args \\
         --inFile ${fastqs.join(' ')} \\
         -m $mode \\
-        --index $index/ |&tee malt-run.log
+        --index $index/ |&tee ${prefix}-malt-run.log

     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/nf-core/modules/malt/run/meta.yml b/modules/nf-core/modules/malt/run/meta.yml
index ae4277a..66f2d7a 100644
--- a/modules/nf-core/modules/malt/run/meta.yml
+++ b/modules/nf-core/modules/malt/run/meta.yml
@@ -52,7 +52,7 @@ output:
   - log:
       type: file
       description: Log of verbose MALT stdout
-      pattern: "malt-run.log"
+      pattern: "*-malt-run.log"

 authors:
   - "@jfy133"

diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index bd25563..bf3e6ee 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -203,7 +203,8 @@ workflow TAXPROFILER {
         ch_versions = ch_versions.mix(MALT_RUN.out.versions.first())
     }

-
+    // TODO MALT results overwritten per database?
+    // TODO Versions for Kraken/MALT not reported?
     MULTIQC (
         ch_multiqc_files.collect()
     )
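Taken together, the end state of this series is a pipeline driven by two CSV sheets plus per-tool switches. A minimal usage sketch follows; it is illustrative only: the file paths are hypothetical, and the database sheet's header order is inferred from create_db_channels() (tool, db_name, db_params, db_path), since no example database sheet appears in the patches.

samplesheet.csv (columns validated by bin/check_samplesheet.py):

    sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
    2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,
    2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz

databases.csv (assumed layout; db_params is passed through to each profiler via ext.args):

    tool,db_name,db_params,db_path
    kraken2,k2_db,,/path/to/kraken2/db
    malt,malt95,-id 90,/path/to/malt/index

Example invocation using the parameters introduced above (--databases, --fastp_clip_merge, --run_kraken2, --run_malt):

    nextflow run nf-core/taxprofiler -profile docker \
        --input samplesheet.csv \
        --databases databases.csv \
        --fastp_clip_merge \
        --run_kraken2 --run_malt \
        --outdir ./results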