From 7321daf4f6f141efae218a34b2b69c482d1dfeed Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Fri, 18 Feb 2022 08:41:34 +0100
Subject: [PATCH 001/214] Update README with approximate pipeline outline

---
 README.md | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8f33f2c..f57666d 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@

 ## Introduction

-**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for Taxonomic profiling of shotgun metagenomic data.
+**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling with multiple profiling tools against multiple databases, and produces standardised output tables.

 The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!

@@ -29,7 +29,23 @@ On release, automated continuous integration tests run the pipeline on a full-si

 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
-2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
+2. Performs optional read pre-processing
+   - Adapter clipping and merging
+   - Low complexity filtering
+   - Host read removal
+   - Run merging
+3. Performs taxonomic profiling using a choice of:
+   - Kraken2
+   - MetaPhlAn3
+   - MALT
+   - DIAMOND
+   - Centrifuge
+   - Kaiju
+   - mOTUs
+4. Perform optional post-processing with:
+   - bracken
+5. Standardises output tables
+6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))

 ## Quick Start

From c68afbbf90c98e87716dc526d6c5a9631a341201 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Fri, 18 Feb 2022 08:45:06 +0100
Subject: [PATCH 002/214] Update nextflow.config

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index c2d51b5..186979c 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -121,7 +121,7 @@ if (!params.igenomes_ignore) {
 }

 // Export these variables to prevent local Python/R libraries from conflicting with those in the container
-// The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container.
+// The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container.
 // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable.
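// Note: values defined in the env scope below are exported into the environment of
// every task the pipeline executes; for example, per the comment above, the Julia
// depot is pinned as: JULIA_DEPOT_PATH = "/usr/local/share/julia"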
env { From f867c057a4e1f0c7f4fc23db44356ee120de261f Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 18 Feb 2022 11:53:13 +0100 Subject: [PATCH 003/214] add more columns to samplesheet --- bin/check_samplesheet.py | 138 +++++++++++++++++++++++++++++++++------ 1 file changed, 118 insertions(+), 20 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 41dd9aa..eb6a7dc 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -3,6 +3,7 @@ # TODO nf-core: Update the script to check the samplesheet # This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv +from distutils import extension import os import sys import errno @@ -10,7 +11,9 @@ import argparse def parse_args(args=None): - Description = "Reformat nf-core/taxprofiler samplesheet file and check its contents." + Description = ( + "Reformat nf-core/taxprofiler samplesheet file and check its contents." + ) Epilog = "Example usage: python check_samplesheet.py " parser = argparse.ArgumentParser(description=Description, epilog=Epilog) @@ -43,25 +46,62 @@ def check_samplesheet(file_in, file_out): """ This function checks that the samplesheet follows the following structure: - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, + sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta + 2611,ERR5766174,ILLUMINA,NA,NA,FA_EXTENSIONSERX5474930_ERR5766174_1.fa.gz + 2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,NA + 2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,NA,NA + 2613,ERR5766181,ILLUMINA,ERX5474930_ERR5766174_1.fa.gz,ERX5474930_ERR5766174_2.fa.gz,NA For an example see: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv """ + FQ_EXTENSIONS = (".fq", ".fq.gz", ".fastq", ".fastq.gz") + FA_EXTENSIONS = ( + ".fa", + ".fa.gz", + ".fasta", + ".fasta.gz", + ".fna", + ".fna.gz", + ".fas", + ".fas.gz", + ) + INSTRUMENT_PLATFORMS = [ + "ABI_SOLID", + "BGISEQ", + "CAPILLARY", + "COMPLETE_GENOMICS", + "DNBSEQ", + "HELICOS", + "ILLUMINA", + "ION_TORRENT", + "LS454", + "OXFORD_NANOPORE", + "PACBIO_SMRT", + ] + sample_mapping_dict = {} with open(file_in, "r") as fin: ## Check header - MIN_COLS = 2 + MIN_COLS = 4 # TODO nf-core: Update the column names for the input samplesheet - HEADER = ["sample", "fastq_1", "fastq_2"] + HEADER = [ + "sample", + "run_accession", + "instrument_platform", + "fastq_1", + "fastq_2", + "fasta", + ] header = [x.strip('"') for x in fin.readline().strip().split(",")] if header[: len(HEADER)] != HEADER: - print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER))) + print( + "ERROR: Please check samplesheet header -> {} != {}".format( + ",".join(header), ",".join(HEADER) + ) + ) sys.exit(1) ## Check sample entries @@ -78,13 +118,22 @@ def check_samplesheet(file_in, file_out): num_cols = len([x for x in lspl if x]) if num_cols < MIN_COLS: print_error( - "Invalid number of populated columns (minimum = {})!".format(MIN_COLS), + "Invalid number of populated columns (minimum = {})!".format( + MIN_COLS + ), "Line", line, ) ## Check sample name entries - sample, fastq_1, fastq_2 = lspl[: len(HEADER)] + ( + sample, + run_accession, + instrument_platform, + fastq_1, + fastq_2, + fasta, + ) = 
lspl[: len(HEADER)] sample = sample.replace(" ", "_") if not sample: print_error("Sample entry has not been specified!", "Line", line) @@ -94,23 +143,55 @@ def check_samplesheet(file_in, file_out): if fastq: if fastq.find(" ") != -1: print_error("FastQ file contains spaces!", "Line", line) - if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"): + if not fastq.endswith(FQ_EXTENSIONS): print_error( - "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", + f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !", "Line", line, ) + if fasta: + if fasta.find(" ") != -1: + print_error("FastA file contains spaces!", "Line", line) + if not fasta.endswith(FA_EXTENSIONS): + print_error( + f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!", + "Line", + line, + ) + sample_info = [] + + # Check run_accession + if not run_accession: + print_error("Run accession has not been specified!", "Line", line) + else: + sample_info.append(run_accession) + + # Check instrument_platform + if not instrument_platform: + print_error("Instrument platform has not been specified!", "Line", line) + else: + if instrument_platform not in INSTRUMENT_PLATFORMS: + print_error( + f"Instrument platform {instrument_platform} is not supported!", + f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}", + "Line", + line, + ) + sample_info.append(instrument_platform) ## Auto-detect paired-end/single-end - sample_info = [] ## [single_end, fastq_1, fastq_2] if sample and fastq_1 and fastq_2: ## Paired-end short reads - sample_info = ["0", fastq_1, fastq_2] + sample_info.extend(["0", fastq_1, fastq_2, fasta]) elif sample and fastq_1 and not fastq_2: ## Single-end short reads - sample_info = ["1", fastq_1, fastq_2] + sample_info.extend(["1", fastq_1, fastq_2, fasta]) + elif ( + sample and fasta and not fastq_1 and not fastq_2 + ): ## Single-end long reads + sample_info.extend(["1", fastq_1, fastq_2, fasta]) else: print_error("Invalid combination of columns provided!", "Line", line) - ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] } + ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 , fasta, run_accession, instrument_platform] } if sample not in sample_mapping_dict: sample_mapping_dict[sample] = [sample_info] else: @@ -120,19 +201,36 @@ def check_samplesheet(file_in, file_out): sample_mapping_dict[sample].append(sample_info) ## Write validated samplesheet with appropriate columns + HEADER_OUT = [ + "sample", + "run_accession", + "instrument_platform", + "single_end", + "fastq_1", + "fastq_2", + "fasta", + ] if len(sample_mapping_dict) > 0: out_dir = os.path.dirname(file_out) make_dir(out_dir) with open(file_out, "w") as fout: - fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n") + fout.write(",".join(HEADER_OUT) + "\n") for sample in sorted(sample_mapping_dict.keys()): ## Check that multiple runs of the same sample are of the same datatype - if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]): - print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample)) + if not all( + x[0] == sample_mapping_dict[sample][0][0] + for x in sample_mapping_dict[sample] + ): + print_error( + "Multiple runs of a sample must be of the same datatype!", + "Sample: {}".format(sample), + ) for idx, val in enumerate(sample_mapping_dict[sample]): - fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n") + 
fout.write( + ",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n" + ) else: print_error("No entries to process!", "Samplesheet: {}".format(file_in)) From 54a1a4fd459afa7e9987c72e5e1d4b7ac67683d0 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 18 Feb 2022 13:11:18 +0100 Subject: [PATCH 004/214] update samplesheet specs --- bin/check_samplesheet.py | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index eb6a7dc..77a5107 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -47,13 +47,10 @@ def check_samplesheet(file_in, file_out): This function checks that the samplesheet follows the following structure: sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta - 2611,ERR5766174,ILLUMINA,NA,NA,FA_EXTENSIONSERX5474930_ERR5766174_1.fa.gz - 2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,NA - 2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,NA,NA - 2613,ERR5766181,ILLUMINA,ERX5474930_ERR5766174_1.fa.gz,ERX5474930_ERR5766174_2.fa.gz,NA - - For an example see: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv + 2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz + 2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz, + 2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,, + 2613,ERR5766181,ILLUMINA,ERX5474937_ERR5766181_1.fastq.gz,ERX5474937_ERR5766181_2.fastq.gz, """ FQ_EXTENSIONS = (".fq", ".fq.gz", ".fastq", ".fastq.gz") @@ -216,21 +213,9 @@ def check_samplesheet(file_in, file_out): with open(file_out, "w") as fout: fout.write(",".join(HEADER_OUT) + "\n") for sample in sorted(sample_mapping_dict.keys()): - - ## Check that multiple runs of the same sample are of the same datatype - if not all( - x[0] == sample_mapping_dict[sample][0][0] - for x in sample_mapping_dict[sample] - ): - print_error( - "Multiple runs of a sample must be of the same datatype!", - "Sample: {}".format(sample), - ) - for idx, val in enumerate(sample_mapping_dict[sample]): - fout.write( - ",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n" - ) + fout.write(f"{sample},{','.join(val)}\n") + # fout.write(f",".join(["{}".format(sample)] + val) + "\n") else: print_error("No entries to process!", "Samplesheet: {}".format(file_in)) From a6cfa0a1ba3b9c0ad2263d8225bcbc05117f4bc3 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 18 Feb 2022 13:15:30 +0100 Subject: [PATCH 005/214] cleanup --- bin/check_samplesheet.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 77a5107..0f19523 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -1,8 +1,5 @@ #!/usr/bin/env python -# TODO nf-core: Update the script to check the samplesheet -# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - from distutils import extension import os import sys @@ -83,7 +80,6 @@ def check_samplesheet(file_in, file_out): ## Check header MIN_COLS = 4 - # TODO nf-core: Update the column names for the input samplesheet HEADER = [ "sample", "run_accession", @@ -188,7 +184,7 @@ def check_samplesheet(file_in, file_out): else: print_error("Invalid combination of columns provided!", "Line", line) - ## Create sample mapping dictionary = { sample: [ single_end, 
fastq_1, fastq_2 , fasta, run_accession, instrument_platform] } + ## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, single_end, fastq_1, fastq_2 , fasta ] } if sample not in sample_mapping_dict: sample_mapping_dict[sample] = [sample_info] else: @@ -215,7 +211,6 @@ def check_samplesheet(file_in, file_out): for sample in sorted(sample_mapping_dict.keys()): for idx, val in enumerate(sample_mapping_dict[sample]): fout.write(f"{sample},{','.join(val)}\n") - # fout.write(f",".join(["{}".format(sample)] + val) + "\n") else: print_error("No entries to process!", "Samplesheet: {}".format(file_in)) From 1b893cb039fee4544cef4d716668ef62c8551d17 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 18 Feb 2022 13:27:10 +0100 Subject: [PATCH 006/214] add check for fastq with fasta --- bin/check_samplesheet.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 0f19523..d6e7123 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -181,6 +181,12 @@ def check_samplesheet(file_in, file_out): sample and fasta and not fastq_1 and not fastq_2 ): ## Single-end long reads sample_info.extend(["1", fastq_1, fastq_2, fasta]) + elif fasta and (fastq_1 or fastq_2): + print_error( + "FastQ and FastA files cannot be specified together in the same library!", + "Line", + line, + ) else: print_error("Invalid combination of columns provided!", "Line", line) From cf55cc592cf41868f8cd480f53116525ab08f960 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 18 Feb 2022 16:51:01 +0100 Subject: [PATCH 007/214] Get skeleton read processing to input for profiling --- README.md | 6 +- conf/modules.config | 39 +++++++++++ conf/test.config | 4 +- lib/WorkflowTaxprofiler.groovy | 9 +-- modules.json | 6 ++ modules/nf-core/modules/cat/fastq/main.nf | 51 ++++++++++++++ modules/nf-core/modules/cat/fastq/meta.yml | 39 +++++++++++ modules/nf-core/modules/fastp/main.nf | 75 ++++++++++++++++++++ modules/nf-core/modules/fastp/meta.yml | 68 ++++++++++++++++++ nextflow.config | 5 +- nextflow_schema.json | 9 --- subworkflows/local/input_check.nf | 43 ++++++++++-- workflows/taxprofiler.nf | 80 +++++++++++++++++++++- 13 files changed, 407 insertions(+), 27 deletions(-) create mode 100644 modules/nf-core/modules/cat/fastq/main.nf create mode 100644 modules/nf-core/modules/cat/fastq/meta.yml create mode 100644 modules/nf-core/modules/fastp/main.nf create mode 100644 modules/nf-core/modules/fastp/meta.yml diff --git a/README.md b/README.md index f57666d..5f00709 100644 --- a/README.md +++ b/README.md @@ -42,9 +42,9 @@ On release, automated continuous integration tests run the pipeline on a full-si - Centrifuge - Kaiju - mOTUs -4. Perform optional post-processing with: - - bracken -5. Standardises output tables +4. Perform optional post-processing with: + - bracken +5. Standardises output tables 6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) ## Quick Start diff --git a/conf/modules.config b/conf/modules.config index a0506a4..2d533e9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -28,8 +28,47 @@ process { withName: FASTQC { ext.args = '--quiet' + ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } + publishDir = [ + path: { "${params.outdir}/fastqc/raw" }, + mode: 'copy', + pattern: '*.html' + ] } + withName: FASTP { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + // TODO also include option to NOT merge + ext.args = [ + { ${meta.single_end} } == 0 ? 
"-m" : '', + params.fastp_exclude_unmerged ? '' : "--include_unmerged" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/fastp" }, + mode: 'copy', + pattern: '*.fastq.gz' + ] + } + + withName: FASTQC_POST { + ext.args = '--quiet' + ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } + publishDir = [ + path: { "${params.outdir}/fastqc/processed" }, + mode: 'copy', + pattern: '*.html' + ] + } + + withName: CAT_FASTQ { + publishDir = [ + path: { "${params.outdir}/prepared_sequences" }, + mode: 'copy', + pattern: '*.fastq.gz' + ] + } + + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/conf/test.config b/conf/test.config index 45f87af..5db566a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,8 +22,6 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - // Genome references - genome = 'R64-1-1' } diff --git a/lib/WorkflowTaxprofiler.groovy b/lib/WorkflowTaxprofiler.groovy index 53482ac..57a95e3 100755 --- a/lib/WorkflowTaxprofiler.groovy +++ b/lib/WorkflowTaxprofiler.groovy @@ -10,10 +10,11 @@ class WorkflowTaxprofiler { public static void initialise(params, log) { genomeExistsError(params, log) - if (!params.fasta) { - log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." - System.exit(1) - } + // TODO update as necessary + //if (!params.fasta) { + // log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." + // System.exit(1) + //} } // diff --git a/modules.json b/modules.json index 939b85f..6cf2b3e 100644 --- a/modules.json +++ b/modules.json @@ -3,9 +3,15 @@ "homePage": "https://github.com/nf-core/taxprofiler", "repos": { "nf-core/modules": { + "cat/fastq": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + }, "custom/dumpsoftwareversions": { "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" }, + "fastp": { + "git_sha": "d0a1cbb703a130c19f6796c3fce24fbe7dfce789" + }, "fastqc": { "git_sha": "9d0cad583b9a71a6509b754fdf589cbfbed08961" }, diff --git a/modules/nf-core/modules/cat/fastq/main.nf b/modules/nf-core/modules/cat/fastq/main.nf new file mode 100644 index 0000000..bf0877c --- /dev/null +++ b/modules/nf-core/modules/cat/fastq/main.nf @@ -0,0 +1,51 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' : + 'biocontainers/biocontainers:v1.2.0_cv1' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads.collect{ it.toString() } + if (meta.single_end) { + if (readList.size > 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } +} diff --git a/modules/nf-core/modules/cat/fastq/meta.yml b/modules/nf-core/modules/cat/fastq/meta.yml new file mode 100644 index 0000000..c836598 --- /dev/null +++ b/modules/nf-core/modules/cat/fastq/meta.yml @@ -0,0 +1,39 @@ +name: cat_fastq +description: Concatenates fastq files +keywords: + - fastq + - concatenate +tools: + - cat: + description: | + The cat utility reads files sequentially, writing them to the standard output. + documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: list + description: | + List of input FastQ files to be concatenated. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/fastp/main.nf b/modules/nf-core/modules/fastp/main.nf new file mode 100644 index 0000000..5c9e3b8 --- /dev/null +++ b/modules/nf-core/modules/fastp/main.nf @@ -0,0 +1,75 @@ +process FASTP { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? 'bioconda::fastp=0.23.2' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/fastp:0.23.2--h79da9fb_0' : + 'quay.io/biocontainers/fastp:0.23.2--h79da9fb_0' }" + + input: + tuple val(meta), path(reads) + val save_trimmed_fail + val save_merged + + output: + tuple val(meta), path('*.trim.fastq.gz') , optional:true, emit: reads + tuple val(meta), path('*.json') , emit: json + tuple val(meta), path('*.html') , emit: html + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail + tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + // Added soft-links to original fastqs for consistent naming in MultiQC + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + def fail_fastq = save_trimmed_fail ? "--failed_out ${prefix}.fail.fastq.gz" : '' + """ + [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz + fastp \\ + --in1 ${prefix}.fastq.gz \\ + --out1 ${prefix}.trim.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else { + def fail_fastq = save_trimmed_fail ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : '' + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + fastp \\ + --in1 ${prefix}_1.fastq.gz \\ + --in2 ${prefix}_2.fastq.gz \\ + --out1 ${prefix}_1.trim.fastq.gz \\ + --out2 ${prefix}_2.trim.fastq.gz \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $fail_fastq \\ + $merge_fastq \\ + --thread $task.cpus \\ + --detect_adapter_for_pe \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/modules/fastp/meta.yml b/modules/nf-core/modules/fastp/meta.yml new file mode 100644 index 0000000..3274e41 --- /dev/null +++ b/modules/nf-core/modules/fastp/meta.yml @@ -0,0 +1,68 @@ +name: fastp +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - quality control + - fastq +tools: + - fastp: + description: | + A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance. + documentation: https://github.com/OpenGene/fastp + doi: https://doi.org/10.1093/bioinformatics/bty560 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. 
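  # Note: as the module script above shows, save_merged only takes effect for
  # paired-end input; single-end reads are trimmed but never merged.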
  - save_trimmed_fail:
      type: boolean
      description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz`
  - save_merged:
      type: boolean
      description: Specify true to save all merged reads to a file ending in `*.merged.fastq.gz`

output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: The trimmed/modified/unmerged fastq reads
      pattern: "*trim.fastq.gz"
  - json:
      type: file
      description: Results in JSON format
      pattern: "*.json"
  - html:
      type: file
      description: Results in HTML format
      pattern: "*.html"
  - log:
      type: file
      description: fastp log file
      pattern: "*.log"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - reads_fail:
      type: file
      description: Reads that failed the preprocessing
      pattern: "*fail.fastq.gz"
  - reads_merged:
      type: file
      description: Reads that were successfully merged
      pattern: "*.{merged.fastq.gz}"
authors:
  - "@drpatelh"
  - "@kevinmenden"
diff --git a/nextflow.config b/nextflow.config
index 186979c..160dba7 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -33,7 +33,7 @@ params {
     help = false
     validate_params = true
     show_hidden_params = false
-    schema_ignore_params = 'genomes'
+    schema_ignore_params = 'genomes,fasta'
     enable_conda = false

     // Config options
@@ -50,6 +50,9 @@ params {
     max_cpus = 16
     max_time = '240.h'

+    // FASTQ preprocessing
+    fastp_clip_merge = false
+    fastp_exclude_unmerged = true
 }

// Load base.config by default for all pipelines
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 535e08d..cc43add 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -56,15 +56,6 @@
            "fa_icon": "fas fa-book",
            "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details."
        },
-        "fasta": {
-            "type": "string",
-            "format": "file-path",
-            "mimetype": "text/plain",
-            "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$",
-            "description": "Path to FASTA genome file.",
-            "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. 
Combine with `--save_reference` to save BWA index for future runs.", - "fa_icon": "far fa-file-code" - }, "igenomes_base": { "type": "string", "format": "directory-path", diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index cddcbb3..241a8a7 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -9,22 +9,38 @@ workflow INPUT_CHECK { samplesheet // file: /path/to/samplesheet.csv main: - SAMPLESHEET_CHECK ( samplesheet ) + parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) + .dump(tag: "split_csv_out") + .branch { + fasta: it['fasta'] != '' + fastq: true + } + + parsed_samplesheet.fastq .map { create_fastq_channels(it) } - .set { reads } + .dump(tag: "fastq_channel_init") + .set { fastq } + + parsed_samplesheet.fasta + .map { create_fasta_channels(it) } + .dump(tag: "fasta_channel_init") + .set { fasta } emit: - reads // channel: [ val(meta), [ reads ] ] + fastq // channel: [ val(meta), [ reads ] ] + fasta // channel: [ val(meta), fasta ] versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] } // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] def create_fastq_channels(LinkedHashMap row) { def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() + meta.id = row.sample + meta.run_accession = row.run_accession + meta.instrument_platform = row.instrument_platform + meta.single_end = row.single_end.toBoolean() def array = [] if (!file(row.fastq_1).exists()) { @@ -40,3 +56,20 @@ def create_fastq_channels(LinkedHashMap row) { } return array } + +// Function to get list of [ meta, fasta ] +def create_fasta_channels(LinkedHashMap row) { + def meta = [:] + meta.id = row.sample + meta.run_accession = row.run_accession + meta.instrument_platform = row.instrument_platform + meta.single_end = true + + def array = [] + if (!file(row.fasta).exists()) { + exit 1, "ERROR: Please check input samplesheet -> FastA file does not exist!\n${row.fasta}" + } + array = [ meta, [ file(row.fasta) ] ] + + return array +} diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 56c532b..3f10a9d 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -11,7 +11,7 @@ WorkflowTaxprofiler.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ] +def checkPathParamList = [ params.input, params.multiqc_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters @@ -50,6 +50,11 @@ include { FASTQC } from '../modules/nf-core/modules/fastqc/ include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' +include { FASTP as FASTP_SINGLE } from '../modules/nf-core/modules/fastp/main' +include { FASTP as FASTP_PAIRED } from '../modules/nf-core/modules/fastp/main' +include { FASTQC as FASTQC_POST } from '../modules/nf-core/modules/fastqc/main' +include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' + /* ======================================================================================== RUN MAIN WORKFLOW @@ -75,7 +80,7 @@ workflow TAXPROFILER { // MODULE: Run FastQC // FASTQC ( - INPUT_CHECK.out.reads + INPUT_CHECK.out.fastq ) ch_versions = 
ch_versions.mix(FASTQC.out.versions.first()) @@ -83,6 +88,71 @@ workflow TAXPROFILER { ch_versions.unique().collectFile(name: 'collated_versions.yml') ) + // + // MODULE: Run Clip/Merge/Complexity + // + // TODO give option to clip only and retain pairs + // TODO give option to retain singletons (probably fastp option likely) + // TODO move to subworkflow + if ( params.fastp_clip_merge ) { + + ch_input_for_fastp = INPUT_CHECK.out.fastq + .dump(tag: "pre-fastp_branch") + .branch{ + single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + ch_input_for_fastp.single.dump(tag: "input_fastp_single") + ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") + + FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + FASTP_PAIRED ( ch_input_for_fastp.paired, false, true ) + + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged + .mix( FASTP_SINGLE.out.reads ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + + FASTQC_POST ( ch_fastp_reads_prepped ) + + ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) + ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) + + ch_processed_reads = ch_fastp_reads_prepped + + } else { + ch_processed_reads = INPUT_CHECK.out.fastq + } + + + // MODULE: Cat merge runs of same sample + ch_processed_for_combine = ch_processed_reads + .dump(tag: "prep_for_combine_grouping") + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['run_accession'] = 'combined' + [ meta_new, reads ] + } + .groupTuple ( by: 0 ) + .branch{ + combine: it[1].size() >= 2 + skip: it[1].size() < 2 + } + + CAT_FASTQ ( ch_processed_for_combine.combine ) + + // Ready for profiling! + ch_reads_for_profiling = ch_processed_for_combine.skip + .dump(tag: "skip_combine") + .mix( CAT_FASTQ.out.reads ) + .dump(tag: "files_for_profiling") + // // MODULE: MultiQC // @@ -95,6 +165,12 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + if (params.fastp_clip_merge) { + ch_multiqc_files = ch_multiqc_files.mix(FASTP_SINGLE.out.json.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTP_PAIRED.out.json.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_POST.out.zip.collect{it[1]}.ifEmpty([])) + } + MULTIQC ( ch_multiqc_files.collect() From 278f5605ca831338384b5045e34f500f0b5ca1a2 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 19 Feb 2022 12:36:08 +0100 Subject: [PATCH 008/214] Added database preparation and final channel for profiling --- modules/local/database_check.nf | 25 ++++++++++ nextflow.config | 3 ++ subworkflows/local/db_check.nf | 40 ++++++++++++++++ subworkflows/local/input_check.nf | 2 +- subworkflows/local/preprocessing.nf | 73 +++++++++++++++++++++++++++++ workflows/taxprofiler.nf | 65 ++++++++----------------- 6 files changed, 161 insertions(+), 47 deletions(-) create mode 100644 modules/local/database_check.nf create mode 100644 subworkflows/local/db_check.nf create mode 100644 subworkflows/local/preprocessing.nf diff --git a/modules/local/database_check.nf b/modules/local/database_check.nf new file mode 100644 index 0000000..4da4313 --- /dev/null +++ b/modules/local/database_check.nf @@ -0,0 +1,25 @@ +process DATABASE_CHECK { + tag "$databasesheet" + + 
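    // NOTE: currently a passthrough rather than a real check; the script below only
    // copies the sheet to database_sheet.valid.csv, and column validation is a TODO.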
conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'quay.io/biocontainers/python:3.8.3' }" + + input: + path databasesheet + + output: + path '*.csv' , emit: csv + path "versions.yml", emit: versions + + script: // This script is bundled with the pipeline, in nf-core/taxprofiler/bin/ + """ + cat $databasesheet >> database_sheet.valid.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index 160dba7..60b5a42 100644 --- a/nextflow.config +++ b/nextflow.config @@ -50,6 +50,9 @@ params { max_cpus = 16 max_time = '240.h' + // Databaess + databases = null + // FASTQ preprocessing fastp_clip_merge = false fastp_exclude_unmerged = true diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf new file mode 100644 index 0000000..909d98f --- /dev/null +++ b/subworkflows/local/db_check.nf @@ -0,0 +1,40 @@ +// +// Check input samplesheet and get read channels +// + +include { DATABASE_CHECK } from '../../modules/local/database_check' + +workflow DB_CHECK { + take: + dbsheet // file: /path/to/dbsheet.csv + + main: + + // TODO: make database sheet check + parsed_samplesheet = DATABASE_CHECK ( dbsheet ) + .csv + .splitCsv ( header:true, sep:',' ) + .dump(tag: "db_split_csv_out") + .map { create_db_channels(it) } + .dump(tag: "db_channel_prepped") + .set{ dbs } + + emit: + dbs // channel: [ val(meta), [ db ] ] + versions = DATABASE_CHECK.out.versions // channel: [ versions.yml ] +} + +def create_db_channels(LinkedHashMap row) { + def meta = [:] + meta.tool = row.tool + meta.db_name = row.db_name + meta.db_params = row.db_params + + def array = [] + if (!file(row.db_path, type: 'dir').exists()) { + exit 1, "ERROR: Please check input samplesheet -> database could not be found!\n${row.db_path}" + } + array = [ meta, file(row.db_path) ] + + return array +} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 241a8a7..8497faa 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -12,7 +12,7 @@ workflow INPUT_CHECK { parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) - .dump(tag: "split_csv_out") + .dump(tag: "input_split_csv_out") .branch { fasta: it['fasta'] != '' fastq: true diff --git a/subworkflows/local/preprocessing.nf b/subworkflows/local/preprocessing.nf new file mode 100644 index 0000000..5832824 --- /dev/null +++ b/subworkflows/local/preprocessing.nf @@ -0,0 +1,73 @@ +// +// Check input samplesheet and get read channels +// + + +include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' +include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' +include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main' + +workflow FASTQ_PREPROCESSING { + take: + reads // file: /path/to/samplesheet.csv + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + // + // STEP: Read clipping and merging + // + // TODO give option to clip only and retain pairs + // TODO give option to retain singletons (probably fastp option likely) + // TODO move to subworkflow + + + if ( params.fastp_clip_merge ) { + + ch_input_for_fastp = reads + .dump(tag: "pre-fastp_branch") + .branch{ + 
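                // split runs by pairedness, since single-end and paired-end reads are
                // sent to differently configured fastp invocations below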
single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + ch_input_for_fastp.single.dump(tag: "input_fastp_single") + ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") + + FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + FASTP_PAIRED ( ch_input_for_fastp.paired, false, true ) + + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged + .mix( FASTP_SINGLE.out.reads ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + + FASTQC_POST ( ch_fastp_reads_prepped ) + + ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) + ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) + + ch_processed_reads = ch_fastp_reads_prepped + + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) + + ch_multiqc_files.dump(tag: "preprocessing_mqc_final") + + } else { + ch_processed_reads = reads + } + + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 3f10a9d..4a356a5 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -11,11 +11,12 @@ WorkflowTaxprofiler.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config ] +def checkPathParamList = [ params.input, params.databases, params.multiqc_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters -if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } /* ======================================================================================== @@ -35,7 +36,11 @@ ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath(params.multi // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { INPUT_CHECK } from '../subworkflows/local/input_check' + +include { DB_CHECK } from '../subworkflows/local/db_check' +include { FASTQ_PREPROCESSING } from '../subworkflows/local/preprocessing' + /* ======================================================================================== @@ -50,9 +55,6 @@ include { FASTQC } from '../modules/nf-core/modules/fastqc/ include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' -include { FASTP as FASTP_SINGLE } from '../modules/nf-core/modules/fastp/main' -include { FASTP as FASTP_PAIRED } from '../modules/nf-core/modules/fastp/main' -include { FASTQC as FASTQC_POST } from '../modules/nf-core/modules/fastqc/main' include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' /* @@ -76,6 +78,10 @@ workflow TAXPROFILER { ) ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) + DB_CHECK ( + ch_databases + ) + // // MODULE: Run FastQC // @@ -91,47 +97,12 @@ workflow TAXPROFILER { // // MODULE: Run Clip/Merge/Complexity // - // TODO give option to clip only and retain pairs - // TODO give option to retain singletons (probably fastp option likely) - // TODO move to subworkflow if ( params.fastp_clip_merge ) { - - ch_input_for_fastp = INPUT_CHECK.out.fastq - .dump(tag: "pre-fastp_branch") - .branch{ - single: it[0]['single_end'] == true - paired: it[0]['single_end'] == false - } - - ch_input_for_fastp.single.dump(tag: "input_fastp_single") - ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") - - FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) - FASTP_PAIRED ( ch_input_for_fastp.paired, false, true ) - - ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged - .mix( FASTP_SINGLE.out.reads ) - .map { - meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = 1 - [ meta_new, reads ] - } - - FASTQC_POST ( ch_fastp_reads_prepped ) - - ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) - ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) - - ch_processed_reads = ch_fastp_reads_prepped - - } else { - ch_processed_reads = INPUT_CHECK.out.fastq + FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq ) } - // MODULE: Cat merge runs of same sample - ch_processed_for_combine = ch_processed_reads + ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads .dump(tag: "prep_for_combine_grouping") .map { meta, reads -> @@ -153,6 +124,10 @@ workflow TAXPROFILER { .mix( CAT_FASTQ.out.reads ) .dump(tag: "files_for_profiling") + // Combine reads with possible databases + + ch_reads_for_profiling.combine(DB_CHECK.out.dbs).dump(tag: "reads_plus_db") + // // MODULE: MultiQC // @@ -166,9 +141,7 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) if (params.fastp_clip_merge) { - ch_multiqc_files = ch_multiqc_files.mix(FASTP_SINGLE.out.json.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTP_PAIRED.out.json.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC_POST.out.zip.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc) } From 2c183ed2edc61ab058580cb63441917d516eb564 Mon Sep 17 00:00:00 2001 
From: James Fellows Yates
Date: Thu, 3 Mar 2022 17:42:02 +0100
Subject: [PATCH 009/214] Add Kraken2 and MALT/run as Proof of Concept (currently MQC issue)

---
 conf/modules.config                          | 20 ++++++
 modules.json                                 |  6 ++
 .../nf-core/modules/kraken2/kraken2/main.nf  | 49 +++++++++++++
 .../nf-core/modules/kraken2/kraken2/meta.yml | 60 ++++++++++++++++
 modules/nf-core/modules/malt/run/main.nf     | 49 +++++++++++++
 modules/nf-core/modules/malt/run/meta.yml    | 58 ++++++++++++++++
 nextflow.config                              | 11 ++-
 workflows/taxprofiler.nf                     | 69 +++++++++++++++++--
 8 files changed, 315 insertions(+), 7 deletions(-)
 create mode 100644 modules/nf-core/modules/kraken2/kraken2/main.nf
 create mode 100644 modules/nf-core/modules/kraken2/kraken2/meta.yml
 create mode 100644 modules/nf-core/modules/malt/run/main.nf
 create mode 100644 modules/nf-core/modules/malt/run/meta.yml

diff --git a/conf/modules.config b/conf/modules.config
index 2d533e9..dbc926c 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -68,6 +68,26 @@ process {
        ]
    }

    withName: MALT_RUN {
        publishDir = [
            path: { "${params.outdir}/malt/${meta.db_name}" },
            mode: 'copy',
            pattern: '*.{rma6,tab,text,sam,log}'
        ]
        ext.args = { "${meta.db_params}" }
        ext.when = params.run_malt
    }

    withName: KRAKEN2_KRAKEN2 {
        publishDir = [
            path: { "${params.outdir}/kraken2/${meta.db_name}" },
            mode: 'copy',
            pattern: '*.{fastq.gz,txt}'
        ]
        ext.args = { "${meta.db_params}" }
        ext.when = params.run_kraken2
        ext.prefix = { "${meta.id}-${meta.db_name}" }
    }

    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
        publishDir = [
            path: { "${params.outdir}/pipeline_info" },
diff --git a/modules.json b/modules.json
index 6cf2b3e..844b33a 100644
--- a/modules.json
+++ b/modules.json
@@ -15,6 +15,12 @@
        "fastqc": {
            "git_sha": "9d0cad583b9a71a6509b754fdf589cbfbed08961"
        },
+        "kraken2/kraken2": {
+            "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
+        },
+        "malt/run": {
+            "git_sha": "76cdd46f3f8a77fb5023fb5a39c4ab99925b8b56"
+        },
        "multiqc": {
            "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41"
        }
diff --git a/modules/nf-core/modules/kraken2/kraken2/main.nf b/modules/nf-core/modules/kraken2/kraken2/main.nf
new file mode 100644
index 0000000..3ec5df5
--- /dev/null
+++ b/modules/nf-core/modules/kraken2/kraken2/main.nf
@@ -0,0 +1,49 @@
process KRAKEN2_KRAKEN2 {
    tag "$meta.id"
    label 'process_high'

    conda (params.enable_conda ? 'bioconda::kraken2=2.1.2 conda-forge::pigz=2.6' : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' :
        'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }"

    input:
    tuple val(meta), path(reads)
    path db

    output:
    tuple val(meta), path('*classified*')  , emit: classified
    tuple val(meta), path('*unclassified*'), emit: unclassified
    tuple val(meta), path('*report.txt')   , emit: txt
    path "versions.yml"                    , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    def paired = meta.single_end ? "" : "--paired"
    def classified = meta.single_end ? "${prefix}.classified.fastq" : "${prefix}.classified#.fastq"
    def unclassified = meta.single_end ? 
"${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq" + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --unclassified-out $unclassified \\ + --classified-out $classified \\ + --report ${prefix}.kraken2.report.txt \\ + --gzip-compressed \\ + $paired \\ + $args \\ + $reads + + pigz -p $task.cpus *.fastq + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/kraken2/kraken2/meta.yml b/modules/nf-core/modules/kraken2/kraken2/meta.yml new file mode 100644 index 0000000..9d6a385 --- /dev/null +++ b/modules/nf-core/modules/kraken2/kraken2/meta.yml @@ -0,0 +1,60 @@ +name: kraken2_kraken2 +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - kraken2: + description: | + Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads + homepage: https://ccb.jhu.edu/software/kraken2/ + documentation: https://github.com/DerrickWood/kraken2/wiki/Manual + doi: 10.1186/s13059-019-1891-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Kraken2 database +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified: + type: file + description: | + Reads classified to belong to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - unclassified: + type: file + description: | + Reads not classified to belong to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - txt: + type: file + description: | + Kraken2 report containing stats about classified + and not classifed reads. + pattern: "*.{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/malt/run/main.nf b/modules/nf-core/modules/malt/run/main.nf new file mode 100644 index 0000000..61c02ec --- /dev/null +++ b/modules/nf-core/modules/malt/run/main.nf @@ -0,0 +1,49 @@ +process MALT_RUN { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::malt=0.53" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/malt:0.53--hdfd78af_0' : + 'quay.io/biocontainers/malt:0.53--hdfd78af_0' }" + + input: + tuple val(meta), path(fastqs) + val mode + path index + + output: + tuple val(meta), path("*.rma6") , emit: rma6 + tuple val(meta), path("*.{tab,text,sam}"), optional:true, emit: alignments + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def avail_mem = 6 + if (!task.memory) { + log.info '[MALT_RUN] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.' + } else { + avail_mem = task.memory.giga + } + + """ + malt-run \\ + -J-Xmx${avail_mem}g \\ + -t $task.cpus \\ + -v \\ + -o . 
\\ + $args \\ + --inFile ${fastqs.join(' ')} \\ + -m $mode \\ + --index $index/ |&tee malt-run.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + malt: \$(malt-run --help 2>&1 | grep -o 'version.* ' | cut -f 1 -d ',' | cut -f2 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/malt/run/meta.yml b/modules/nf-core/modules/malt/run/meta.yml new file mode 100644 index 0000000..ae4277a --- /dev/null +++ b/modules/nf-core/modules/malt/run/meta.yml @@ -0,0 +1,58 @@ +name: malt_run +description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics. +keywords: + - malt + - alignment + - metagenomics + - ancient DNA + - aDNA + - palaeogenomics + - archaeogenomics + - microbiome +tools: + - malt: + description: A tool for mapping metagenomic data + homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/ + documentation: https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf + tool_dev_url: None + doi: "10.1038/s41559-017-0446-6" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastqs: + type: file + description: Input FASTQ files + pattern: "*.{fastq.gz,fq.gz}" + - mode: + type: string + description: Program mode + pattern: "Unknown|BlastN|BlastP|BlastX|Classifier" + - index: + type: directory + description: Index/database directory from malt-build + pattern: "*/" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - rma6: + type: file + description: MEGAN6 RMA6 file + pattern: "*.rma6" + - sam: + type: file + description: Alignment files in Tab, Text or MEGAN-compatible SAM format + pattern: "*.{tab,txt,sam}" + - log: + type: file + description: Log of verbose MALT stdout + pattern: "malt-run.log" + +authors: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 60b5a42..7991bdf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,8 +54,15 @@ params { databases = null // FASTQ preprocessing - fastp_clip_merge = false - fastp_exclude_unmerged = true + fastp_clip_merge = false + fastp_exclude_unmerged = true + + // MALT + run_malt = false + malt_mode = 'BlastN' + + // kraken2 + run_kraken2 = false } // Load base.config by default for all pipelines diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 4a356a5..bd25563 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -56,6 +56,9 @@ include { MULTIQC } from '../modules/nf-core/modules/multiqc include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' +include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' +include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' + /* ======================================================================================== @@ -95,13 +98,15 @@ workflow TAXPROFILER { ) // - // MODULE: Run Clip/Merge/Complexity + // PERFORM PREPROCESSING // if ( params.fastp_clip_merge ) { FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq ) } - // MODULE: Cat merge runs of same sample + // + // PERFORM RUN MERGING + // ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads .dump(tag: "prep_for_combine_grouping") .map { @@ -118,15 +123,61 @@ 
workflow TAXPROFILER { CAT_FASTQ ( ch_processed_for_combine.combine ) - // Ready for profiling! ch_reads_for_profiling = ch_processed_for_combine.skip .dump(tag: "skip_combine") .mix( CAT_FASTQ.out.reads ) .dump(tag: "files_for_profiling") - // Combine reads with possible databases + // + // COMBINE READS WITH POSSIBLE DATABASES + // + + // output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] + ch_input_for_profiling = ch_reads_for_profiling + .combine(DB_CHECK.out.dbs) + .dump(tag: "reads_plus_db") + .branch { + malt: it[2]['tool'] == 'malt' + kraken2: it[2]['tool'] == 'kraken2' + unknown: true + } + + // + // PREP PROFILER INPUT CHANNELS ON PER TOOL BASIS + // + + // We groupTuple to have all samples in one channel for MALT as database + // loading takes a long time, so we only want to run it once per database + ch_input_for_malt = ch_input_for_profiling.malt + .map { + it -> + def temp_meta = [ id: it[2]['db_name']] + it[2] + def db = it[3] + [ temp_meta, it[1], db ] + } + .groupTuple(by: [0,2]) + .dump(tag: "input for malt") + .multiMap { + it -> + reads: [ it[0], it[1].flatten() ] + db: it[2] + } + + // We can run Kraken2 one-by-one sample-wise + ch_input_for_kraken2 = ch_input_for_profiling.kraken2 + .dump(tag: "input for kraken") + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + // + // RUN PROFILING + // + MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) - ch_reads_for_profiling.combine(DB_CHECK.out.dbs).dump(tag: "reads_plus_db") // // MODULE: MultiQC @@ -143,6 +194,14 @@ workflow TAXPROFILER { if (params.fastp_clip_merge) { ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc) } + if (params.run_kraken2) { + ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])) + ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first()) + } + if (params.run_malt) { + ch_multiqc_files = ch_multiqc_files.mix(MALT_RUN.out.log.collect{it[1]}.ifEmpty([])) + ch_versions = ch_versions.mix(MALT_RUN.out.versions.first()) + } MULTIQC ( From e39c6a8ccb337b5889271df4f1e57bcbd3dd2c74 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 3 Mar 2022 18:04:03 +0100 Subject: [PATCH 010/214] Fix MALT multiqc report clash --- modules.json | 2 +- modules/nf-core/modules/malt/run/main.nf | 3 ++- modules/nf-core/modules/malt/run/meta.yml | 2 +- workflows/taxprofiler.nf | 3 ++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/modules.json b/modules.json index 844b33a..6a785b8 100644 --- a/modules.json +++ b/modules.json @@ -19,7 +19,7 @@ "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, "malt/run": { - "git_sha": "76cdd46f3f8a77fb5023fb5a39c4ab99925b8b56" + "git_sha": "72b96f4e504eef673f2b5c13560a9d90b669129b" }, "multiqc": { "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" diff --git a/modules/nf-core/modules/malt/run/main.nf b/modules/nf-core/modules/malt/run/main.nf index 61c02ec..4e2e50c 100644 --- a/modules/nf-core/modules/malt/run/main.nf +++ b/modules/nf-core/modules/malt/run/main.nf @@ -23,6 +23,7 @@ process MALT_RUN { script: def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" def avail_mem = 6 if (!task.memory) { log.info '[MALT_RUN] Available memory not known - defaulting to 6GB. 
From 424f11f5ed0898a0e0524a7386b35164f58beccb Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Thu, 17 Mar 2022 13:16:16 +0100
Subject: [PATCH 011/214] Update README.md

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 60baf67..5d0c74b 100644
--- a/README.md
+++ b/README.md
@@ -61,10 +61,10 @@ On release, automated continuous integration tests run the pipeline on a full-si
 
    Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string.
 
-   > * The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`.
-   > * Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
-   > * If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs.
-   > * If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs.
+   > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`.
+   > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
+   > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs.
+   > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs.
 
 4. Start running your own analysis!
 
From 0f0ed6cd4698df3bea9d4168c056085a820eb056 Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 10:45:06 +0100
Subject: [PATCH 012/214] Fix function name

---
 subworkflows/local/input_check.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index d66fb3a..481028f 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -15,6 +15,7 @@ workflow INPUT_CHECK {
         .dump(tag: "input_split_csv_out")
         .branch {
             fasta: it['fasta'] != ''
+            nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE'
             fastq: true
         }
 
From 41b3d8db822caab916ec82fe4b4f581f17ab1ca5 Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 10:47:41 +0100
Subject: [PATCH 013/214] Add nanopore channel

---
 subworkflows/local/input_check.nf | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 481028f..2e30bcc 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -20,10 +20,15 @@ workflow INPUT_CHECK {
     }
 
     parsed_samplesheet.fastq
-        .map { create_fastq_channels(it) }
+        .map { create_fastq_channel(it) }
         .dump(tag: "fastq_channel_init")
         .set { fastq }
 
+    parsed_samplesheet.nanopore
+        .map { create_fastq_channel(it) }
+        .dump(tag: "fastq_nanopore_channel_init")
+        .set { nanopore }
+
     parsed_samplesheet.fasta
         .map { create_fasta_channels(it) }
         .dump(tag: "fasta_channel_init")
         .set { fasta }
 
     emit:
     fastq // channel: [ val(meta), [ reads ] ]
+    nanopore // channel: [ val(meta), [ reads ] ]
     fasta // channel: [ val(meta), fasta ]
     versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
 }
@@ -52,10 +58,17 @@ def create_fastq_channel(LinkedHashMap row) {
     if (meta.single_end) {
         fastq_meta = [ meta, [ file(row.fastq_1) ] ]
     } else {
-        if (!file(row.fastq_2).exists()) {
-            exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
+        if (meta.instrument_platform == 'OXFORD_NANOPORE') {
+            if (row.fastq_2 != '') {
+                exit 1, "ERROR: Please check input samplesheet -> For Oxford Nanopore reads Read 2 FastQ should be empty!\n${row.fastq_2}"
+            }
+            fastq_meta = [ meta, [ file(row.fastq_1) ] ]
+        } else {
+            if (!file(row.fastq_2).exists()) {
+                exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
+            }
+            fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
         }
-        fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
     }
     return fastq_meta
 }
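Note: patches 012-013 make the pipeline platform-aware with a single `.branch` over the parsed samplesheet. Branch conditions are evaluated top to bottom and the first match wins, so `fasta` must precede the catch-all `fastq`. A self-contained sketch of the routing, with hypothetical rows standing in for `splitCsv` output:

```nextflow
// Standalone illustration of the three-way samplesheet routing.
nextflow.enable.dsl = 2

workflow {
    Channel.of(
        [ sample: '2611', instrument_platform: 'OXFORD_NANOPORE', fastq_1: 'ont.fq.gz', fastq_2: '', fasta: '' ],
        [ sample: '2612', instrument_platform: 'ILLUMINA', fastq_1: 'r1.fq.gz', fastq_2: 'r2.fq.gz', fasta: '' ],
        [ sample: '2613', instrument_platform: 'ILLUMINA', fastq_1: '', fastq_2: '', fasta: 'asm.fa.gz' ]
    )
        .branch {
            fasta: it['fasta'] != ''                                  // assemblies first
            nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE' // then long reads
            fastq: true                                               // fallback: short reads
        }
        .set { parsed }

    parsed.fasta.view { "fasta: ${it.sample}" }
    parsed.nanopore.view { "nanopore: ${it.sample}" }
    parsed.fastq.view { "short reads: ${it.sample}" }
}
```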
From 7f7ddc9f14237f1616918963a002cbc64fea2687 Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 10:48:06 +0100
Subject: [PATCH 014/214] Update comment

---
 bin/check_samplesheet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 16e668b..d10ee90 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -173,7 +173,7 @@ def check_samplesheet(file_in, file_out):
             ## Auto-detect paired-end/single-end
             if sample and fastq_1 and fastq_2:  ## Paired-end short reads
                 sample_info.extend(["0", fastq_1, fastq_2, fasta])
-            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
+            elif sample and fastq_1 and not fastq_2:  ## Single-end short/long fastq reads
                 sample_info.extend(["1", fastq_1, fastq_2, fasta])
             elif (
                 sample and fasta and not fastq_1 and not fastq_2
From 2e1b6c5d0a3b7c455bba5cb4d20dbbceac43dffe Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 11:00:36 +0100
Subject: [PATCH 015/214] Add info on Nanopore reads to fastq_1 column

---
 docs/usage.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/usage.md b/docs/usage.md
index a8b0448..38c063e 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -44,11 +44,11 @@ TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
 TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
 ```
 
-| Column         | Description                                                                                                                                                                             |
-|----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `sample`       | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1`      | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
-| `fastq_2`      | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
+| Column    | Description                                                                                                                                                                             |
+| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample`  | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
+| `fastq_1` | Full path to FastQ file for Illumina short reads 1 or Nanopore reads. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                           |
+| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
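Note: the samplesheet contract documented above is enforced row by row in `create_fastq_channel` (patch 013): an `OXFORD_NANOPORE` row must leave `fastq_2` empty, while paired short-read rows must point at an existing read-2 file. A hedged restatement of that guard as a standalone script; the helper name and row values are hypothetical, and the real function additionally checks that files exist:

```nextflow
// Sketch of the per-row validation used when building read channels.
nextflow.enable.dsl = 2

// Hypothetical helper mirroring the Nanopore guard in create_fastq_channel()
def rowToReads(Map row) {
    def meta = [ id: row.sample, instrument_platform: row.instrument_platform,
                 single_end: row.fastq_2 == '' ]
    if ( meta.instrument_platform == 'OXFORD_NANOPORE' && row.fastq_2 != '' ) {
        exit 1, "ERROR: Please check input samplesheet -> For Oxford Nanopore reads Read 2 FastQ should be empty!\n${row.fastq_2}"
    }
    def reads = meta.single_end ? [ row.fastq_1 ] : [ row.fastq_1, row.fastq_2 ]
    return [ meta, reads ]
}

workflow {
    Channel.of(
        [ sample: '2611', instrument_platform: 'OXFORD_NANOPORE', fastq_1: 'ont.fastq.gz', fastq_2: '' ]
    )
        .map { rowToReads(it) }
        .view()
}
```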
From c8e49c56f4f6b26dde209acfd474fc5c4f43caf7 Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 13:47:44 +0100
Subject: [PATCH 016/214] Perform fastqc on nanopore reads before trimming

---
 workflows/taxprofiler.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index f740324..c3d3eb6 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -89,7 +89,7 @@ workflow TAXPROFILER {
     // MODULE: Run FastQC
     //
     FASTQC (
-        INPUT_CHECK.out.fastq
+        INPUT_CHECK.out.fastq.concat( INPUT_CHECK.out.nanopore )
     )
     ch_versions = ch_versions.mix(FASTQC.out.versions.first())
 
From 0936b9b28e52986576d9bb62473373f209565d6d Mon Sep 17 00:00:00 2001
From: Lauri Mesilaakso
Date: Fri, 18 Mar 2022 14:18:38 +0100
Subject: [PATCH 017/214] Update workflows/taxprofiler.nf

Co-authored-by: James A. Fellows Yates
---
 workflows/taxprofiler.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index c3d3eb6..f48cff6 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -89,7 +89,7 @@ workflow TAXPROFILER {
     // MODULE: Run FastQC
     //
     FASTQC (
-        INPUT_CHECK.out.fastq.concat( INPUT_CHECK.out.nanopore )
+        INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore )
     )
     ch_versions = ch_versions.mix(FASTQC.out.versions.first())
 
From 16be676d72f9964038a52fb0ddc58c9bdafd4f9b Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 14:34:10 +0100
Subject: [PATCH 018/214] Add Porechop module

---
 conf/modules.config                       |  9 ++++
 modules.json                              |  3 ++
 modules/nf-core/modules/porechop/main.nf  | 35 ++++++++++++++++
 modules/nf-core/modules/porechop/meta.yml | 50 +++++++++++++++++++++++
 4 files changed, 97 insertions(+)
 create mode 100644 modules/nf-core/modules/porechop/main.nf
 create mode 100644 modules/nf-core/modules/porechop/meta.yml

diff --git a/conf/modules.config b/conf/modules.config
index 9e334bc..050772e 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -50,6 +50,15 @@ process {
         ]
     }
 
+    withName: PORECHOP {
+        ext.prefix = { "${meta.id}_${meta.run_accession}" }
+        publishDir = [
+            path: { "${params.outdir}/porechop" },
+            mode: 'copy',
+            pattern: '*.fastq.gz'
+        ]
+    }
+
     withName: FASTQC_POST {
         ext.args = '--quiet'
         ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
diff --git a/modules.json b/modules.json
index 6a785b8..284cf13 100644
--- a/modules.json
+++ b/modules.json
@@ -23,6 +23,9 @@
         },
         "multiqc": {
             "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41"
+        },
+        "porechop": {
+            "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046"
         }
     }
 }
diff --git a/modules/nf-core/modules/porechop/main.nf b/modules/nf-core/modules/porechop/main.nf
new file mode 100644
index 0000000..65982b8
--- /dev/null
+++ b/modules/nf-core/modules/porechop/main.nf
@@ -0,0 +1,35 @@
+process PORECHOP {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda (params.enable_conda ? "bioconda::porechop=0.2.4" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/porechop:0.2.4--py39h7cff6ad_2' :
+        'quay.io/biocontainers/porechop:0.2.4--py39h7cff6ad_2' }"
+
+    input:
+    tuple val(meta), path(reads)
+
+    output:
+    tuple val(meta), path("*.fastq.gz"), emit: reads
+    path "versions.yml"                , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    porechop \\
+        -i $reads \\
+        -t $task.cpus \\
+        $args \\
+        -o ${prefix}.fastq.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        porechop: \$( porechop --version )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/modules/porechop/meta.yml b/modules/nf-core/modules/porechop/meta.yml
new file mode 100644
index 0000000..81399d2
--- /dev/null
+++ b/modules/nf-core/modules/porechop/meta.yml
@@ -0,0 +1,50 @@
+name: porechop
+description: Adapter removal and demultiplexing of Oxford Nanopore reads
+keywords:
+  - adapter
+  - nanopore
+  - demultiplexing
+tools:
+  - porechop:
+      description: Adapter removal and demultiplexing of Oxford Nanopore reads
+      homepage: "https://github.com/rrwick/Porechop"
+      documentation: "https://github.com/rrwick/Porechop"
+      tool_dev_url: "https://github.com/rrwick/Porechop"
+      doi: "10.1099/mgen.0.000132"
+      licence: ["GPL v3"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: fastq/fastq.gz file
+      pattern: "*.{fastq,fastq.gz,fq,fq.gz}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - reads:
+      type: file
+      description: Demultiplexed and/or adapter-trimmed fastq.gz file
+      pattern: "*.{fastq.gz}"
+
+authors:
+  - "@ggabernet"
+  - "@jasmezz"
+  - "@d4straub"
+  - "@LaurenceKuhl"
+  - "@SusiJo"
+  - "@jonasscheid"
+  - "@jonoave"
+  - "@GokceOGUZ"
From 1e42f1d9f295e909cc75d10b6306cce2d1f4bf22 Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 15:10:44 +0100
Subject: [PATCH 019/214] Add long read preprocessing subworkflow

---
 subworkflows/local/longread_preprocessing.nf | 34 ++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 subworkflows/local/longread_preprocessing.nf

diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf
new file mode 100644
index 0000000..da1049a
--- /dev/null
+++ b/subworkflows/local/longread_preprocessing.nf
@@ -0,0 +1,34 @@
+
+include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'
+include { PORECHOP } from '../../modules/nf-core/modules/porechop/main'
+
+workflow LONGREAD_PREPROCESSING {
+    take:
+    reads
+
+    main:
+    ch_versions = Channel.empty()
+    ch_multiqc_files = Channel.empty()
+
+    PORECHOP ( reads )
+
+    ch_processed_reads = PORECHOP.out.reads
+        .dump(tag: "pre_fastqc_check")
+        .map {
+            meta, reads ->
+            def meta_new = meta.clone()
+            meta_new['single_end'] = 1
+            [ meta_new, reads ]
+        }
+
+    FASTQC_POST ( PORECHOP.out.reads )
+    ch_versions = ch_versions.mix(PORECHOP.out.versions.first())
+    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} )
+
+
+    emit:
+    reads    = ch_processed_reads // channel: [ val(meta), [ reads ] ]
+    versions = ch_versions        // channel: [ versions.yml ]
+    mqc      = ch_multiqc_files
+}
+
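Note: a subtle detail in the new subworkflow is that it clones the meta map before forcing `single_end` (Porechop output is always single-ended). Meta maps travel by reference between channel items, so mutating one in place can silently corrupt sibling channels that share it. A minimal sketch of the safe pattern, with invented values (the original assigns the truthy `1` rather than `true`):

```nextflow
// Why LONGREAD_PREPROCESSING copies meta before mutating it.
nextflow.enable.dsl = 2

workflow {
    Channel.of( [ [ id: 'sample1', single_end: false ], 'sample1.porechopped.fastq.gz' ] )
        .map { meta, reads ->
            def meta_new = meta.clone()    // copy first ...
            meta_new['single_end'] = true  // ... then mutate only the copy
            [ meta_new, reads ]
        }
        .view()
}
```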
From 582aaa105fff0328f8df314d88ab84fe6c8e528b Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 15:12:07 +0100
Subject: [PATCH 020/214] Include long reads preprocessing subworkflow

---
 workflows/taxprofiler.nf | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index f48cff6..9e52b59 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -40,7 +40,7 @@
 include { INPUT_CHECK } from '../subworkflows/local/input_check'
 include { DB_CHECK } from '../subworkflows/local/db_check'
 include { FASTQ_PREPROCESSING } from '../subworkflows/local/preprocessing'
-
+include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -104,6 +104,10 @@ workflow TAXPROFILER {
         FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq )
     }
 
+    LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore )
+
+    ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first())
+
     //
     // PERFORM RUN MERGING
     //
@@ -191,6 +195,7 @@ workflow TAXPROFILER {
     ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
     ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
+    ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc)
     if (params.fastp_clip_merge) {
         ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc)
     }
From d09a3c170edac3afb7e6932cbcca824d6b77e202 Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 15:46:03 +0100
Subject: [PATCH 021/214] Add Porechop

---
 CITATIONS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CITATIONS.md b/CITATIONS.md
index 192b2f4..53c53c3 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -15,6 +15,8 @@
 * [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
     > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
 
+* [Porechop](https://github.com/rrwick/Porechop)
+
 ## Software packaging/containerisation tools
 
 * [Anaconda](https://anaconda.com)
From 24a01529f5053c51a0f548ad04790f9bd5e3df9d Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 15:47:46 +0100
Subject: [PATCH 022/214] Add mentioning about Nanopore reads pre-processing

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5d0c74b..622b8de 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ On release, automated continuous integration tests run the pipeline on a full-si
    - Low complexity filtering
    - Host read removal
    - Run merging
+   - Adapter and quality trimming of Nanopore reads
 3. Performs taxonomic profiling a choice of:
    - Kraken2
    - MetaPhlAn3
From c7f022008c561e70e2ed4a875c17ea1ece109f43 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 21 Mar 2022 15:07:59 +0100
Subject: [PATCH 023/214] Start getting database prep ready

---
 modules.json                           |  3 +++
 modules/nf-core/modules/untar/main.nf  | 36 ++++++++++++++++++++++++++
 modules/nf-core/modules/untar/meta.yml | 28 ++++++++++++++++++++
 subworkflows/local/db_check.nf         | 14 +++++++++-
 subworkflows/local/input_check.nf      |  2 +-
 5 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 modules/nf-core/modules/untar/main.nf
 create mode 100644 modules/nf-core/modules/untar/meta.yml

diff --git a/modules.json b/modules.json
index 6a785b8..9a7f1df 100644
--- a/modules.json
+++ b/modules.json
@@ -23,6 +23,9 @@
         },
         "multiqc": {
             "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41"
+        },
+        "untar": {
+            "git_sha": "7ec09d0ef4df89617baacc9b2dafcddb7cd4b05a"
         }
     }
 }
diff --git a/modules/nf-core/modules/untar/main.nf b/modules/nf-core/modules/untar/main.nf
new file mode 100644
index 0000000..bbae948
--- /dev/null
+++ b/modules/nf-core/modules/untar/main.nf
@@ -0,0 +1,36 @@
+process UNTAR {
+    tag "$archive"
+    label 'process_low'
+
+    conda (params.enable_conda ? "conda-forge::tar=1.32" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' :
+        'biocontainers/biocontainers:v1.2.0_cv1' }"
+
+    input:
+    path archive
+
+    output:
+    path "$untar"      , emit: untar
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def args2 = task.ext.args2 ?: ''
+    untar = archive.toString() - '.tar.gz'
+    """
+    tar \\
+        -xzvf \\
+        $args \\
+        $archive \\
+        $args2 \\
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/modules/untar/meta.yml b/modules/nf-core/modules/untar/meta.yml
new file mode 100644
index 0000000..e877a97
--- /dev/null
+++ b/modules/nf-core/modules/untar/meta.yml
@@ -0,0 +1,28 @@
+name: untar
+description: Extract files.
+keywords:
+  - untar
+  - uncompress
+tools:
+  - untar:
+      description: |
+        Extract tar.gz files.
+      documentation: https://www.gnu.org/software/tar/manual/
+      licence: ["GPL-3.0-or-later"]
+input:
+  - archive:
+      type: file
+      description: File to be untar
+      pattern: "*.{tar}.{gz}"
+output:
+  - untar:
+      type: file
+      description:
+      pattern: "*.*"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@joseespinosa"
+  - "@drpatelh"
diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 909d98f..6f061d7 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -19,8 +19,20 @@ workflow DB_CHECK {
         .dump(tag: "db_channel_prepped")
         .set{ dbs }
 
+
+    parsed_samplesheet
+        .branch {
+            untar: it[0]['db_path'].toString().endsWith(".tar.gz")
+            skip: true
+        }
+        .set{ ch_dbs_for_untar }
+
+    UNTAR ( ch_dbs_for_untar.untar )
+
+    ch_final_dbs = ch_dbs_for_untar.skip.mix( ch_dbs_untarred )
+
     emit:
-    dbs // channel: [ val(meta), [ db ] ]
+    dbs = ch_final_dbs // channel: [ val(meta), [ db ] ]
     versions = DATABASE_CHECK.out.versions // channel: [ versions.yml ]
 }
 
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index d66fb3a..938c87f 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -35,7 +35,7 @@ workflow INPUT_CHECK {
 }
 
 // Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
-def create_fastq_channel(LinkedHashMap row) {
+def create_fastq_channels(LinkedHashMap row) {
     // create meta map
     def meta = [:]
     meta.id = row.sample
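Note: this commit is explicitly a first cut; as written, `ch_dbs_untarred` is referenced before anything defines it, which the next patch replaces with `UNTAR.out.untar`. The underlying pattern it establishes is sound, though: branch databases by whether the path ends in `.tar.gz`, decompress only those, and mix both streams back into one channel. A runnable sketch with hypothetical database tuples:

```nextflow
// Branch-on-extension pattern used for database preparation.
nextflow.enable.dsl = 2

workflow {
    Channel.of(
        [ [ tool: 'malt', db_name: 'malt95' ], '/databases/malt95.tar.gz' ],
        [ [ tool: 'kraken2', db_name: 'k2std' ], '/databases/k2std/' ]
    )
        .branch {
            untar: it[1].toString().endsWith(".tar.gz")
            skip: true
        }
        .set { ch_dbs_for_untar }

    // In the pipeline, .untar would feed the UNTAR process and the
    // decompressed results would be mixed back in via UNTAR.out.untar.
    ch_dbs_for_untar.untar.view { "needs decompression: ${it[1]}" }
    ch_dbs_for_untar.skip.view { "ready to use: ${it[1]}" }
}
```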
From 631c115e1003fc7cb9b0c893bf7cde761f4287ef Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Mon, 21 Mar 2022 14:58:19 +0000
Subject: [PATCH 024/214] Adds PoC of untarring system

---
 conf/modules.config                    |  2 +-
 modules.json                           |  2 +-
 modules/nf-core/modules/untar/main.nf  | 10 +++++-----
 modules/nf-core/modules/untar/meta.yml | 10 ++++++++++
 subworkflows/local/db_check.nf         |  9 +++++----
 5 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 9e334bc..620ae1d 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -82,7 +82,7 @@ process {
         publishDir = [
             path: { "${params.outdir}/kraken2/${meta.db_name}" },
             mode: 'copy',
-            pattern: '.{fastq.gz,txt}'
+            pattern: '*.{fastq.gz,txt}'
         ]
         ext.args = { "${meta.db_params}" }
         ext.when = params.run_kraken2
diff --git a/modules.json b/modules.json
index 9a7f1df..96a43d8 100644
--- a/modules.json
+++ b/modules.json
@@ -25,7 +25,7 @@
             "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41"
         },
         "untar": {
-            "git_sha": "7ec09d0ef4df89617baacc9b2dafcddb7cd4b05a"
+            "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918"
         }
     }
 }
diff --git a/modules/nf-core/modules/untar/main.nf b/modules/nf-core/modules/untar/main.nf
index bbae948..dc43fb7 100644
--- a/modules/nf-core/modules/untar/main.nf
+++ b/modules/nf-core/modules/untar/main.nf
@@ -8,19 +8,19 @@ process UNTAR {
         'biocontainers/biocontainers:v1.2.0_cv1' }"
 
     input:
-    path archive
+    tuple val(meta), path(archive)
 
     output:
-    path "$untar"      , emit: untar
-    path "versions.yml", emit: versions
+    tuple val(meta), path("$untar"), emit: untar
+    path "versions.yml"            , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
-    def args = task.ext.args ?: ''
+    def args  = task.ext.args ?: ''
     def args2 = task.ext.args2 ?: ''
-    untar = archive.toString() - '.tar.gz'
+    untar     = archive.toString() - '.tar.gz'
     """
     tar \\
         -xzvf \\
diff --git a/modules/nf-core/modules/untar/meta.yml b/modules/nf-core/modules/untar/meta.yml
index e877a97..d426919 100644
--- a/modules/nf-core/modules/untar/meta.yml
+++ b/modules/nf-core/modules/untar/meta.yml
@@ -10,11 +10,21 @@ tools:
       documentation: https://www.gnu.org/software/tar/manual/
       licence: ["GPL-3.0-or-later"]
 input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
   - archive:
       type: file
       description: File to be untar
       pattern: "*.{tar}.{gz}"
 output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
   - untar:
       type: file
       description:
diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 6f061d7..641108d 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -3,6 +3,7 @@
 //
 
 include { DATABASE_CHECK } from '../../modules/local/database_check'
+include { UNTAR } from '../../modules/nf-core/modules/untar/main'
 
 workflow DB_CHECK {
     take:
@@ -17,19 +18,19 @@ workflow DB_CHECK {
         .dump(tag: "db_split_csv_out")
         .map { create_db_channels(it) }
         .dump(tag: "db_channel_prepped")
-        .set{ dbs }
 
     parsed_samplesheet
         .branch {
             untar: it[1].toString().endsWith(".tar.gz")
             skip: true
         }
         .set{ ch_dbs_for_untar }
 
     // TODO Filter to only run UNTAR on DBs of tools actually using?
     // TODO make optional whether to save
     UNTAR ( ch_dbs_for_untar.untar )
 
     ch_final_dbs = ch_dbs_for_untar.skip.mix( UNTAR.out.untar )
 
     emit:
     dbs = ch_final_dbs // channel: [ val(meta), [ db ] ]
     versions = DATABASE_CHECK.out.versions // channel: [ versions.yml ]
 }
From c97de32434d03fdc97c0a5dc9c75cf328b027193 Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Mon, 21 Mar 2022 18:17:08 +0100
Subject: [PATCH 025/214] Make adapter and quality trimming optional

---
 nextflow.config          |  1 +
 workflows/taxprofiler.nf | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 7b897ab..4a3a56d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -57,6 +57,7 @@ params {
     // FASTQ preprocessing
     fastp_clip_merge = false
     fastp_exclude_unmerged = true
+    remove_adapters = false
 
     // MALT
     run_malt = false
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 9e52b59..0e144f3 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -104,9 +104,16 @@ workflow TAXPROFILER {
         FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq )
     }
 
+    ch_multiqc_files = Channel.empty()
+
+    if ( params.remove_adapters ) {
+        ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads
+            .map { it -> [ it[0], [it[1]] ] }
         ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first())
+        ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc)
+    } else {
+        ch_longreads_preprocessed = INPUT_CHECK.out.nanopore
+    }
 
     //
     // PERFORM RUN MERGING
     //
 
     ch_input_for_profiling = ch_reads_for_profiling
+        .mix( ch_longreads_preprocessed )
         .combine(DB_CHECK.out.dbs)
         .dump(tag: "reads_plus_db")
         .branch {
@@ -189,13 +197,11 @@ workflow TAXPROFILER {
 
     workflow_summary    = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params)
     ch_workflow_summary = Channel.value(workflow_summary)
 
-    ch_multiqc_files = Channel.empty()
     ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config))
     ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([]))
     ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
     ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
-    ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc)
     if (params.fastp_clip_merge) {
         ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc)
     }
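Note: patch 025 settles on the conditional-channel idiom used for all optional preprocessing from here on: run the subworkflow only when its flag is set, otherwise assign the raw channel to the same variable name, so downstream wiring never has to know whether preprocessing happened. A minimal sketch (the TRIM workflow is a hypothetical stand-in for LONGREAD_PREPROCESSING):

```nextflow
// The "optional subworkflow" pattern, in miniature.
nextflow.enable.dsl = 2

params.remove_adapters = false

workflow TRIM {
    take: reads
    main:
        trimmed = reads.map { meta, r -> [ meta, "trimmed-${r}" ] }
    emit:
        reads = trimmed
}

workflow {
    ch_nanopore = Channel.of( [ [ id: 'sample1' ], 'ont.fastq.gz' ] )

    // Same channel name either way, so downstream steps are unchanged.
    if ( params.remove_adapters ) {
        ch_longreads_preprocessed = TRIM ( ch_nanopore ).reads
    } else {
        ch_longreads_preprocessed = ch_nanopore
    }

    ch_longreads_preprocessed.view()
}
```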
From f6fe26de466446379f49ba00dd80eb995bafd0bb Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Mon, 21 Mar 2022 18:25:56 +0100
Subject: [PATCH 026/214] Rename shortread subworkflow to be more consistent

---
 .../{preprocessing.nf => shortread_preprocessing.nf} | 0
 workflows/taxprofiler.nf                             | 8 ++++----
 2 files changed, 4 insertions(+), 4 deletions(-)
 rename subworkflows/local/{preprocessing.nf => shortread_preprocessing.nf} (100%)

diff --git a/subworkflows/local/preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf
similarity index 100%
rename from subworkflows/local/preprocessing.nf
rename to subworkflows/local/shortread_preprocessing.nf
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 0e144f3..22c7518 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -39,7 +39,7 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi
 
 include { INPUT_CHECK } from '../subworkflows/local/input_check'
 include { DB_CHECK } from '../subworkflows/local/db_check'
-include { FASTQ_PREPROCESSING } from '../subworkflows/local/preprocessing'
+include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing'
 include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing'
 
 /*
@@ -101,7 +101,7 @@ workflow TAXPROFILER {
     // PERFORM PREPROCESSING
     //
     if ( params.fastp_clip_merge ) {
-        FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq )
+        SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq )
     }
 
@@ -118,7 +118,7 @@ workflow TAXPROFILER {
     //
     // PERFORM RUN MERGING
     //
-    ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads
+    ch_processed_for_combine = SHORTREAD_PREPROCESSING.out.reads
         .dump(tag: "prep_for_combine_grouping")
         .map {
             meta, reads ->
@@ -203,7 +203,7 @@ workflow TAXPROFILER {
     if (params.fastp_clip_merge) {
-        ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc)
+        ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_PREPROCESSING.out.mqc)
     }
From 4940ec57ffee76378b9bfe8dcff1d73f3097846e Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Mon, 21 Mar 2022 18:26:26 +0100
Subject: [PATCH 027/214] Remove unnecessary extra point

---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 622b8de..d454a9b 100644
--- a/README.md
+++ b/README.md
@@ -30,11 +30,10 @@ On release, automated continuous integration tests run the pipeline on a full-si
 
 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
 2. Performs optional read pre-processing
-   - Adapter clipping and merging
+   - Adapter clipping and merging (short, and nanopore reads)
    - Low complexity filtering
    - Host read removal
    - Run merging
-   - Adapter and quality trimming of Nanopore reads
 3. Performs taxonomic profiling a choice of:
    - Kraken2
    - MetaPhlAn3
From 5b1b48e59e17d271d03450daca083dd8cec502af Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Mon, 21 Mar 2022 18:29:47 +0100
Subject: [PATCH 028/214] Update subworkflow name to be more consistent

---
 subworkflows/local/shortread_preprocessing.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf
index 5832824..406c198 100644
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@@ -7,7 +7,7 @@ include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fast
 include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main'
 include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'
 
-workflow FASTQ_PREPROCESSING {
+workflow SHORTREAD_PREPROCESSING {
     take:
     reads // file: /path/to/samplesheet.csv
 
From 80129985424b214ee22213a8db7cc139e2793ff5 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 21 Mar 2022 19:52:50 +0100
Subject: [PATCH 029/214] Make parameter naming more consistent for clipmerge

---
 conf/modules.config                           |  5 ++---
 nextflow.config                               |  6 ++---
 subworkflows/local/shortread_preprocessing.nf |  2 +-
 workflows/taxprofiler.nf                      | 22 ++++++++++++-------
 4 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 050772e..c09a011 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -41,7 +41,7 @@ process {
         // TODO also include option to NOT merge
         ext.args = [
             { ${meta.single_end} } == 0 ? "-m" : '',
-            params.fastp_exclude_unmerged ? '' : "--include_unmerged"
+            params.shortread_excludeunmerged ? '' : "--include_unmerged"
         ].join(' ').trim()
         publishDir = [
             path: { "${params.outdir}/fastp" },
@@ -84,7 +84,7 @@ process {
             pattern: '*.{rma6,tab,text,sam,log}'
         ]
         ext.args = { "${meta.db_params}" }
-        ext.when = params.run_malt
+        ext.prefix = { "${meta.id}-${meta.db_name}" }
     }
 
     withName: KRAKEN2_KRAKEN2 {
@@ -94,7 +94,6 @@ process {
             pattern: '.{fastq.gz,txt}'
         ]
         ext.args = { "${meta.db_params}" }
-        ext.when = params.run_kraken2
         ext.prefix = { "${meta.id}-${meta.db_name}" }
     }
 
diff --git a/nextflow.config b/nextflow.config
index 4a3a56d..5f7aec6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -55,9 +55,9 @@ params {
     databases = null
 
     // FASTQ preprocessing
-    fastp_clip_merge = false
-    fastp_exclude_unmerged = true
-    remove_adapters = false
+    shortread_clipmerge = false
+    shortread_excludeunmerged = true
+    longread_clip = false
 
     // MALT
     run_malt = false
diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf
index 406c198..d996a76 100644
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@@ -23,7 +23,7 @@ workflow SHORTREAD_PREPROCESSING {
 
     // TODO move to subworkflow
-    if ( params.fastp_clip_merge ) {
+    if ( params.shortread_clipmerge ) {
 
         ch_input_for_fastp = reads
             .dump(tag: "pre-fastp_branch")
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 22c7518..4aa0684 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -100,17 +100,14 @@ workflow TAXPROFILER {
     //
     // PERFORM PREPROCESSING
     //
-    if ( params.fastp_clip_merge ) {
+    if ( params.shortread_clipmerge ) {
         SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq )
     }
 
-    ch_multiqc_files = Channel.empty()
-
-    if ( params.remove_adapters ) {
+    if ( params.longread_clip ) {
         ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads
             .map { it -> [ it[0], [it[1]] ] }
         ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first())
-        ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc)
     } else {
         ch_longreads_preprocessed = INPUT_CHECK.out.nanopore
     }
@@ -184,9 +184,13 @@ workflow TAXPROFILER {
     //
     // RUN PROFILING
     //
-    MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
-    KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db )
+    if ( params.run_malt ) {
+        MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
+    }
 
+    if ( params.run_kraken2 ) {
+        KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db )
+    }
 
     //
     // MODULE: MultiQC
@@ -198,14 +202,19 @@ workflow TAXPROFILER {
     workflow_summary    = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params)
     ch_workflow_summary = Channel.value(workflow_summary)
 
+    ch_multiqc_files = Channel.empty()
     ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config))
     ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([]))
     ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
     ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
+
+    if (params.shortread_clipmerge) {
         ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_PREPROCESSING.out.mqc)
     }
+    if (params.longread_clip) {
+        ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc)
+    }
     if (params.run_kraken2) {
         ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]))
         ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first())
From 07eed435c628ae0518246aa31e87d81fb02c69ad Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 21 Mar 2022 19:54:51 +0100
Subject: [PATCH 030/214] Replace set with explicity assignment

---
 subworkflows/local/db_check.nf | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 641108d..890e373 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -19,12 +19,11 @@ workflow DB_CHECK {
         .map { create_db_channels(it) }
         .dump(tag: "db_channel_prepped")
 
-    parsed_samplesheet
+    ch_dbs_for_untar = parsed_samplesheet
         .branch {
             untar: it[1].toString().endsWith(".tar.gz")
             skip: true
         }
-        .set{ ch_dbs_for_untar }
 
     // TODO Filter to only run UNTAR on DBs of tools actually using?
     // TODO make optional whether to save
From 81bfb629cadfa76ac6f639f320f3bd0fb0ae4dfd Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 21 Mar 2022 20:28:09 +0100
Subject: [PATCH 031/214] Add working basic test to begin

---
 .github/workflows/ci.yml          |  1 +
 conf/modules.config               | 16 ++++++++++++++++
 conf/test.config                  |  8 +++++++-
 subworkflows/local/input_check.nf |  6 +++---
 workflows/taxprofiler.nf          | 13 ++++++++-----
 5 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 033eb63..5fe2777 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -48,3 +48,4 @@ jobs:
         # Remember that you can parallelise this by using strategy.matrix
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
+          # TODO Add test that runs with pre-downloaded and decompressed databases
diff --git a/conf/modules.config b/conf/modules.config
index ab8f021..b9c1008 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -26,6 +26,22 @@ process {
         ]
     }
 
+    withName: DATABASE_CHECK {
+        publishDir = [
+            path: { "${params.outdir}/pipeline_info" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
+    withName: UNTAR {
+        publishDir = [
+            path: { "${params.outdir}/databases" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
     withName: FASTQC {
         ext.args = '--quiet'
         ext.prefix = { "${meta.id}_${meta.run_accession}_raw" }
diff --git a/conf/test.config b/conf/test.config
index 51f3bb6..42d8de6 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -22,6 +22,12 @@ params {
     // Input data
     // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
     // TODO nf-core: Give any required params for the test so that command line flags are not needed
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
+    input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
+    outdir              = "./results"
+    // TODO replace with official once ready
+    databases           = 'https://raw.githubusercontent.com/jfy133/nf-core-test-datasets/taxprofiler/database.csv'
+    run_kraken2         = true
+    run_malt            = true
+    shortread_clipmerge = true
 }
 
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 67dadc2..4501386 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -30,7 +30,7 @@ workflow INPUT_CHECK {
         .set { nanopore }
 
     parsed_samplesheet.fasta
-        .map { create_fasta_channels(it) }
+        .map { create_fasta_channel(it) }
         .dump(tag: "fasta_channel_init")
         .set { fasta }
 
@@ -42,7 +42,7 @@ workflow INPUT_CHECK {
 }
 
 // Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
-def create_fastq_channels(LinkedHashMap row) {
+def create_fastq_channel(LinkedHashMap row) {
     // create meta map
     def meta = [:]
     meta.id = row.sample
@@ -74,7 +74,7 @@ def create_fastq_channel(LinkedHashMap row) {
 }
 
 // Function to get list of [ meta, fasta ]
-def create_fasta_channels(LinkedHashMap row) {
+def create_fasta_channel(LinkedHashMap row) {
     def meta = [:]
     meta.id = row.sample
     meta.run_accession = row.run_accession
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 4aa0684..6fc5450 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -101,7 +101,9 @@ workflow TAXPROFILER {
     // PERFORM PREPROCESSING
     //
     if ( params.shortread_clipmerge ) {
-        SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq )
+        ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads
+    } else {
+        ch_shortreads_preprocessed = INPUT_CHECK.out.fastq
     }
 
@@ -113,9 +115,10 @@ workflow TAXPROFILER {
     //
-    // PERFORM RUN MERGING
+    // PERFORM SHORT READ RUN MERGING
+    // TODO: Check not necessary for long reads too?
     //
-    ch_processed_for_combine = SHORTREAD_PREPROCESSING.out.reads
+    ch_processed_for_combine = ch_shortreads_preprocessed
         .dump(tag: "prep_for_combine_grouping")
         .map {
             meta, reads ->
@@ -140,7 +143,7 @@ workflow TAXPROFILER {
     // COMBINE READS WITH POSSIBLE DATABASES
     //
 
-    // output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90]
+    // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90]
     ch_input_for_profiling = ch_reads_for_profiling
         .mix( ch_longreads_preprocessed )
         .combine(DB_CHECK.out.dbs)
@@ -152,7 +155,7 @@ workflow TAXPROFILER {
     }
 
     //
-    // PREP PROFILER INPUT CHANNELS ON PER TOOL BASIS
+    // PREPARE PROFILER INPUT CHANNELS
     //
 
     // We groupTuple to have all samples in one channel for MALT as database
From 358b89a4c6d18f195c7aae3115e28080cfa32b3e Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 21 Mar 2022 20:30:29 +0100
Subject: [PATCH 032/214] Linting

---
 conf/modules.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/modules.config b/conf/modules.config
index b9c1008..29a5135 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -35,7 +35,7 @@ process {
     }
 
     withName: UNTAR {
-        publishDir = [
+        publishDir = [
             path: { "${params.outdir}/databases" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
From 48b6ef508dbb0d0a35b91aa8e670616f640b36ae Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 23 Mar 2022 10:06:01 +0100
Subject: [PATCH 033/214] Update path to samplesheet

---
 conf/test.config | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index 42d8de6..2e08499 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -24,8 +24,7 @@ params {
     // TODO nf-core: Give any required params for the test so that command line flags are not needed
     input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
     outdir              = "./results"
-    // TODO replace with official once ready
-    databases           = 'https://raw.githubusercontent.com/jfy133/nf-core-test-datasets/taxprofiler/database.csv'
+    databases           = 'https://raw.githubusercontent.com/nf-core/nf-core-test-datasets/taxprofiler/database.csv'
     run_kraken2         = true
     run_malt            = true
     shortread_clipmerge = true
From 038a8d106a3e027d48c781ab15197270ec38e7a5 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Wed, 23 Mar 2022 11:25:47 +0100
Subject: [PATCH 034/214] Fix test with correct URL

---
 conf/test.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/test.config b/conf/test.config
index 2e08499..5924d7a 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -24,7 +24,7 @@ params {
     // TODO nf-core: Give any required params for the test so that command line flags are not needed
     input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
     outdir              = "./results"
-    databases           = 'https://raw.githubusercontent.com/nf-core/nf-core-test-datasets/taxprofiler/database.csv'
+    databases           = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
     run_kraken2         = true
     run_malt            = true
     shortread_clipmerge = true
Fellows Yates" Date: Wed, 23 Mar 2022 11:25:47 +0100 Subject: [PATCH 034/214] Fix test with correct URL --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index 2e08499..5924d7a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -24,7 +24,7 @@ params { // TODO nf-core: Give any required params for the test so that command line flags are not needed input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' outdir = "./results" - databases = 'https://raw.githubusercontent.com/nf-core/nf-core-test-datasets/taxprofiler/database.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' run_kraken2 = true run_malt = true shortread_clipmerge = true From 3ff54e620e9b9212a3bad5c687769b8c37e5b89d Mon Sep 17 00:00:00 2001 From: sofstam Date: Thu, 24 Mar 2022 12:51:45 +0100 Subject: [PATCH 035/214] Add centrifuge classification --- conf/modules.config | 10 +++ conf/test.config | 1 + modules.json | 5 +- modules/nf-core/modules/centrifuge/main.nf | 63 ++++++++++++++++++ modules/nf-core/modules/centrifuge/meta.yml | 73 +++++++++++++++++++++ nextflow.config | 8 ++- subworkflows/local/db_check.nf | 2 +- subworkflows/local/input_check.nf | 3 +- workflows/taxprofiler.nf | 22 +++++-- 9 files changed, 179 insertions(+), 8 deletions(-) create mode 100644 modules/nf-core/modules/centrifuge/main.nf create mode 100644 modules/nf-core/modules/centrifuge/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 29a5135..20e6bba 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -121,4 +121,14 @@ process { ] } + withName: CENTRIFUGE { + publishDir = [ + path: { "${params.outdir}/centrifuge/${meta.db_name}" }, + mode: 'copy', + pattern: '*.{fastq.gz,txt}' + ] + ext.args = { "${meta.db_params}" } + ext.prefix = { "${meta.id}-${meta.db_name}" } + } + } diff --git a/conf/test.config b/conf/test.config index 42d8de6..6fca9c0 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,5 +29,6 @@ params { run_kraken2 = true run_malt = true shortread_clipmerge = true + run_centrifuge = true } diff --git a/modules.json b/modules.json index 673a69b..b9dfc87 100644 --- a/modules.json +++ b/modules.json @@ -29,6 +29,9 @@ "porechop": { "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046" } + "centrifuge": { + "git_sha": "ea41a8a6f761b9993d857570e872abaae3fea555" + } } } -} \ No newline at end of file +} diff --git a/modules/nf-core/modules/centrifuge/main.nf b/modules/nf-core/modules/centrifuge/main.nf new file mode 100644 index 0000000..7eb566d --- /dev/null +++ b/modules/nf-core/modules/centrifuge/main.nf @@ -0,0 +1,63 @@ +process CENTRIFUGE { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
From e3be393f16025f76cb8bc2389ccb2eb841b569f5 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Thu, 24 Mar 2022 14:18:15 +0100
Subject: [PATCH 036/214] Fix broken json

---
 modules.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules.json b/modules.json
index c24ff6a..b78ba31 100644
--- a/modules.json
+++ b/modules.json
@@ -26,9 +26,10 @@
         },
         "untar": {
             "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918"
+        },
         "porechop": {
             "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046"
         }
     }
 }
-}
+}
\ No newline at end of file
+ pattern: "*.{kreport.txt}" + - fastq_unmapped: + type: file + description: Unmapped fastq files + pattern: "*.unmapped.fastq.gz" + - fastq_mapped: + type: file + description: Mapped fastq files + pattern: "*.mapped.fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@sofstam" + - "@jfy133" + - "@sateeshperi" diff --git a/nextflow.config b/nextflow.config index 5f7aec6..5bd8f39 100644 --- a/nextflow.config +++ b/nextflow.config @@ -56,7 +56,7 @@ params { // FASTQ preprocessing shortread_clipmerge = false - shortread_excludeunmerged = true + shortread_excludeunmerged = true longread_clip = false // MALT @@ -65,6 +65,12 @@ params { // kraken2 run_kraken2 = false + + // centrifuge + run_centrifuge = false + save_unaligned = false + save_aligned = false + sam_format = false } // Load base.config by default for all pipelines diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf index 890e373..28268c3 100644 --- a/subworkflows/local/db_check.nf +++ b/subworkflows/local/db_check.nf @@ -21,7 +21,7 @@ workflow DB_CHECK { ch_dbs_for_untar = parsed_samplesheet .branch { - untar: it[1].toString().endsWith(".tar.gz") + untar: it[1].toString().endsWith(".tar.gz") && it[0]['tool']!="centrifuge" skip: true } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 4501386..b64e31e 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -67,8 +67,9 @@ def create_fastq_channel(LinkedHashMap row) { if (!file(row.fastq_2).exists()) { exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] + fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] } + } return fastq_meta } diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 6fc5450..ea3ef18 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -58,7 +58,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/ include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' - +include { CENTRIFUGE } from '../modules/nf-core/modules/centrifuge/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -149,9 +149,10 @@ workflow TAXPROFILER { .combine(DB_CHECK.out.dbs) .dump(tag: "reads_plus_db") .branch { - malt: it[2]['tool'] == 'malt' - kraken2: it[2]['tool'] == 'kraken2' - unknown: true + malt: it[2]['tool'] == 'malt' + kraken2: it[2]['tool'] == 'kraken2' + centrifuge: it[2]['tool'] == 'centrifuge' + unknown: true } // @@ -184,6 +185,15 @@ workflow TAXPROFILER { db: it[3] } + // We can run centrifuge one-by-one sample-wise + ch_input_for_centrifuge = ch_input_for_profiling.centrifuge + .dump(tag: "input for centrifuge") + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + // // RUN PROFILING // @@ -195,6 +205,10 @@ workflow TAXPROFILER { KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) } + if ( params.run_centrifuge ) { + CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.save_unaligned, params.save_aligned, params.sam_format ) + } + // // MODULE: MultiQC // From e3be393f16025f76cb8bc2389ccb2eb841b569f5 Mon Sep 17 00:00:00 2001 From: James Fellows 
From cc9368ee527243fc1809f891154ae0eb569887f3 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Thu, 24 Mar 2022 15:35:09 +0100
Subject: [PATCH 038/214] Prettier linting and update schema

---
 nextflow.config          |  2 +-
 nextflow_schema.json     | 48 ++++++++++++++++++++++++++++++++++++----
 workflows/taxprofiler.nf |  3 ++-
 3 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 5f7aec6..cc77a99 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -56,7 +56,7 @@ params {
     // FASTQ preprocessing
     shortread_clipmerge = false
-    shortread_excludeunmerged = true
+    shortread_excludeunmerged = true
     longread_clip = false
 
     // MALT
diff --git a/nextflow_schema.json b/nextflow_schema.json
index de1e515..03910e9 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,7 +10,10 @@
         "type": "object",
         "fa_icon": "fas fa-terminal",
         "description": "Define where the pipeline should find input data and save output data.",
-        "required": ["input", "outdir"],
+        "required": [
+            "input",
+            "outdir"
+        ],
         "properties": {
             "input": {
                 "type": "string",
@@ -173,7 +176,14 @@
             "description": "Method used to save pipeline results to output directory.",
             "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
             "fa_icon": "fas fa-copy",
-            "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+            "enum": [
+                "symlink",
+                "rellink",
+                "link",
+                "copy",
+                "copyNoFollow",
+                "move"
+            ],
             "hidden": true
         },
         "email_on_fail": {
@@ -256,5 +266,35 @@
         {
             "$ref": "#/definitions/generic_options"
         }
-    ]
-}
+    ],
+    "properties": {
+        "databases": {
+            "type": "string",
+            "default": null
+        },
+        "shortread_clipmerge": {
+            "type": "string",
+            "default": "false"
+        },
+        "shortread_excludeunmerged": {
+            "type": "string",
+            "default": "true"
+        },
+        "longread_clip": {
+            "type": "string",
+            "default": "false"
+        },
+        "run_malt": {
+            "type": "string",
+            "default": "false"
+        },
+        "malt_mode": {
+            "type": "string",
+            "default": "BlastN"
+        },
+        "run_kraken2": {
+            "type": "string",
+            "default": "false"
+        }
+    }
+}
\ No newline at end of file
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 7320922..29058e6 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -88,8 +88,9 @@ workflow TAXPROFILER {
     //
     // MODULE: Run FastQC
     //
+    ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq")
     FASTQC (
-        INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore )
+        ch_input_for_fastqc
     )
     ch_versions = ch_versions.mix(FASTQC.out.versions.first())
 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -256,5 +266,35 @@ { "$ref": "#/definitions/generic_options" } - ] -} + ], + "properties": { + "databases": { + "type": "string", + "default": null + }, + "shortread_clipmerge": { + "type": "string", + "default": "false" + }, + "shortread_excludeunmerged": { + "type": "string", + "default": "true" + }, + "longread_clip": { + "type": "string", + "default": "false" + }, + "run_malt": { + "type": "string", + "default": "false" + }, + "malt_mode": { + "type": "string", + "default": "BlastN" + }, + "run_kraken2": { + "type": "string", + "default": "false" + } + } +} \ No newline at end of file diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 7320922..29058e6 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -88,8 +88,9 @@ workflow TAXPROFILER { // // MODULE: Run FastQC // + ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq") FASTQC ( - INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ) + ch_input_for_fastqc ) ch_versions = ch_versions.mix(FASTQC.out.versions.first()) From 494c641fb8ade3f6f6e73c537f699df182b16b9d Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 24 Mar 2022 15:37:42 +0100 Subject: [PATCH 039/214] Prettier --- .prettierignore | 2 ++ modules.json | 2 +- nextflow_schema.json | 16 +++------------- 3 files changed, 6 insertions(+), 14 deletions(-) create mode 100644 .prettierignore diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..c037321 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,2 @@ +testing/ +tests/ diff --git a/modules.json b/modules.json index b78ba31..9e6428a 100644 --- a/modules.json +++ b/modules.json @@ -32,4 +32,4 @@ } } } -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 03910e9..7bb03c3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
            "fa_icon": "fas fa-copy",
-            "enum": [
-                "symlink",
-                "rellink",
-                "link",
-                "copy",
-                "copyNoFollow",
-                "move"
-            ],
+            "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
            "hidden": true
        },
        "email_on_fail": {
@@ -297,4 +287,4 @@
            "default": "false"
        }
    }
-} \ No newline at end of file
+}

From 92a3c8ec8d6e6698f748f154b0dfeaf92fd9d52b Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Thu, 24 Mar 2022 15:40:14 +0100
Subject: [PATCH 040/214] Prettier

---
 nextflow_schema.json | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 7bb03c3..9527da4 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -260,31 +260,27 @@
    "properties": {
        "databases": {
            "type": "string",
-            "default": null
+            "default": "None"
        },
        "shortread_clipmerge": {
-            "type": "string",
-            "default": "false"
+            "type": "boolean"
        },
        "shortread_excludeunmerged": {
-            "type": "string",
-            "default": "true"
+            "type": "boolean",
+            "default": true
        },
        "longread_clip": {
-            "type": "string",
-            "default": "false"
+            "type": "boolean"
        },
        "run_malt": {
-            "type": "string",
-            "default": "false"
+            "type": "boolean"
        },
        "malt_mode": {
            "type": "string",
            "default": "BlastN"
        },
        "run_kraken2": {
-            "type": "string",
-            "default": "false"
+            "type": "boolean"
        }
    }
}
From fafb7e0f6fce56a57f6d375976b3cffe37ebe14b Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Thu, 24 Mar 2022 16:59:12 +0100
Subject: [PATCH 041/214] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0c1d3c7..e976f73 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# ![nf-core/taxprofiler](docs/images/nf-core/taxprofiler_logo_light.png#gh-light-mode-only) ![nf-core/taxprofiler](docs/images/nf-core/taxprofiler_logo_dark.png#gh-dark-mode-only)
+# ![nf-core/taxprofiler](docs/images/nf-core-taxprofiler_logo_light.png#gh-light-mode-only) ![nf-core/taxprofiler](docs/images/nf-core-taxprofiler_logo_dark.png#gh-dark-mode-only)

 [![GitHub Actions CI Status](https://github.com/nf-core/taxprofiler/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/taxprofiler/actions?query=workflow%3A%22nf-core+CI%22)
 [![GitHub Actions Linting Status](https://github.com/nf-core/taxprofiler/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/taxprofiler/actions?query=workflow%3A%22nf-core+linting%22)

From 4e93abc7c06b35eae584af4338c8aeaa60b8fcf8 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Fri, 25 Mar 2022 14:58:06 +0100
Subject: [PATCH 042/214] Add improved read preprocessing

---
 conf/modules.config                           | 21 +++++-
 nextflow.config                               | 12 +++-
 nextflow_schema.json                          |  2 +-
 subworkflows/local/shortread_fastp.nf         | 65 +++++++++++++++++++
 subworkflows/local/shortread_preprocessing.nf | 55 ++++------------
 workflows/taxprofiler.nf                      |  2 +
 6 files changed, 107 insertions(+), 50 deletions(-)
 create mode 100644 subworkflows/local/shortread_fastp.nf

diff --git a/conf/modules.config b/conf/modules.config
index 29a5135..36bc626 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -52,12 +52,27 @@ process {
         ]
     }

+    withName: FASTQC_PROCESSED {
+        ext.args = '--quiet'
+        ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
+        publishDir = [
+            path: { "${params.outdir}/fastqc/processed" },
+            mode: 'copy',
+            pattern: '*.html'
+        ]
+    }
+
    withName: FASTP {
        ext.prefix = {
"${meta.id}_${meta.run_accession}" } - // TODO also include option to NOT merge ext.args = [ - { ${meta.single_end} } == 0 ? "-m" : '', - params.shortread_excludeunmerged ? '' : "--include_unmerged" + // collapsing options + params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged", + // trimming options + params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "", + params.shortread_adapter1 ? "--adapter_sequence ${params.shortread_adapter1}" : "", + !{ ${meta.single_end} } && params.shortread_adapter2 ? "--adapter_sequence_r2 ${params.shortread_adapter2}" : !{ ${meta.single_end} } ? "--detect_adapter_for_pe" : "" + // filtering options + "--length_required ${params.shortread_clipmerge_minlength}" ].join(' ').trim() publishDir = [ path: { "${params.outdir}/fastp" }, diff --git a/nextflow.config b/nextflow.config index cc77a99..a312d0c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -55,9 +55,15 @@ params { databases = null // FASTQ preprocessing - shortread_clipmerge = false - shortread_excludeunmerged = true - longread_clip = false + shortread_clipmerge = false + shortread_clipmerge_tool = 'fastp' + shortread_clipmerge_skiptrim = false + shortread_clipmerge_mergepairs = false + shortread_clipmerge_excludeunmerged = true + shortread_clipmerge_adapter1 = null + shortread_clipmerge_adapter2 = null + shortread_clipmerge_minlength = 15 + longread_clip = false // MALT run_malt = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 9527da4..0fa217f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -265,7 +265,7 @@ "shortread_clipmerge": { "type": "boolean" }, - "shortread_excludeunmerged": { + "shortread_clipmerge_excludeunmerged": { "type": "boolean", "default": true }, diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf new file mode 100644 index 0000000..87aba25 --- /dev/null +++ b/subworkflows/local/shortread_fastp.nf @@ -0,0 +1,65 @@ +// +// Check input samplesheet and get read channels +// + + +include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' +include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' + +workflow SHORTREAD_FASTP { + take: + reads // file: /path/to/samplesheet.csv + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + // + // STEP: Read clipping and merging + // + // TODO give option to retain singletons (probably fastp option likely) + // TODO move to subworkflow + + ch_input_for_fastp = reads + .dump(tag: "pre-fastp_branch") + .branch{ + single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + ch_input_for_fastp.single.dump(tag: "input_fastp_single") + ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") + + FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs ) + + if ( params.shortread_clipmerge_mergepairs ) { + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged + .mix( FASTP_SINGLE.out.reads ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + } else { + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads + .mix( FASTP_SINGLE.out.reads ) + } + + ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) + ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) + + ch_processed_reads = ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped") + + ch_multiqc_files = ch_multiqc_files.mix( 
FASTP_SINGLE.out.json.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) + + ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final") + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index d996a76..c31289d 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -3,17 +3,16 @@ // -include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' -include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' -include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main' +include { SHORTREAD_FASTP } from './shortread_fastp' +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main' workflow SHORTREAD_PREPROCESSING { take: reads // file: /path/to/samplesheet.csv main: - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() // // STEP: Read clipping and merging @@ -22,50 +21,20 @@ workflow SHORTREAD_PREPROCESSING { // TODO give option to retain singletons (probably fastp option likely) // TODO move to subworkflow - - if ( params.shortread_clipmerge ) { - - ch_input_for_fastp = reads - .dump(tag: "pre-fastp_branch") - .branch{ - single: it[0]['single_end'] == true - paired: it[0]['single_end'] == false - } - - ch_input_for_fastp.single.dump(tag: "input_fastp_single") - ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") - - FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) - FASTP_PAIRED ( ch_input_for_fastp.paired, false, true ) - - ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged - .mix( FASTP_SINGLE.out.reads ) - .map { - meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = 1 - [ meta_new, reads ] - } - - FASTQC_POST ( ch_fastp_reads_prepped ) - - ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) - ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) - - ch_processed_reads = ch_fastp_reads_prepped - - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} ) - ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) - ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) - - ch_multiqc_files.dump(tag: "preprocessing_mqc_final") - + if ( params.shortread_clipmerge_tool == "fastp" ) { + ch_processed_reads = SHORTREAD_FASTP ( reads ).reads + ch_versions = ch_versions.mix( SHORTREAD_FASTP.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc ) } else { ch_processed_reads = reads } + //FASTQC_PROCESSED ( ch_processed_reads ) + //ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + //ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) emit: + // TODO: problem, this is being exported as a multi-channel output? 
This is why FASTQC is broken
    reads    = ch_processed_reads // channel: [ val(meta), [ reads ] ]
    versions = ch_versions        // channel: [ versions.yml ]
    mqc      = ch_multiqc_files
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 29058e6..0a907bf 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -17,6 +17,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 // Check mandatory parameters
 if (params.input    ) { ch_input     = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
+if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files."
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -135,6 +136,7 @@ workflow TAXPROFILER {
 
     CAT_FASTQ ( ch_processed_for_combine.combine )
 
+    // TODO May need to flatten reads?
     ch_reads_for_profiling = ch_processed_for_combine.skip
         .dump(tag: "skip_combine")
         .mix( CAT_FASTQ.out.reads )
         .dump(tag: "files_for_profiling")
From ff1f28f4f0b8e88898e0f568f83e2da90ff81bc6 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Fri, 25 Mar 2022 15:01:25 +0100
Subject: [PATCH 043/214] Update schema

---
 conf/modules.config  |  4 ++--
 nextflow_schema.json | 38 +++++++++++++++++++++++++++++++++---
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 36bc626..1052bed 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -69,8 +69,8 @@ process {
             params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged",
             // trimming options
             params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "",
-            params.shortread_adapter1 ? "--adapter_sequence ${params.shortread_adapter1}" : "",
-            !{ ${meta.single_end} } && params.shortread_adapter2 ? "--adapter_sequence_r2 ${params.shortread_adapter2}" : !{ ${meta.single_end} } ? "--detect_adapter_for_pe" : ""
+            params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
+            !{ ${meta.single_end} } && params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : !{ ${meta.single_end} } ? "--detect_adapter_for_pe" : "",
             // filtering options
             "--length_required ${params.shortread_clipmerge_minlength}"
         ].join(' ').trim()
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 0fa217f..42307e9 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,7 +10,10 @@
     "type": "object",
     "fa_icon": "fas fa-terminal",
     "description": "Define where the pipeline should find input data and save output data.",
-    "required": ["input", "outdir"],
+    "required": [
+        "input",
+        "outdir"
+    ],
     "properties": {
        "input": {
            "type": "string",
@@ -173,7 +176,14 @@
            "description": "Method used to save pipeline results to output directory.",
            "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files.
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -281,6 +291,28 @@ }, "run_kraken2": { "type": "boolean" + }, + "shortread_clipmerge_tool": { + "type": "string", + "default": "fastp" + }, + "shortread_clipmerge_skiptrim": { + "type": "boolean" + }, + "shortread_clipmerge_mergepairs": { + "type": "boolean" + }, + "shortread_clipmerge_adapter1": { + "type": "string", + "default": null + }, + "shortread_clipmerge_adapter2": { + "type": "string", + "default": null + }, + "shortread_clipmerge_minlength": { + "type": "integer", + "default": 15 } } -} +} \ No newline at end of file From dfcf8f7b1ae183a233d8a1c94f39a1f27084a966 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 15:10:52 +0100 Subject: [PATCH 044/214] Prettier --- conf/test.config | 1 - nextflow_schema.json | 16 +++------------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/conf/test.config b/conf/test.config index 5924d7a..92a10e4 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,7 +23,6 @@ params { // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - outdir = "./results" databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' run_kraken2 = true run_malt = true diff --git a/nextflow_schema.json b/nextflow_schema.json index 42307e9..545f990 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -315,4 +305,4 @@ "default": 15 } } -} \ No newline at end of file +} From b5f5c755046b3ae1af9648e879dfa48519385891 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 15:21:52 +0100 Subject: [PATCH 045/214] Fix file header descriptions --- subworkflows/local/longread_preprocessing.nf | 9 ++++++--- subworkflows/local/shortread_fastp.nf | 10 +++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index da1049a..58968e8 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -1,6 +1,9 @@ +/* +Process long raw reads with porechop +*/ -include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main' -include { PORECHOP } from '../../modules/nf-core/modules/porechop/main' +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main' +include { PORECHOP } from '../../modules/nf-core/modules/porechop/main' workflow LONGREAD_PREPROCESSING { take: @@ -23,7 +26,7 @@ workflow LONGREAD_PREPROCESSING { FASTQC_POST ( PORECHOP.out.reads ) ch_versions = ch_versions.mix(PORECHOP.out.versions.first()) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) emit: diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index 87aba25..f457cf3 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -1,7 +1,6 @@ -// -// Check input samplesheet and get read channels -// - +/* +Process short raw reads with FastP +*/ include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' @@ -17,9 +16,6 @@ workflow SHORTREAD_FASTP { // // STEP: Read clipping and merging // - // TODO give option to retain singletons (probably fastp option likely) - // TODO move to subworkflow - ch_input_for_fastp = reads .dump(tag: "pre-fastp_branch") .branch{ From 7ddcb09a85ed8c50f5d661b398f0e0a5e113add8 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 15:22:41 +0100 Subject: [PATCH 046/214] Some more cleanup --- .../local/shortread_adapterremoval.nf | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 subworkflows/local/shortread_adapterremoval.nf diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf new file mode 100644 index 0000000..e15d2ef --- /dev/null +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -0,0 +1,64 @@ +/* +Process raw reads with AdapterRemoval +*/ + +include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' +include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' + +workflow SHORTREAD_FASTP { + take: + reads // file: /path/to/samplesheet.csv + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + // + // STEP: Read clipping and merging + // + // TODO give option to retain singletons (probably fastp option 
likely) + // TODO move to subworkflow + + ch_input_for_fastp = reads + .dump(tag: "pre-fastp_branch") + .branch{ + single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + ch_input_for_fastp.single.dump(tag: "input_fastp_single") + ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") + + FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs ) + + if ( params.shortread_clipmerge_mergepairs ) { + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged + .mix( FASTP_SINGLE.out.reads ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + } else { + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads + .mix( FASTP_SINGLE.out.reads ) + } + + ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) + ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) + + ch_processed_reads = ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped") + + ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) + + ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final") + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + From 0e6f7e2ca137fbb8306b733ab3abc3b6d8ad83f7 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 15:24:10 +0100 Subject: [PATCH 047/214] Revert "Some more cleanup" This reverts commit 7ddcb09a85ed8c50f5d661b398f0e0a5e113add8. --- .../local/shortread_adapterremoval.nf | 64 ------------------- 1 file changed, 64 deletions(-) delete mode 100644 subworkflows/local/shortread_adapterremoval.nf diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf deleted file mode 100644 index e15d2ef..0000000 --- a/subworkflows/local/shortread_adapterremoval.nf +++ /dev/null @@ -1,64 +0,0 @@ -/* -Process raw reads with AdapterRemoval -*/ - -include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' -include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' - -workflow SHORTREAD_FASTP { - take: - reads // file: /path/to/samplesheet.csv - - main: - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() - - // - // STEP: Read clipping and merging - // - // TODO give option to retain singletons (probably fastp option likely) - // TODO move to subworkflow - - ch_input_for_fastp = reads - .dump(tag: "pre-fastp_branch") - .branch{ - single: it[0]['single_end'] == true - paired: it[0]['single_end'] == false - } - - ch_input_for_fastp.single.dump(tag: "input_fastp_single") - ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") - - FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) - FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs ) - - if ( params.shortread_clipmerge_mergepairs ) { - ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged - .mix( FASTP_SINGLE.out.reads ) - .map { - meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = 1 - [ meta_new, reads ] - } - } else { - ch_fastp_reads_prepped = FASTP_PAIRED.out.reads - .mix( FASTP_SINGLE.out.reads ) - } - - ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) - ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) - - ch_processed_reads = 
ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped") - - ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) - ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) - - ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final") - - emit: - reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] - versions = ch_versions // channel: [ versions.yml ] - mqc = ch_multiqc_files -} - From b763bfa2c0a7dabe4ad8c4216cf852a3938a49b6 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 15:26:57 +0100 Subject: [PATCH 048/214] More cleanup --- subworkflows/local/shortread_fastp.nf | 3 --- 1 file changed, 3 deletions(-) diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index f457cf3..57ce2a6 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -13,9 +13,6 @@ workflow SHORTREAD_FASTP { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() - // - // STEP: Read clipping and merging - // ch_input_for_fastp = reads .dump(tag: "pre-fastp_branch") .branch{ From ede362dbf91df9f5118f0122f8bfc09d888cb141 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 25 Mar 2022 15:28:30 +0100 Subject: [PATCH 049/214] Remove duplicate config and sync longread fastqc --- conf/modules.config | 10 ---------- subworkflows/local/longread_preprocessing.nf | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 1052bed..7d50174 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -90,16 +90,6 @@ process { ] } - withName: FASTQC_POST { - ext.args = '--quiet' - ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } - publishDir = [ - path: { "${params.outdir}/fastqc/processed" }, - mode: 'copy', - pattern: '*.html' - ] - } - withName: CAT_FASTQ { publishDir = [ path: { "${params.outdir}/prepared_sequences" }, diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index 58968e8..7c7c24b 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -24,7 +24,7 @@ workflow LONGREAD_PREPROCESSING { [ meta_new, reads ] } - FASTQC_POST ( PORECHOP.out.reads ) + FASTQC_PROCESSED ( PORECHOP.out.reads ) ch_versions = ch_versions.mix(PORECHOP.out.versions.first()) ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) From 000f129dab1b139cec6e9194670addf8d9e6173a Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Fri, 25 Mar 2022 15:32:56 +0100 Subject: [PATCH 050/214] Apply suggestions from code review --- subworkflows/local/shortread_preprocessing.nf | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index c31289d..c82536f 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -29,12 +29,11 @@ workflow SHORTREAD_PREPROCESSING { ch_processed_reads = reads } - //FASTQC_PROCESSED ( ch_processed_reads ) - //ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) - //ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) + FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) emit: - // TODO: problem, this is being exported as a multi-channel output? This is why FASTQC is broken reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] versions = ch_versions // channel: [ versions.yml ] mqc = ch_multiqc_files From 16bdc79cc08750e8df36acc8c14aa90e8c522f76 Mon Sep 17 00:00:00 2001 From: sofstam Date: Fri, 25 Mar 2022 16:30:26 +0100 Subject: [PATCH 051/214] Apply prettier --- modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 2dbde56..ffaff90 100644 --- a/modules.json +++ b/modules.json @@ -29,7 +29,7 @@ }, "porechop": { "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046" - } + }, "centrifuge": { "git_sha": "ea41a8a6f761b9993d857570e872abaae3fea555" } From 59d3f18a753bf5b06ab495c98f96df77a84513d5 Mon Sep 17 00:00:00 2001 From: sofstam Date: Fri, 25 Mar 2022 17:18:04 +0100 Subject: [PATCH 052/214] Update nextflow_schema.json --- nextflow_schema.json | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b8d5a1d..b4b3e07 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -293,20 +293,16 @@ "type": "boolean" }, "run_centrifuge": { - "type": "string", - "default": "false" + "type": "boolean" }, "centrifuge_save_unaligned": { - "type": "string", - "default": "false" + "type": "boolean" }, "centrifuge_save_aligned": { - "type": "string", - "default": "false" + "type": "boolean" }, "centrifuge_sam_format": { - "type": "string", - "default": "false" + "type": "boolean" } } -} \ No newline at end of file +} From 120382f51d184bed0375ab8c642f6bdab77a617f Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 26 Mar 2022 06:45:16 +0100 Subject: [PATCH 053/214] Fix input channels to kraken2 --- workflows/taxprofiler.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 0a907bf..c1d7f34 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -37,11 +37,11 @@ ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath(params.multi
//
// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
//
-include { INPUT_CHECK } from '../subworkflows/local/input_check'
+include { INPUT_CHECK             } from '../subworkflows/local/input_check'

-include { DB_CHECK } from '../subworkflows/local/db_check'
+include { DB_CHECK                } from '../subworkflows/local/db_check'
 include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing'
-include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing'
+include { LONGREAD_PREPROCESSING  } from '../subworkflows/local/longread_preprocessing'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -171,7 +171,7 @@ workflow TAXPROFILER {
             [ temp_meta, it[1], db ]
         }
         .groupTuple(by: [0,2])
-        .dump(tag: "input for malt")
+        .dump(tag: "input_for_malt")
         .multiMap {
             it ->
                 reads: [ it[0], it[1].flatten() ]
@@ -180,10 +180,10 @@ workflow TAXPROFILER {

     // We can run Kraken2 one-by-one sample-wise
     ch_input_for_kraken2 = ch_input_for_profiling.kraken2
-        .dump(tag: "input for kraken")
+        .dump(tag: "input_for_kraken")
         .multiMap {
             it ->
-                reads: [ it[0] + it[2], it[1] ]
+                reads: [ it[0] + it[2], it[1].flatten() ]
                 db: it[3]
         }
From 622fafedc88489026d4c8cd5eb62319e736d1375 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Sat, 26 Mar 2022 20:22:35 +0000
Subject: [PATCH 054/214] Preempt centrifuge database untar crash

---
 subworkflows/local/db_check.nf | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 890e373..96f815a 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -12,6 +12,9 @@ workflow DB_CHECK {

     main:
     // TODO: make database sheet check
+    // Checks:
+    // 1) no duplicates,
+    // 2) dbs with no special arguments do not have quotes, e.g. just `,,` and NOT `,"",`
     parsed_samplesheet = DATABASE_CHECK ( dbsheet )
         .csv
         .splitCsv ( header:true, sep:',' )
@@ -21,7 +24,7 @@
 
     ch_dbs_for_untar = parsed_samplesheet
         .branch {
-            untar: it[1].toString().endsWith(".tar.gz")
+            untar: it[1].toString().endsWith(".tar.gz") && it[0]['tool'] != 'centrifuge'
             skip: true
        }
From e6e8ed7cc9904973ce6236c98eb74f8246c3ec09 Mon Sep 17 00:00:00 2001
From: "James A.
Fellows Yates" Date: Sat, 26 Mar 2022 20:54:50 +0000 Subject: [PATCH 055/214] Add some debugging notes --- conf/modules.config | 4 +-- workflows/taxprofiler.nf | 60 +++++++++++++++++++++------------------- 2 files changed, 34 insertions(+), 30 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 7d50174..5823d3f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -105,7 +105,7 @@ process { pattern: '*.{rma6,tab,text,sam,log}' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.db_name}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } withName: KRAKEN2_KRAKEN2 { @@ -115,7 +115,7 @@ process { pattern: '*.{fastq.gz,txt}' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.db_name}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } withName: CUSTOM_DUMPSOFTWAREVERSIONS { diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index c1d7f34..ee921ea 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -74,9 +74,9 @@ workflow TAXPROFILER { ch_versions = Channel.empty() - // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files - // + /* + SUBWORKFLOW: Read in samplesheet, validate and stage input files + */ INPUT_CHECK ( ch_input ) @@ -86,9 +86,9 @@ workflow TAXPROFILER { ch_databases ) - // - // MODULE: Run FastQC - // + /* + MODULE: Run FastQC + */ ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq") FASTQC ( ch_input_for_fastqc @@ -99,9 +99,9 @@ workflow TAXPROFILER { ch_versions.unique().collectFile(name: 'collated_versions.yml') ) - // - // PERFORM PREPROCESSING - // + /* + SUBWORKFLOW: PERFORM PREPROCESSING + */ if ( params.shortread_clipmerge ) { ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads } else { @@ -116,16 +116,19 @@ workflow TAXPROFILER { ch_longreads_preprocessed = INPUT_CHECK.out.nanopore } - // - // PERFORM SHORT READ RUN MERGING + /* + MODULE: PERFORM SHORT READ RUN MERGING + */ + // TODO: Check not necessary for long reads too? - // + // TODO: source of clash - combined should only occur when + // files ARE to be combined. SE/unmerged (see not below) ch_processed_for_combine = ch_shortreads_preprocessed .dump(tag: "prep_for_combine_grouping") .map { meta, reads -> def meta_new = meta.clone() - meta_new['run_accession'] = 'combined' + //meta_new['run_accession'] = 'combined' [ meta_new, reads ] } .groupTuple ( by: 0 ) @@ -134,17 +137,18 @@ workflow TAXPROFILER { skip: it[1].size() < 2 } + // NOTE: this does not allow CATing of SE & PE runs of same sample + // when --shortread_clipmerge_mergepairs is false CAT_FASTQ ( ch_processed_for_combine.combine ) - // TODO May need to flatten reads? ch_reads_for_profiling = ch_processed_for_combine.skip .dump(tag: "skip_combine") .mix( CAT_FASTQ.out.reads ) .dump(tag: "files_for_profiling") - // - // COMBINE READS WITH POSSIBLE DATABASES - // + /* + COMBINE READS WITH POSSIBLE DATABASES + */ // e.g. 
output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] ch_input_for_profiling = ch_reads_for_profiling @@ -157,9 +161,9 @@ workflow TAXPROFILER { unknown: true } - // - // PREPARE PROFILER INPUT CHANNELS - // + /* + PREPARE PROFILER INPUT CHANNELS + */ // We groupTuple to have all samples in one channel for MALT as database // loading takes a long time, so we only want to run it once per database @@ -171,7 +175,7 @@ workflow TAXPROFILER { [ temp_meta, it[1], db ] } .groupTuple(by: [0,2]) - .dump(tag: "input_for_malt") + .dump(tag: "input_to_malt") .multiMap { it -> reads: [ it[0], it[1].flatten() ] @@ -180,16 +184,16 @@ workflow TAXPROFILER { // We can run Kraken2 one-by-one sample-wise ch_input_for_kraken2 = ch_input_for_profiling.kraken2 - .dump(tag: "input_for_kraken") + .dump(tag: "input_to_kraken") .multiMap { it -> reads: [ it[0] + it[2], it[1].flatten() ] db: it[3] } - // - // RUN PROFILING - // + /* + MODULE: RUN PROFILING + */ if ( params.run_malt ) { MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) } @@ -198,9 +202,9 @@ workflow TAXPROFILER { KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) } - // - // MODULE: MultiQC - // + /* + MODULE: MultiQC + */ workflow_summary = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) From 8dc9e583add80f51111dc6c796a2ea1413e8731b Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 27 Mar 2022 09:30:23 +0200 Subject: [PATCH 056/214] Debugging run merging --- conf/modules.config | 6 ++--- .../nf-core/modules/kraken2/kraken2/main.nf | 3 ++- subworkflows/local/shortread_fastp.nf | 10 ++++--- subworkflows/local/shortread_preprocessing.nf | 7 ----- workflows/taxprofiler.nf | 27 ++++++++++++++----- 5 files changed, 32 insertions(+), 21 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 5823d3f..41f471f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -65,7 +65,7 @@ process { withName: FASTP { ext.prefix = { "${meta.id}_${meta.run_accession}" } ext.args = [ - // collapsing options + // collapsing options - option to retain singletons params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged", // trimming options params.shortread_clipmerge_skiptrim ? 
"--disable_adapter_trimming" : "", @@ -105,7 +105,7 @@ process { pattern: '*.{rma6,tab,text,sam,log}' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + ext.prefix = { "${meta.id}-${meta.db_name}" } } withName: KRAKEN2_KRAKEN2 { @@ -115,7 +115,7 @@ process { pattern: '*.{fastq.gz,txt}' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + ext.prefix = { "${meta.id}-${meta.db_name}" } } withName: CUSTOM_DUMPSOFTWAREVERSIONS { diff --git a/modules/nf-core/modules/kraken2/kraken2/main.nf b/modules/nf-core/modules/kraken2/kraken2/main.nf index 3ec5df5..52351f5 100644 --- a/modules/nf-core/modules/kraken2/kraken2/main.nf +++ b/modules/nf-core/modules/kraken2/kraken2/main.nf @@ -32,11 +32,12 @@ process KRAKEN2_KRAKEN2 { --threads $task.cpus \\ --unclassified-out $unclassified \\ --classified-out $classified \\ + $args \\ --report ${prefix}.kraken2.report.txt \\ --gzip-compressed \\ $paired \\ - $args \\ $reads + pigz -p $task.cpus *.fastq diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index 57ce2a6..d4d706e 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -7,7 +7,7 @@ include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fast workflow SHORTREAD_FASTP { take: - reads // file: /path/to/samplesheet.csv + reads // [[meta], [reads]] main: ch_versions = Channel.empty() @@ -24,16 +24,18 @@ workflow SHORTREAD_FASTP { ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + // Last parameter here turns on merging of PE data FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs ) if ( params.shortread_clipmerge_mergepairs ) { + // TODO update to replace meta suffix ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged .mix( FASTP_SINGLE.out.reads ) .map { meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = 1 - [ meta_new, reads ] + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] } } else { ch_fastp_reads_prepped = FASTP_PAIRED.out.reads diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index c82536f..7fba0c0 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -14,13 +14,6 @@ workflow SHORTREAD_PREPROCESSING { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() - // - // STEP: Read clipping and merging - // - // TODO give option to clip only and retain pairs - // TODO give option to retain singletons (probably fastp option likely) - // TODO move to subworkflow - if ( params.shortread_clipmerge_tool == "fastp" ) { ch_processed_reads = SHORTREAD_FASTP ( reads ).reads ch_versions = ch_versions.mix( SHORTREAD_FASTP.out.versions ) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index ee921ea..87f7a30 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -120,25 +120,40 @@ workflow TAXPROFILER { MODULE: PERFORM SHORT READ RUN MERGING */ - // TODO: Check not necessary for long reads too? - // TODO: source of clash - combined should only occur when - // files ARE to be combined. SE/unmerged (see not below) + // Remove run accession to allow grouping by sample. Will only merge + // if pairment type is the same. 
+ + // TODO Current Branch system currently problematic - when single file not in a list, splits at + // `/` so makes list >= 2, so tries to merge, but then breaks kraken downstream + // e.g. `home jfellows Documents git nf-core taxprofiler testing work 68 9a2c8362add37832a776058d280bb7 2612_se.merged.fastq.gz` + // So theoretically need to force this into a list, (but results the can't access meta.id error as incorrect input format) + // But second issue >= 2 is MAYBE sufficient because what if merging two paired-end files? Need to chcek if the input channel formatted correctly for this? Need to check... ch_processed_for_combine = ch_shortreads_preprocessed .dump(tag: "prep_for_combine_grouping") .map { meta, reads -> def meta_new = meta.clone() - //meta_new['run_accession'] = 'combined' + + // remove run accession to allow group by sample + meta_new.remove('run_accession') + + // update id to prevent file name clashes when unable to group + // unmerged PE and SE runs of same sample + def type = meta_new['single_end'] ? "_se" : "_pe" + meta_new['id'] = meta['id'] + type + [ meta_new, reads ] } .groupTuple ( by: 0 ) + .dump(tag: "files_for_cat_fastq_branch") .branch{ - combine: it[1].size() >= 2 - skip: it[1].size() < 2 + combine: it[1] && it[1].size() > 1 + skip: true } // NOTE: this does not allow CATing of SE & PE runs of same sample // when --shortread_clipmerge_mergepairs is false + ch_processed_for_combine.combine.dump(tag: "input_into_cat_fastq") CAT_FASTQ ( ch_processed_for_combine.combine ) ch_reads_for_profiling = ch_processed_for_combine.skip From ee53283696261fa2687fa55402e3cfa3da164509 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 28 Mar 2022 09:13:05 +0200 Subject: [PATCH 057/214] Debugging comment --- workflows/taxprofiler.nf | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 87f7a30..8a05720 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -118,7 +118,6 @@ workflow TAXPROFILER { /* MODULE: PERFORM SHORT READ RUN MERGING - */ // Remove run accession to allow grouping by sample. Will only merge // if pairment type is the same. @@ -128,6 +127,7 @@ workflow TAXPROFILER { // e.g. `home jfellows Documents git nf-core taxprofiler testing work 68 9a2c8362add37832a776058d280bb7 2612_se.merged.fastq.gz` // So theoretically need to force this into a list, (but results the can't access meta.id error as incorrect input format) // But second issue >= 2 is MAYBE sufficient because what if merging two paired-end files? Need to chcek if the input channel formatted correctly for this? Need to check... + ch_processed_for_combine = ch_shortreads_preprocessed .dump(tag: "prep_for_combine_grouping") .map { @@ -160,6 +160,9 @@ workflow TAXPROFILER { .dump(tag: "skip_combine") .mix( CAT_FASTQ.out.reads ) .dump(tag: "files_for_profiling") + */ + + ch_reads_for_profiling = ch_shortreads_preprocessed /* COMBINE READS WITH POSSIBLE DATABASES @@ -198,6 +201,7 @@ workflow TAXPROFILER { } // We can run Kraken2 one-by-one sample-wise + // TODO Only flatten when paired-end! Causing issue commented out above! 
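+    // Sketch of the channel shapes assumed here (cf. the reads_plus_db dump):
+    // each incoming tuple is [ sample_meta, reads, db_meta, db_path ], and the
+    // multiMap below splits it into two synchronised channels,
+    //   reads: [ sample_meta + db_meta, reads.flatten() ]
+    //   db:    db_path
+    // which KRAKEN2_KRAKEN2 then consumes pairwise.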
ch_input_for_kraken2 = ch_input_for_profiling.kraken2 .dump(tag: "input_to_kraken") .multiMap { From 0e0e8128e868df6ea8a4f4c22d7841f518d84c9f Mon Sep 17 00:00:00 2001 From: sofstam Date: Mon, 28 Mar 2022 13:43:32 +0200 Subject: [PATCH 058/214] Prettier format --- nextflow_schema.json | 579 +++++++++++++++++++++---------------------- 1 file changed, 288 insertions(+), 291 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b4b3e07..b61a50e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,308 +1,305 @@ { - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/nf-core/taxprofiler/master/nextflow_schema.json", - "title": "nf-core/taxprofiler pipeline parameters", - "description": "Taxonomic profiling of shotgun metagenomic data", - "type": "object", - "definitions": { - "input_output_options": { - "title": "Input/output options", - "type": "object", - "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], - "properties": { - "input": { - "type": "string", - "format": "file-path", - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "schema": "assets/schema_input.json", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" - }, - "outdir": { - "type": "string", - "format": "directory-path", - "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", - "fa_icon": "fas fa-folder-open" - }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - }, - "multiqc_title": { - "type": "string", - "description": "MultiQC report title. 
Printed as page header, used for filename if not otherwise specified.", - "fa_icon": "fas fa-file-signature" - } - } + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/taxprofiler/master/nextflow_schema.json", + "title": "nf-core/taxprofiler pipeline parameters", + "description": "Taxonomic profiling of shotgun metagenomic data", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": ["input", "outdir"], + "properties": { + "input": { + "type": "string", + "format": "file-path", + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$", + "schema": "assets/schema_input.json", + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).", + "fa_icon": "fas fa-file-csv" }, - "reference_genome_options": { - "title": "Reference genome options", - "type": "object", - "fa_icon": "fas fa-dna", - "description": "Reference genome related files and options required for the workflow.", - "properties": { - "genome": { - "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." - }, - "igenomes_base": { - "type": "string", - "format": "directory-path", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true - }, - "igenomes_ignore": { - "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." - } - } + "outdir": { + "type": "string", + "format": "directory-path", + "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "fa_icon": "fas fa-folder-open" }, - "institutional_config_options": { - "title": "Institutional config options", - "type": "object", - "fa_icon": "fas fa-university", - "description": "Parameters used to describe centralised config profiles. These should not be edited.", - "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. 
You should not need to change these values when you run a pipeline.", - "properties": { - "custom_config_version": { - "type": "string", - "description": "Git commit id for Institutional configs.", - "default": "master", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "custom_config_base": { - "type": "string", - "description": "Base directory for Institutional configs.", - "default": "https://raw.githubusercontent.com/nf-core/configs/master", - "hidden": true, - "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", - "fa_icon": "fas fa-users-cog" - }, - "config_profile_name": { - "type": "string", - "description": "Institutional config name.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_description": { - "type": "string", - "description": "Institutional config description.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_contact": { - "type": "string", - "description": "Institutional config contact information.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_url": { - "type": "string", - "description": "Institutional config URL link.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - } - } + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" }, - "max_job_request_options": { - "title": "Max job request options", - "type": "object", - "fa_icon": "fab fa-acquisitions-incorporated", - "description": "Set the top limit for requested resources for any single job.", - "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", - "properties": { - "max_cpus": { - "type": "integer", - "description": "Maximum number of CPUs that can be requested for any single job.", - "default": 16, - "fa_icon": "fas fa-microchip", - "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" - }, - "max_memory": { - "type": "string", - "description": "Maximum amount of memory that can be requested for any single job.", - "default": "128.GB", - "fa_icon": "fas fa-memory", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_memory '8.GB'`" - }, - "max_time": { - "type": "string", - "description": "Maximum amount of time that can be requested for any single job.", - "default": "240.h", - "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", - "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" - } - } - }, - "generic_options": { - "title": "Generic options", - "type": "object", - "fa_icon": "fas fa-file-import", - "description": "Less common options for the pipeline, typically set in a config file.", - "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", - "properties": { - "help": { - "type": "boolean", - "description": "Display help text.", - "fa_icon": "fas fa-question-circle", - "hidden": true - }, - "publish_dir_mode": { - "type": "string", - "default": "copy", - "description": "Method used to save pipeline results to output directory.", - "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", - "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], - "hidden": true - }, - "email_on_fail": { - "type": "string", - "description": "Email address for completion summary, only when pipeline fails.", - "fa_icon": "fas fa-exclamation-triangle", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", - "help_text": "An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.", - "hidden": true - }, - "plaintext_email": { - "type": "boolean", - "description": "Send plain-text email instead of HTML.", - "fa_icon": "fas fa-remove-format", - "hidden": true - }, - "max_multiqc_email_size": { - "type": "string", - "description": "File size limit when attaching MultiQC reports to summary emails.", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "default": "25.MB", - "fa_icon": "fas fa-file-upload", - "hidden": true - }, - "monochrome_logs": { - "type": "boolean", - "description": "Do not use coloured log outputs.", - "fa_icon": "fas fa-palette", - "hidden": true - }, - "multiqc_config": { - "type": "string", - "description": "Custom config file to supply to MultiQC.", - "fa_icon": "fas fa-cog", - "hidden": true - }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, - "validate_params": { - "type": "boolean", - "description": "Boolean whether to validate parameters against the schema at runtime", - "default": true, - "fa_icon": "fas fa-check-square", - "hidden": true - }, - "show_hidden_params": { - "type": "boolean", - "fa_icon": "far fa-eye-slash", - "description": "Show all params when using `--help`", - "hidden": true, - "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. 
Specifying this option will tell the pipeline to show all parameters." - }, - "enable_conda": { - "type": "boolean", - "description": "Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.", - "hidden": true, - "fa_icon": "fas fa-bacon" - } - } + "multiqc_title": { + "type": "string", + "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", + "fa_icon": "fas fa-file-signature" } + } }, - "allOf": [ - { - "$ref": "#/definitions/input_output_options" + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Reference genome related files and options required for the workflow.", + "properties": { + "genome": { + "type": "string", + "description": "Name of iGenomes reference.", + "fa_icon": "fas fa-book", + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." }, - { - "$ref": "#/definitions/reference_genome_options" + "igenomes_base": { + "type": "string", + "format": "directory-path", + "description": "Directory / URL base for iGenomes references.", + "default": "s3://ngi-igenomes/igenomes", + "fa_icon": "fas fa-cloud-download-alt", + "hidden": true }, - { - "$ref": "#/definitions/institutional_config_options" - }, - { - "$ref": "#/definitions/max_job_request_options" - }, - { - "$ref": "#/definitions/generic_options" + "igenomes_ignore": { + "type": "boolean", + "description": "Do not load the iGenomes reference config.", + "fa_icon": "fas fa-ban", + "hidden": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." } - ], - "properties": { - "databases": { - "type": "string", - "default": "None" + } + }, + "institutional_config_options": { + "title": "Institutional config options", + "type": "object", + "fa_icon": "fas fa-university", + "description": "Parameters used to describe centralised config profiles. These should not be edited.", + "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", + "properties": { + "custom_config_version": { + "type": "string", + "description": "Git commit id for Institutional configs.", + "default": "master", + "hidden": true, + "fa_icon": "fas fa-users-cog" }, - "shortread_clipmerge": { - "type": "boolean" + "custom_config_base": { + "type": "string", + "description": "Base directory for Institutional configs.", + "default": "https://raw.githubusercontent.com/nf-core/configs/master", + "hidden": true, + "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. 
If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", + "fa_icon": "fas fa-users-cog" }, - "shortread_excludeunmerged": { - "type": "boolean", - "default": true + "config_profile_name": { + "type": "string", + "description": "Institutional config name.", + "hidden": true, + "fa_icon": "fas fa-users-cog" }, - "longread_clip": { - "type": "boolean" + "config_profile_description": { + "type": "string", + "description": "Institutional config description.", + "hidden": true, + "fa_icon": "fas fa-users-cog" }, - "run_malt": { - "type": "boolean" + "config_profile_contact": { + "type": "string", + "description": "Institutional config contact information.", + "hidden": true, + "fa_icon": "fas fa-users-cog" }, - "malt_mode": { - "type": "string", - "default": "BlastN" - }, - "run_kraken2": { - "type": "boolean" - }, - "run_centrifuge": { - "type": "boolean" - }, - "centrifuge_save_unaligned": { - "type": "boolean" - }, - "centrifuge_save_aligned": { - "type": "boolean" - }, - "centrifuge_sam_format": { - "type": "boolean" + "config_profile_url": { + "type": "string", + "description": "Institutional config URL link.", + "hidden": true, + "fa_icon": "fas fa-users-cog" } + } + }, + "max_job_request_options": { + "title": "Max job request options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", + "properties": { + "max_cpus": { + "type": "integer", + "description": "Maximum number of CPUs that can be requested for any single job.", + "default": 16, + "fa_icon": "fas fa-microchip", + "hidden": true, + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + }, + "max_memory": { + "type": "string", + "description": "Maximum amount of memory that can be requested for any single job.", + "default": "128.GB", + "fa_icon": "fas fa-memory", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "hidden": true, + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + }, + "max_time": { + "type": "string", + "description": "Maximum amount of time that can be requested for any single job.", + "default": "240.h", + "fa_icon": "far fa-clock", + "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "hidden": true, + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_time '2.h'`" + } + } + }, + "generic_options": { + "title": "Generic options", + "type": "object", + "fa_icon": "fas fa-file-import", + "description": "Less common options for the pipeline, typically set in a config file.", + "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", + "properties": { + "help": { + "type": "boolean", + "description": "Display help text.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "description": "Method used to save pipeline results to output directory.", + "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", + "fa_icon": "fas fa-copy", + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], + "hidden": true + }, + "email_on_fail": { + "type": "string", + "description": "Email address for completion summary, only when pipeline fails.", + "fa_icon": "fas fa-exclamation-triangle", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", + "help_text": "An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.", + "hidden": true + }, + "plaintext_email": { + "type": "boolean", + "description": "Send plain-text email instead of HTML.", + "fa_icon": "fas fa-remove-format", + "hidden": true + }, + "max_multiqc_email_size": { + "type": "string", + "description": "File size limit when attaching MultiQC reports to summary emails.", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "default": "25.MB", + "fa_icon": "fas fa-file-upload", + "hidden": true + }, + "monochrome_logs": { + "type": "boolean", + "description": "Do not use coloured log outputs.", + "fa_icon": "fas fa-palette", + "hidden": true + }, + "multiqc_config": { + "type": "string", + "description": "Custom config file to supply to MultiQC.", + "fa_icon": "fas fa-cog", + "hidden": true + }, + "tracedir": { + "type": "string", + "description": "Directory to keep pipeline Nextflow logs and reports.", + "default": "${params.outdir}/pipeline_info", + "fa_icon": "fas fa-cogs", + "hidden": true + }, + "validate_params": { + "type": "boolean", + "description": "Boolean whether to validate parameters against the schema at runtime", + "default": true, + "fa_icon": "fas fa-check-square", + "hidden": true + }, + "show_hidden_params": { + "type": "boolean", + "fa_icon": "far fa-eye-slash", + "description": "Show all params when using `--help`", + "hidden": true, + "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "enable_conda": { + "type": "boolean", + "description": "Run this workflow with Conda. 
You can also use '-profile conda' instead of providing this parameter.", + "hidden": true, + "fa_icon": "fas fa-bacon" + } + } } + }, + "allOf": [ + { + "$ref": "#/definitions/input_output_options" + }, + { + "$ref": "#/definitions/reference_genome_options" + }, + { + "$ref": "#/definitions/institutional_config_options" + }, + { + "$ref": "#/definitions/max_job_request_options" + }, + { + "$ref": "#/definitions/generic_options" + } + ], + "properties": { + "databases": { + "type": "string", + "default": "None" + }, + "shortread_clipmerge": { + "type": "boolean" + }, + "shortread_excludeunmerged": { + "type": "boolean", + "default": true + }, + "longread_clip": { + "type": "boolean" + }, + "run_malt": { + "type": "boolean" + }, + "malt_mode": { + "type": "string", + "default": "BlastN" + }, + "run_kraken2": { + "type": "boolean" + }, + "run_centrifuge": { + "type": "boolean" + }, + "centrifuge_save_unaligned": { + "type": "boolean" + }, + "centrifuge_save_aligned": { + "type": "boolean" + }, + "centrifuge_sam_format": { + "type": "boolean" + } + } } From 231253227c7e59c1c108cf4cf4953cb89c4f6f5f Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 28 Mar 2022 16:38:10 +0200 Subject: [PATCH 059/214] Remove run merging for now due to complexity --- conf/modules.config | 4 +- subworkflows/local/shortread_fastp.nf | 17 +++++---- workflows/taxprofiler.nf | 55 ++------------------------- 3 files changed, 14 insertions(+), 62 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 41f471f..a9bc3a1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -105,7 +105,7 @@ process { pattern: '*.{rma6,tab,text,sam,log}' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.db_name}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } withName: KRAKEN2_KRAKEN2 { @@ -115,7 +115,7 @@ process { pattern: '*.{fastq.gz,txt}' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.db_name}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } withName: CUSTOM_DUMPSOFTWAREVERSIONS { diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index d4d706e..c2e435c 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -28,15 +28,16 @@ workflow SHORTREAD_FASTP { FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs ) if ( params.shortread_clipmerge_mergepairs ) { - // TODO update to replace meta suffix - ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged - .mix( FASTP_SINGLE.out.reads ) - .map { - meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = 1 - [ meta_new, reads ] + ch_fastp_reads_prepped_pe = FASTP_PAIRED.out.reads_merged + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] } + + ch_fastp_reads_prepped = ch_fastp_reads_prepped_pe.mix( FASTP_SINGLE.out.reads ) + } else { ch_fastp_reads_prepped = FASTP_PAIRED.out.reads .mix( FASTP_SINGLE.out.reads ) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 8a05720..e73d94d 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -116,63 +116,15 @@ workflow TAXPROFILER { ch_longreads_preprocessed = INPUT_CHECK.out.nanopore } - /* - MODULE: PERFORM SHORT READ RUN MERGING - - // Remove run accession to allow grouping by sample. Will only merge - // if pairment type is the same. 
- - // TODO Current Branch system currently problematic - when single file not in a list, splits at - // `/` so makes list >= 2, so tries to merge, but then breaks kraken downstream - // e.g. `home jfellows Documents git nf-core taxprofiler testing work 68 9a2c8362add37832a776058d280bb7 2612_se.merged.fastq.gz` - // So theoretically need to force this into a list, (but results the can't access meta.id error as incorrect input format) - // But second issue >= 2 is MAYBE sufficient because what if merging two paired-end files? Need to chcek if the input channel formatted correctly for this? Need to check... - - ch_processed_for_combine = ch_shortreads_preprocessed - .dump(tag: "prep_for_combine_grouping") - .map { - meta, reads -> - def meta_new = meta.clone() - - // remove run accession to allow group by sample - meta_new.remove('run_accession') - - // update id to prevent file name clashes when unable to group - // unmerged PE and SE runs of same sample - def type = meta_new['single_end'] ? "_se" : "_pe" - meta_new['id'] = meta['id'] + type - - [ meta_new, reads ] - } - .groupTuple ( by: 0 ) - .dump(tag: "files_for_cat_fastq_branch") - .branch{ - combine: it[1] && it[1].size() > 1 - skip: true - } - - // NOTE: this does not allow CATing of SE & PE runs of same sample - // when --shortread_clipmerge_mergepairs is false - ch_processed_for_combine.combine.dump(tag: "input_into_cat_fastq") - CAT_FASTQ ( ch_processed_for_combine.combine ) - - ch_reads_for_profiling = ch_processed_for_combine.skip - .dump(tag: "skip_combine") - .mix( CAT_FASTQ.out.reads ) - .dump(tag: "files_for_profiling") - */ - - ch_reads_for_profiling = ch_shortreads_preprocessed - /* COMBINE READS WITH POSSIBLE DATABASES */ // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] - ch_input_for_profiling = ch_reads_for_profiling + ch_input_for_profiling = ch_shortreads_preprocessed .mix( ch_longreads_preprocessed ) .combine(DB_CHECK.out.dbs) - .dump(tag: "reads_plus_db") + .dump(tag: "reads_plus_db_clean") .branch { malt: it[2]['tool'] == 'malt' kraken2: it[2]['tool'] == 'kraken2' @@ -201,12 +153,11 @@ workflow TAXPROFILER { } // We can run Kraken2 one-by-one sample-wise - // TODO Only flatten when paired-end! Causing issue commented out above! 
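The `multiMap` just below fans each combined element out into the two queue channels that `KRAKEN2_KRAKEN2` expects, keeping reads and database in lock-step. A minimal self-contained sketch of the same `combine` + `multiMap` pattern follows; the ids, tool names, and file paths in it are invented placeholders, not the pipeline's real channel contents:

```nextflow
// Minimal DSL2 sketch of the combine + multiMap pattern; all values here
// are placeholders for illustration only.
workflow {
    ch_reads = Channel.of( [ [id:'sampleA', single_end:true], file('sampleA.fastq.gz') ] )
    ch_dbs   = Channel.of( [ [tool:'kraken2', db_name:'testdb', db_params:''], file('testdb') ] )

    ch_input = ch_reads
        .combine( ch_dbs )          // emits [ meta, reads, db_meta, db_path ]
        .multiMap { it ->
            reads: [ it[0] + it[2], it[1] ]  // sample meta merged with database meta
            db: it[3]
        }

    // reads and db emit in matching order, so they can be passed as the
    // two positional inputs of a profiling process.
    ch_input.reads.view()
    ch_input.db.view()
}
```

Because `multiMap` sends every incoming element to all of its named outputs, the reads/database pairing survives without an explicit join.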
ch_input_for_kraken2 = ch_input_for_profiling.kraken2 .dump(tag: "input_to_kraken") .multiMap { it -> - reads: [ it[0] + it[2], it[1].flatten() ] + reads: [ it[0] + it[2], it[1] ] db: it[3] } From 98dc8014a526209b24bdd55e321c10b2dc007a88 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 28 Mar 2022 16:46:45 +0200 Subject: [PATCH 060/214] Fix kraken2 module --- modules.json | 8 ++++---- modules/nf-core/modules/kraken2/kraken2/main.nf | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/modules.json b/modules.json index 9e6428a..2494fc1 100644 --- a/modules.json +++ b/modules.json @@ -24,12 +24,12 @@ "multiqc": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, - "untar": { - "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" - }, "porechop": { "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046" + }, + "untar": { + "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/kraken2/kraken2/main.nf b/modules/nf-core/modules/kraken2/kraken2/main.nf index 52351f5..3ec5df5 100644 --- a/modules/nf-core/modules/kraken2/kraken2/main.nf +++ b/modules/nf-core/modules/kraken2/kraken2/main.nf @@ -32,12 +32,11 @@ process KRAKEN2_KRAKEN2 { --threads $task.cpus \\ --unclassified-out $unclassified \\ --classified-out $classified \\ - $args \\ --report ${prefix}.kraken2.report.txt \\ --gzip-compressed \\ $paired \\ + $args \\ $reads - pigz -p $task.cpus *.fastq From eada201eb28f3572af5481a6d69e4f87407a352d Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 28 Mar 2022 16:47:37 +0200 Subject: [PATCH 061/214] Prettier --- modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 2494fc1..34ef9f1 100644 --- a/modules.json +++ b/modules.json @@ -32,4 +32,4 @@ } } } -} \ No newline at end of file +} From 94e5cfef4aa7c2dd69e72e7b1b12439110ed1a09 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 28 Mar 2022 18:20:10 +0200 Subject: [PATCH 062/214] Filter long reads for MALT, bump cpus for FastQC for minigut to pass --- conf/base.config | 2 +- conf/test.config | 15 ++++++++++++++- subworkflows/local/db_check.nf | 2 +- workflows/taxprofiler.nf | 6 ++++++ 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index 0c65574..2f0f97f 100644 --- a/conf/base.config +++ b/conf/base.config @@ -27,7 +27,7 @@ process { // TODO nf-core: Customise requirements for specific processes. 
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_low { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } + cpus = { check_max( 4 * task.attempt, 'cpus' ) } memory = { check_max( 12.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } } diff --git a/conf/test.config b/conf/test.config index 92a10e4..26972df 100644 --- a/conf/test.config +++ b/conf/test.config @@ -15,7 +15,7 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions - max_cpus = 2 + max_cpus = 8 max_memory = '6.GB' max_time = '6.h' @@ -29,3 +29,16 @@ params { shortread_clipmerge = true } + +process { + withName: FASTQC { + cpus = { check_max( 8 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 6.h * task.attempt, 'time' ) } + } + withName: FASTQC_PROCESSED { + cpus = { check_max( 8 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 6.h * task.attempt, 'time' ) } + } +} diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf index 96f815a..bda5cfe 100644 --- a/subworkflows/local/db_check.nf +++ b/subworkflows/local/db_check.nf @@ -14,7 +14,7 @@ workflow DB_CHECK { // TODO: make database sheet check // Checks: // 1) no duplicates, - // 2) dbs with no special arguments does not have quotes, e.g. just `,,` and NOT `,"",` + // 2) args do not have quotes, e.g. just `,,` and NOT `,"",` parsed_samplesheet = DATABASE_CHECK ( dbsheet ) .csv .splitCsv ( header:true, sep:',' ) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index e73d94d..af5c54d 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -90,9 +90,11 @@ workflow TAXPROFILER { MODULE: Run FastQC */ ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq") + FASTQC ( ch_input_for_fastqc ) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) CUSTOM_DUMPSOFTWAREVERSIONS ( @@ -137,7 +139,11 @@ workflow TAXPROFILER { // We groupTuple to have all samples in one channel for MALT as database // loading takes a long time, so we only want to run it once per database + // TODO document somewhere we only accept illumina short reads for MALT? 
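The chain below first restricts MALT input to Illumina short reads and then batches everything destined for the same database into one task, because loading a MALT index takes far longer than profiling any single sample. A stripped-down sketch of the same `filter` + `groupTuple` idea; the sample names and database label are invented for illustration:

```nextflow
// Minimal DSL2 sketch of per-database batching; entries are placeholders.
workflow {
    Channel
        .of(
            [ [id:'s1', instrument_platform:'ILLUMINA'],        file('s1.fq.gz'), 'malt95' ],
            [ [id:'s2', instrument_platform:'OXFORD_NANOPORE'], file('s2.fq.gz'), 'malt95' ],
            [ [id:'s3', instrument_platform:'ILLUMINA'],        file('s3.fq.gz'), 'malt95' ]
        )
        .filter { it[0]['instrument_platform'] == 'ILLUMINA' } // drop non-Illumina input
        .map { it -> [ it[2], it[1] ] }                        // key each tuple on its database
        .groupTuple( by: 0 )                                   // [ 'malt95', [ s1.fq.gz, s3.fq.gz ] ]
        .view()
}
```

Grouping by database trades per-sample parallelism for loading each index only once, which is the sensible trade-off for MALT's large in-memory databases.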
ch_input_for_malt = ch_input_for_profiling.malt + .dump(tag: "input_to_malt_prefilter") + .filter { it[0]['instrument_platform'] == 'ILLUMINA' } + .dump(tag: "input_to_malt_postfilter") .map { it -> def temp_meta = [ id: it[2]['db_name']] + it[2] From 04fb6f412752811ac8fc621c8730d418986a1d08 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 28 Mar 2022 20:38:16 +0200 Subject: [PATCH 063/214] Use new resources for test data --- conf/test.config | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/conf/test.config b/conf/test.config index 26972df..92a10e4 100644 --- a/conf/test.config +++ b/conf/test.config @@ -15,7 +15,7 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions - max_cpus = 8 + max_cpus = 2 max_memory = '6.GB' max_time = '6.h' @@ -29,16 +29,3 @@ params { shortread_clipmerge = true } - -process { - withName: FASTQC { - cpus = { check_max( 8 * task.attempt, 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 6.h * task.attempt, 'time' ) } - } - withName: FASTQC_PROCESSED { - cpus = { check_max( 8 * task.attempt, 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 6.h * task.attempt, 'time' ) } - } -} From d25f97c5061ef7759196f04017ec2b7710075e48 Mon Sep 17 00:00:00 2001 From: sofstam Date: Mon, 28 Mar 2022 22:08:05 +0200 Subject: [PATCH 064/214] Prettier format --- nextflow_schema.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b61a50e..f1b1de0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", From c552819c725a51fe2f04af981ebd90c311b33c30 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Mon, 28 Mar 2022 22:15:31 +0200 Subject: [PATCH 065/214] Apply prettier again --- nextflow_schema.json | 572 +++++++++++++++++++++---------------------- 1 file changed, 281 insertions(+), 291 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index f1b1de0..0e52ee5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,308 +1,298 @@ { - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/nf-core/taxprofiler/master/nextflow_schema.json", - "title": "nf-core/taxprofiler pipeline parameters", - "description": "Taxonomic profiling of shotgun metagenomic data", - "type": "object", - "definitions": { - "input_output_options": { - "title": "Input/output options", - "type": "object", - "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], - "properties": { - "input": { - "type": "string", - "format": "file-path", - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "schema": "assets/schema_input.json", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. 
It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/taxprofiler/master/nextflow_schema.json", + "title": "nf-core/taxprofiler pipeline parameters", + "description": "Taxonomic profiling of shotgun metagenomic data", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": ["input", "outdir"], + "properties": { + "input": { + "type": "string", + "format": "file-path", + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$", + "schema": "assets/schema_input.json", + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).", + "fa_icon": "fas fa-file-csv" + }, + "outdir": { + "type": "string", + "format": "directory-path", + "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "fa_icon": "fas fa-folder-open" + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, + "multiqc_title": { + "type": "string", + "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", + "fa_icon": "fas fa-file-signature" + } + } }, - "outdir": { - "type": "string", - "format": "directory-path", - "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", - "fa_icon": "fas fa-folder-open" + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Reference genome related files and options required for the workflow.", + "properties": { + "genome": { + "type": "string", + "description": "Name of iGenomes reference.", + "fa_icon": "fas fa-book", + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." 
+ }, + "igenomes_base": { + "type": "string", + "format": "directory-path", + "description": "Directory / URL base for iGenomes references.", + "default": "s3://ngi-igenomes/igenomes", + "fa_icon": "fas fa-cloud-download-alt", + "hidden": true + }, + "igenomes_ignore": { + "type": "boolean", + "description": "Do not load the iGenomes reference config.", + "fa_icon": "fas fa-ban", + "hidden": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + } + } }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + "institutional_config_options": { + "title": "Institutional config options", + "type": "object", + "fa_icon": "fas fa-university", + "description": "Parameters used to describe centralised config profiles. These should not be edited.", + "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", + "properties": { + "custom_config_version": { + "type": "string", + "description": "Git commit id for Institutional configs.", + "default": "master", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "custom_config_base": { + "type": "string", + "description": "Base directory for Institutional configs.", + "default": "https://raw.githubusercontent.com/nf-core/configs/master", + "hidden": true, + "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", + "fa_icon": "fas fa-users-cog" + }, + "config_profile_name": { + "type": "string", + "description": "Institutional config name.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_description": { + "type": "string", + "description": "Institutional config description.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_contact": { + "type": "string", + "description": "Institutional config contact information.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_url": { + "type": "string", + "description": "Institutional config URL link.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + } + } }, - "multiqc_title": { - "type": "string", - "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", - "fa_icon": "fas fa-file-signature" + "max_job_request_options": { + "title": "Max job request options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. 
These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", + "properties": { + "max_cpus": { + "type": "integer", + "description": "Maximum number of CPUs that can be requested for any single job.", + "default": 16, + "fa_icon": "fas fa-microchip", + "hidden": true, + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + }, + "max_memory": { + "type": "string", + "description": "Maximum amount of memory that can be requested for any single job.", + "default": "128.GB", + "fa_icon": "fas fa-memory", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "hidden": true, + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + }, + "max_time": { + "type": "string", + "description": "Maximum amount of time that can be requested for any single job.", + "default": "240.h", + "fa_icon": "far fa-clock", + "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "hidden": true, + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" + } + } + }, + "generic_options": { + "title": "Generic options", + "type": "object", + "fa_icon": "fas fa-file-import", + "description": "Less common options for the pipeline, typically set in a config file.", + "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", + "properties": { + "help": { + "type": "boolean", + "description": "Display help text.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "description": "Method used to save pipeline results to output directory.", + "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", + "fa_icon": "fas fa-copy", + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "hidden": true + }, + "email_on_fail": { + "type": "string", + "description": "Email address for completion summary, only when pipeline fails.", + "fa_icon": "fas fa-exclamation-triangle", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", + "help_text": "An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.", + "hidden": true + }, + "plaintext_email": { + "type": "boolean", + "description": "Send plain-text email instead of HTML.", + "fa_icon": "fas fa-remove-format", + "hidden": true + }, + "max_multiqc_email_size": { + "type": "string", + "description": "File size limit when attaching MultiQC reports to summary emails.", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "default": "25.MB", + "fa_icon": "fas fa-file-upload", + "hidden": true + }, + "monochrome_logs": { + "type": "boolean", + "description": "Do not use coloured log outputs.", + "fa_icon": "fas fa-palette", + "hidden": true + }, + "multiqc_config": { + "type": "string", + "description": "Custom config file to supply to MultiQC.", + "fa_icon": "fas fa-cog", + "hidden": true + }, + "tracedir": { + "type": "string", + "description": "Directory to keep pipeline Nextflow logs and reports.", + "default": "${params.outdir}/pipeline_info", + "fa_icon": "fas fa-cogs", + "hidden": true + }, + "validate_params": { + "type": "boolean", + "description": "Boolean whether to validate parameters against the schema at runtime", + "default": true, + "fa_icon": "fas fa-check-square", + "hidden": true + }, + "show_hidden_params": { + "type": "boolean", + "fa_icon": "far fa-eye-slash", + "description": "Show all params when using `--help`", + "hidden": true, + "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "enable_conda": { + "type": "boolean", + "description": "Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.", + "hidden": true, + "fa_icon": "fas fa-bacon" + } + } } - } }, - "reference_genome_options": { - "title": "Reference genome options", - "type": "object", - "fa_icon": "fas fa-dna", - "description": "Reference genome related files and options required for the workflow.", - "properties": { - "genome": { - "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." 
+ "allOf": [ + { + "$ref": "#/definitions/input_output_options" }, - "igenomes_base": { - "type": "string", - "format": "directory-path", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true + { + "$ref": "#/definitions/reference_genome_options" }, - "igenomes_ignore": { - "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + { + "$ref": "#/definitions/institutional_config_options" + }, + { + "$ref": "#/definitions/max_job_request_options" + }, + { + "$ref": "#/definitions/generic_options" } - } - }, - "institutional_config_options": { - "title": "Institutional config options", - "type": "object", - "fa_icon": "fas fa-university", - "description": "Parameters used to describe centralised config profiles. These should not be edited.", - "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", - "properties": { - "custom_config_version": { - "type": "string", - "description": "Git commit id for Institutional configs.", - "default": "master", - "hidden": true, - "fa_icon": "fas fa-users-cog" + ], + "properties": { + "databases": { + "type": "string", + "default": "None" }, - "custom_config_base": { - "type": "string", - "description": "Base directory for Institutional configs.", - "default": "https://raw.githubusercontent.com/nf-core/configs/master", - "hidden": true, - "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. 
If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", - "fa_icon": "fas fa-users-cog" + "shortread_clipmerge": { + "type": "boolean" }, - "config_profile_name": { - "type": "string", - "description": "Institutional config name.", - "hidden": true, - "fa_icon": "fas fa-users-cog" + "shortread_excludeunmerged": { + "type": "boolean", + "default": true }, - "config_profile_description": { - "type": "string", - "description": "Institutional config description.", - "hidden": true, - "fa_icon": "fas fa-users-cog" + "longread_clip": { + "type": "boolean" }, - "config_profile_contact": { - "type": "string", - "description": "Institutional config contact information.", - "hidden": true, - "fa_icon": "fas fa-users-cog" + "run_malt": { + "type": "boolean" }, - "config_profile_url": { - "type": "string", - "description": "Institutional config URL link.", - "hidden": true, - "fa_icon": "fas fa-users-cog" + "malt_mode": { + "type": "string", + "default": "BlastN" + }, + "run_kraken2": { + "type": "boolean" + }, + "run_centrifuge": { + "type": "boolean" + }, + "centrifuge_save_unaligned": { + "type": "boolean" + }, + "centrifuge_save_aligned": { + "type": "boolean" + }, + "centrifuge_sam_format": { + "type": "boolean" } - } - }, - "max_job_request_options": { - "title": "Max job request options", - "type": "object", - "fa_icon": "fab fa-acquisitions-incorporated", - "description": "Set the top limit for requested resources for any single job.", - "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", - "properties": { - "max_cpus": { - "type": "integer", - "description": "Maximum number of CPUs that can be requested for any single job.", - "default": 16, - "fa_icon": "fas fa-microchip", - "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" - }, - "max_memory": { - "type": "string", - "description": "Maximum amount of memory that can be requested for any single job.", - "default": "128.GB", - "fa_icon": "fas fa-memory", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" - }, - "max_time": { - "type": "string", - "description": "Maximum amount of time that can be requested for any single job.", - "default": "240.h", - "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", - "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_time '2.h'`" - } - } - }, - "generic_options": { - "title": "Generic options", - "type": "object", - "fa_icon": "fas fa-file-import", - "description": "Less common options for the pipeline, typically set in a config file.", - "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", - "properties": { - "help": { - "type": "boolean", - "description": "Display help text.", - "fa_icon": "fas fa-question-circle", - "hidden": true - }, - "publish_dir_mode": { - "type": "string", - "default": "copy", - "description": "Method used to save pipeline results to output directory.", - "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", - "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], - "hidden": true - }, - "email_on_fail": { - "type": "string", - "description": "Email address for completion summary, only when pipeline fails.", - "fa_icon": "fas fa-exclamation-triangle", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", - "help_text": "An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.", - "hidden": true - }, - "plaintext_email": { - "type": "boolean", - "description": "Send plain-text email instead of HTML.", - "fa_icon": "fas fa-remove-format", - "hidden": true - }, - "max_multiqc_email_size": { - "type": "string", - "description": "File size limit when attaching MultiQC reports to summary emails.", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "default": "25.MB", - "fa_icon": "fas fa-file-upload", - "hidden": true - }, - "monochrome_logs": { - "type": "boolean", - "description": "Do not use coloured log outputs.", - "fa_icon": "fas fa-palette", - "hidden": true - }, - "multiqc_config": { - "type": "string", - "description": "Custom config file to supply to MultiQC.", - "fa_icon": "fas fa-cog", - "hidden": true - }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, - "validate_params": { - "type": "boolean", - "description": "Boolean whether to validate parameters against the schema at runtime", - "default": true, - "fa_icon": "fas fa-check-square", - "hidden": true - }, - "show_hidden_params": { - "type": "boolean", - "fa_icon": "far fa-eye-slash", - "description": "Show all params when using `--help`", - "hidden": true, - "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." - }, - "enable_conda": { - "type": "boolean", - "description": "Run this workflow with Conda. 
You can also use '-profile conda' instead of providing this parameter.", - "hidden": true, - "fa_icon": "fas fa-bacon" - } - } } - }, - "allOf": [ - { - "$ref": "#/definitions/input_output_options" - }, - { - "$ref": "#/definitions/reference_genome_options" - }, - { - "$ref": "#/definitions/institutional_config_options" - }, - { - "$ref": "#/definitions/max_job_request_options" - }, - { - "$ref": "#/definitions/generic_options" - } - ], - "properties": { - "databases": { - "type": "string", - "default": "None" - }, - "shortread_clipmerge": { - "type": "boolean" - }, - "shortread_excludeunmerged": { - "type": "boolean", - "default": true - }, - "longread_clip": { - "type": "boolean" - }, - "run_malt": { - "type": "boolean" - }, - "malt_mode": { - "type": "string", - "default": "BlastN" - }, - "run_kraken2": { - "type": "boolean" - }, - "run_centrifuge": { - "type": "boolean" - }, - "centrifuge_save_unaligned": { - "type": "boolean" - }, - "centrifuge_save_aligned": { - "type": "boolean" - }, - "centrifuge_sam_format": { - "type": "boolean" - } - } } From 98a9688329e66814764746a9219580e841e84477 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 29 Mar 2022 15:18:06 +0200 Subject: [PATCH 066/214] Change CPUs back to template, update skiptrim fiag --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 2f0f97f..0c65574 100644 --- a/conf/base.config +++ b/conf/base.config @@ -27,7 +27,7 @@ process { // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_low { - cpus = { check_max( 4 * task.attempt, 'cpus' ) } + cpus = { check_max( 2 * task.attempt, 'cpus' ) } memory = { check_max( 12.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } } From 76e9624e3f04e573a56c1443884b2ca385488319 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 30 Mar 2022 11:31:37 +0200 Subject: [PATCH 067/214] NOW commit the adapte rtrim flag --- conf/modules.config | 2 +- nextflow.config | 2 +- nextflow_schema.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a9bc3a1..71eaa75 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -68,7 +68,7 @@ process { // collapsing options - option to retain singletons params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged", // trimming options - params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "", + params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "", params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", !{ ${meta.single_end} } && params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : !{ ${meta.single_end} } ? 
"--detect_adapter_for_pe" : "", // filtering options diff --git a/nextflow.config b/nextflow.config index a312d0c..6fde513 100644 --- a/nextflow.config +++ b/nextflow.config @@ -57,7 +57,7 @@ params { // FASTQ preprocessing shortread_clipmerge = false shortread_clipmerge_tool = 'fastp' - shortread_clipmerge_skiptrim = false + shortread_clipmerge_skipadaptertrim = false shortread_clipmerge_mergepairs = false shortread_clipmerge_excludeunmerged = true shortread_clipmerge_adapter1 = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 545f990..adc5a73 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -286,7 +286,7 @@ "type": "string", "default": "fastp" }, - "shortread_clipmerge_skiptrim": { + "shortread_clipmerge_skipadaptertrim": { "type": "boolean" }, "shortread_clipmerge_mergepairs": { From c4c93bd59d5390c7970dea72575ff9bf8fe1707b Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 31 Mar 2022 15:31:45 +0200 Subject: [PATCH 068/214] Start working on adding adapterremoval --- modules.json | 5 +- .../nf-core/modules/adapterremoval/main.nf | 70 ++++++++++++++ .../nf-core/modules/adapterremoval/meta.yml | 90 ++++++++++++++++++ nextflow_schema.json | 26 ++++-- .../local/shortread_adapterremoval.nf | 91 +++++++++++++++++++ subworkflows/local/shortread_fastp.nf | 8 +- subworkflows/local/shortread_preprocessing.nf | 5 + 7 files changed, 281 insertions(+), 14 deletions(-) create mode 100644 modules/nf-core/modules/adapterremoval/main.nf create mode 100644 modules/nf-core/modules/adapterremoval/meta.yml create mode 100644 subworkflows/local/shortread_adapterremoval.nf diff --git a/modules.json b/modules.json index 34ef9f1..c4b2f12 100644 --- a/modules.json +++ b/modules.json @@ -3,6 +3,9 @@ "homePage": "https://github.com/nf-core/taxprofiler", "repos": { "nf-core/modules": { + "adapterremoval": { + "git_sha": "f0800157544a82ae222931764483331a81812012" + }, "cat/fastq": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, @@ -32,4 +35,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/adapterremoval/main.nf b/modules/nf-core/modules/adapterremoval/main.nf new file mode 100644 index 0000000..9d16b9c --- /dev/null +++ b/modules/nf-core/modules/adapterremoval/main.nf @@ -0,0 +1,70 @@ +process ADAPTERREMOVAL { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? "bioconda::adapterremoval=2.3.2" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/adapterremoval:2.3.2--hb7ba0dd_0' : + 'quay.io/biocontainers/adapterremoval:2.3.2--hb7ba0dd_0' }" + + input: + tuple val(meta), path(reads) + path(adapterlist) + + output: + tuple val(meta), path("${prefix}.truncated.gz") , optional: true, emit: singles_truncated + tuple val(meta), path("${prefix}.discarded.gz") , optional: true, emit: discarded + tuple val(meta), path("${prefix}.pair1.truncated.gz") , optional: true, emit: pair1_truncated + tuple val(meta), path("${prefix}.pair2.truncated.gz") , optional: true, emit: pair2_truncated + tuple val(meta), path("${prefix}.collapsed.gz") , optional: true, emit: collapsed + tuple val(meta), path("${prefix}.collapsed.truncated.gz") , optional: true, emit: collapsed_truncated + tuple val(meta), path("${prefix}.paired.gz") , optional: true, emit: paired_interleaved + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def list = adapterlist ? "--adapter-list ${adapterlist}" : "" + prefix = task.ext.prefix ?: "${meta.id}" + + if (meta.single_end) { + """ + AdapterRemoval \\ + --file1 $reads \\ + $args \\ + $adapterlist \\ + --basename ${prefix} \\ + --threads ${task.cpus} \\ + --settings ${prefix}.log \\ + --seed 42 \\ + --gzip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g") + END_VERSIONS + """ + } else { + """ + AdapterRemoval \\ + --file1 ${reads[0]} \\ + --file2 ${reads[1]} \\ + $args \\ + $adapterlist \\ + --basename ${prefix} \\ + --threads $task.cpus \\ + --settings ${prefix}.log \\ + --seed 42 \\ + --gzip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g") + END_VERSIONS + """ + } + +} diff --git a/modules/nf-core/modules/adapterremoval/meta.yml b/modules/nf-core/modules/adapterremoval/meta.yml new file mode 100644 index 0000000..5faad04 --- /dev/null +++ b/modules/nf-core/modules/adapterremoval/meta.yml @@ -0,0 +1,90 @@ +name: adapterremoval +description: Trim sequencing adapters and collapse overlapping reads +keywords: + - trimming + - adapters + - merging + - fastq +tools: + - adapterremoval: + description: The AdapterRemoval v2 tool for merging and clipping reads. + homepage: https://github.com/MikkelSchubert/adapterremoval + documentation: https://adapterremoval.readthedocs.io + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + pattern: "*.{fq,fastq,fq.gz,fastq.gz}" + - adapterlist: + type: file + description: Optional text file containing list of adapters to look for for removal + with one adapter per line. Otherwise will look for default adapters (see + AdapterRemoval man page), or can be modified to remove user-specified + adapters via ext.args. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - singles_truncated: + type: file + description: | + Adapter trimmed FastQ files of either single-end reads, or singleton + 'orphaned' reads from merging of paired-end data (i.e., one of the pair + was lost due to filtering thresholds). 
+ pattern: "*.truncated.gz" + - discarded: + type: file + description: | + Adapter trimmed FastQ files of reads that did not pass filtering + thresholds. + pattern: "*.discarded.gz" + - pair1_truncated: + type: file + description: | + Adapter trimmed R1 FastQ files of paired-end reads that did not merge + with their respective R2 pair due to long templates. The respective pair + is stored in 'pair2_truncated'. + pattern: "*.pair1.truncated.gz" + - pair2_truncated: + type: file + description: | + Adapter trimmed R2 FastQ files of paired-end reads that did not merge + with their respective R1 pair due to long templates. The respective pair + is stored in 'pair1_truncated'. + pattern: "*.pair2.truncated.gz" + - collapsed: + type: file + description: | + Collapsed FastQ of paired-end reads that successfully merged with their + respective R1 pair but were not trimmed. + pattern: "*.collapsed.gz" + - collapsed_truncated: + type: file + description: | + Collapsed FastQ of paired-end reads that successfully merged with their + respective R1 pair and were trimmed of adapter due to sufficient overlap. + pattern: "*.collapsed.truncated.gz" + - log: + type: file + description: AdapterRemoval log file + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@maxibor" + - "@jfy133" diff --git a/nextflow_schema.json b/nextflow_schema.json index adc5a73..ebb748a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -284,7 +294,11 @@ }, "shortread_clipmerge_tool": { "type": "string", - "default": "fastp" + "default": "fastp", + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -294,15 +308,15 @@ }, "shortread_clipmerge_adapter1": { "type": "string", - "default": null + "default": "None" }, "shortread_clipmerge_adapter2": { "type": "string", - "default": null + "default": "None" }, "shortread_clipmerge_minlength": { "type": "integer", "default": 15 } } -} +} \ No newline at end of file diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf new file mode 100644 index 0000000..b09356a --- /dev/null +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -0,0 +1,91 @@ +/* +Process short raw reads with AdapterRemoval +*/ + +include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE } from '../../modules/nf-core/modules/adapterremoval/main' +include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED } from '../../modules/nf-core/modules/adapterremoval/main' +include { CAT_FASTQ } from '../../modules/nf-core/modules/cat/fastq/main' + +workflow SHORTREAD_ADAPTERREMOVAL { + + take: + reads // [[meta], [reads]] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + ch_input_for_adapterremoval = reads + .dump(tag: "pre_adapterremoval_branch") + .branch{ + single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + ADAPTERREMOVAL_SINGLE ( ch_input_for_adapterremoval.single, [] ) + ADAPTERREMOVAL_PAIRED ( ch_input_for_adapterremoval.paired, [] ) + + if ( params.shortread_clipmerge_mergepairs && !params.shortread_clipmerge_excludeunmerged ) { + ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed + .mix( + ADAPTERREMOVAL_PAIRED.out.collapsed_truncated, + ADAPTERREMOVAL_PAIRED.out.singles_truncated, + ADAPTERREMOVAL_PAIRED.out.pair1_truncated, + ADAPTERREMOVAL_PAIRED.out.pair2_truncated + ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + + [ meta_new, reads ] + } + .groupTuple(by: 0) + ch_adapterremoval_reads_prepped_pe = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads + + ch_adapterremoval_reads_prepped = ch_adapterremoval_reads_prepped_pe.mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) + + } else if ( params.shortread_clipmerge_mergepairs && params.shortread_clipmerge_excludeunmerged ) { + ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed + .mix( ADAPTERREMOVAL_PAIRED.out.collapsed_truncated ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + + [ meta_new, reads ] + } + .groupTuple(by: 0) + + ch_adapterremoval_reads_prepped_pe = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads + + ch_adapterremoval_reads_prepped = ch_adapterremoval_reads_prepped_pe.mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) + + } else { + + ch_adapterremoval_reads_prepped_pe = ADAPTERREMOVAL_PAIRED.out.pair1_truncated + .join( ADAPTERREMOVAL_PAIRED.out.pair2_truncated ) + .dump(tag: "pre-group") + .groupTuple(by: 0) + .dump(tag: "post-group") + .map { meta, pair1, pair2 -> + [ meta, [ pair1, pair2 ].flatten() ] + } + .dump(tag: "post-map") + + + ch_adapterremoval_reads_prepped = 
ch_adapterremoval_reads_prepped_pe + .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) + } + + ch_processed_reads = ch_adapterremoval_reads_prepped + + ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() ) + ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]}, ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]} ) + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index c2e435c..48817db 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -14,15 +14,11 @@ workflow SHORTREAD_FASTP { ch_multiqc_files = Channel.empty() ch_input_for_fastp = reads - .dump(tag: "pre-fastp_branch") .branch{ single: it[0]['single_end'] == true paired: it[0]['single_end'] == false } - ch_input_for_fastp.single.dump(tag: "input_fastp_single") - ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") - FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) // Last parameter here turns on merging of PE data FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs ) @@ -46,13 +42,11 @@ workflow SHORTREAD_FASTP { ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) - ch_processed_reads = ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped") + ch_processed_reads = ch_fastp_reads_prepped ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) - ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final") - emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] versions = ch_versions // channel: [ versions.yml ] diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index 7fba0c0..f0a1ed4 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -4,6 +4,7 @@ include { SHORTREAD_FASTP } from './shortread_fastp' +include { SHORTREAD_ADAPTERREMOVAL } from './shortread_adapterremoval' include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main' workflow SHORTREAD_PREPROCESSING { @@ -18,6 +19,10 @@ workflow SHORTREAD_PREPROCESSING { ch_processed_reads = SHORTREAD_FASTP ( reads ).reads ch_versions = ch_versions.mix( SHORTREAD_FASTP.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc ) + } else if ( params.shortread_clipmerge_tool == "adapterremoval" ) { + ch_processed_reads = SHORTREAD_ADAPTERREMOVAL ( reads ).reads + ch_versions = ch_versions.mix( SHORTREAD_ADAPTERREMOVAL.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_ADAPTERREMOVAL.out.mqc ) } else { ch_processed_reads = reads } From dcda815bfb03697b69f219f26af69708bd2d28b5 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 1 Apr 2022 09:50:08 +0200 Subject: [PATCH 069/214] Add final adapterremoval options to match fastp functionality --- conf/modules.config | 70 ++++++++++++++++--- nextflow.config | 2 +- nextflow_schema.json | 23 ++---- .../local/shortread_adapterremoval.nf | 10 ++- 4 files changed, 71 insertions(+), 34 deletions(-) diff --git a/conf/modules.config b/conf/modules.config 
index 71eaa75..dc8b138 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -62,18 +62,15 @@ process { ] } - withName: FASTP { - ext.prefix = { "${meta.id}_${meta.run_accession}" } + withName: FASTP_SINGLE { ext.args = [ - // collapsing options - option to retain singletons - params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged", // trimming options params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "", params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", - !{ ${meta.single_end} } && params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : !{ ${meta.single_end} } ? "--detect_adapter_for_pe" : "", // filtering options "--length_required ${params.shortread_clipmerge_minlength}" ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ path: { "${params.outdir}/fastp" }, mode: 'copy', @@ -81,6 +78,61 @@ process { ] } + withName: FASTP_PAIRED { + ext.args = [ + // collapsing options - option to retain singletons + params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged", + // trimming options + params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "", + params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", + params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe", + // filtering options + "--length_required ${params.shortread_clipmerge_minlength}" + ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/fastp" }, + mode: 'copy', + pattern: '*.fastq.gz' + ] + } + + withName: ADAPTERREMOVAL_SINGLE { + ext.args = [ + // trimming options + params.shortread_clipmerge_skipadaptertrim ? "--adapter1 '' --adapter2 ''" : "", + params.shortread_clipmerge_adapter1 ? "--adapter1 ${params.shortread_clipmerge_adapter1}" : "", + // filtering options + "--minlength ${params.shortread_clipmerge_minlength}" + ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/adapterremoval" }, + mode: 'copy', + pattern: '*.fastq.gz' + ] + } + + withName: ADAPTERREMOVAL_PAIRED { + ext.args = [ + // collapsing options + params.shortread_clipmerge_mergepairs ? "--collapse" : "", + // trimming options + params.shortread_clipmerge_skipadaptertrim ? "--adapter1 '' --adapter2 ''" : "", + params.shortread_clipmerge_adapter1 ? "--adapter1 ${params.shortread_clipmerge_adapter1}" : "", + params.shortread_clipmerge_adapter2 ? 
"--adapter2 ${params.shortread_clipmerge_adapter2}" : "", + // filtering options + "--minlength ${params.shortread_clipmerge_minlength}" + ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/adapterremoval" }, + mode: 'copy', + pattern: '*.fastq.gz' + ] + } + + withName: PORECHOP { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ @@ -99,23 +151,23 @@ process { } withName: MALT_RUN { + ext.args = { "${meta.db_params}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/malt/${meta.db_name}" }, mode: 'copy', pattern: '*.{rma6,tab,text,sam,log}' ] - ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } withName: KRAKEN2_KRAKEN2 { + ext.args = { "${meta.db_params}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/kraken2/${meta.db_name}" }, mode: 'copy', pattern: '*.{fastq.gz,txt}' ] - ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } withName: CUSTOM_DUMPSOFTWAREVERSIONS { diff --git a/nextflow.config b/nextflow.config index 6fde513..7be36a6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -59,7 +59,7 @@ params { shortread_clipmerge_tool = 'fastp' shortread_clipmerge_skipadaptertrim = false shortread_clipmerge_mergepairs = false - shortread_clipmerge_excludeunmerged = true + shortread_clipmerge_excludeunmerged = false shortread_clipmerge_adapter1 = null shortread_clipmerge_adapter2 = null shortread_clipmerge_minlength = 15 diff --git a/nextflow_schema.json b/nextflow_schema.json index ebb748a..fb2ca31 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -277,7 +267,7 @@ }, "shortread_clipmerge_excludeunmerged": { "type": "boolean", - "default": true + "default": false }, "longread_clip": { "type": "boolean" @@ -295,10 +285,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -319,4 +306,4 @@ "default": 15 } } -} \ No newline at end of file +} diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index b09356a..b467a64 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -16,7 +16,6 @@ workflow SHORTREAD_ADAPTERREMOVAL { ch_multiqc_files = Channel.empty() ch_input_for_adapterremoval = reads - .dump(tag: "pre_adapterremoval_branch") .branch{ single: it[0]['single_end'] == true paired: it[0]['single_end'] == false @@ -36,11 +35,13 @@ workflow SHORTREAD_ADAPTERREMOVAL { .map { meta, reads -> def meta_new = meta.clone() - meta_new['single_end'] = 1 + meta_new['single_end'] = true [ meta_new, reads ] } .groupTuple(by: 0) + + ch_adapterremoval_reads_prepped_pe = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads ch_adapterremoval_reads_prepped = ch_adapterremoval_reads_prepped_pe.mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) @@ -51,7 +52,7 @@ workflow SHORTREAD_ADAPTERREMOVAL { .map { meta, reads -> def meta_new = meta.clone() - meta_new['single_end'] = 1 + meta_new['single_end'] = true [ meta_new, reads ] } @@ -65,13 +66,10 @@ workflow SHORTREAD_ADAPTERREMOVAL { ch_adapterremoval_reads_prepped_pe = ADAPTERREMOVAL_PAIRED.out.pair1_truncated .join( ADAPTERREMOVAL_PAIRED.out.pair2_truncated ) - .dump(tag: "pre-group") .groupTuple(by: 0) - .dump(tag: "post-group") .map { meta, pair1, pair2 -> [ meta, [ pair1, pair2 ].flatten() ] } - .dump(tag: "post-map") ch_adapterremoval_reads_prepped = ch_adapterremoval_reads_prepped_pe From b04cfec2c2ad651e498bf8b0a21108ccb51a1a0a Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 1 Apr 2022 11:13:36 +0200 Subject: [PATCH 070/214] Prettier linting --- CITATIONS.md | 18 +++++++++++++++++- modules.json | 2 +- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 1018ccd..6e43999 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -13,9 +13,25 @@ - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) + > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. -* [Porechop](https://github.com/rrwick/Porechop) +- [fastp](https://doi.org/10.1093/bioinformatics/bty560) + + > Chen, Shifu, Yanqing Zhou, Yaru Chen, and Jia Gu. 2018. “Fastp: An Ultra-Fast All-in-One FASTQ Preprocessor.” Bioinformatics 34 (17): i884-90. 10.1093/bioinformatics/bty560. + +- [AdapterRemoval2](https://doi.org/10.1186/s13104-016-1900-2) + + > Schubert, Mikkel, Stinus Lindgreen, and Ludovic Orlando. 
2016. “AdapterRemoval v2: Rapid Adapter Trimming, Identification, and Read Merging.” BMC Research Notes 9 (February): 88. doi:10.1186/s13104-016-1900-2. + +- [Porechop](https://github.com/rrwick/Porechop) + +- [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) + + > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. “Improved Metagenomic Analysis with Kraken 2.” Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0. + +- [MALT](https://doi.org/10.1038/s41559-017-0446-6) + > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. “Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico.” Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6. ## Software packaging/containerisation tools diff --git a/modules.json b/modules.json index c4b2f12..dcfbd3f 100644 --- a/modules.json +++ b/modules.json @@ -35,4 +35,4 @@ } } } -} \ No newline at end of file +} From df4f13319fde9e29d1ed5f0780d60aedccde04e5 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 1 Apr 2022 11:13:57 +0200 Subject: [PATCH 071/214] More linting --- CITATIONS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CITATIONS.md b/CITATIONS.md index 6e43999..8f286b0 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -31,6 +31,7 @@ > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. “Improved Metagenomic Analysis with Kraken 2.” Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0. - [MALT](https://doi.org/10.1038/s41559-017-0446-6) + > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. “Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico.” Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6. ## Software packaging/containerisation tools From 0d0b377dfb00b8ccd48b47be55b9bcba3dcbd582 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 1 Apr 2022 11:19:00 +0200 Subject: [PATCH 072/214] Add additional test --- .github/workflows/ci.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb6de77..a8078e5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,6 +28,10 @@ jobs: # Test latest edge release of Nextflow - NXF_VER: "" NXF_EDGE: "1" + parameters: + - "--shortread_clipmerge_tool fastp" + - "--shortread_clipmerge_tool adapterremoval" + steps: - name: Check out pipeline code uses: actions/checkout@v2 @@ -47,6 +51,6 @@ jobs: # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results ${{ matrix.parameters }} # From 14f2b7bd84956a2dadb6c2f69d39baf1c6f3e027 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 1 Apr 2022 15:34:05 +0200 Subject: [PATCH 073/214] Apply suggestions from code review Co-authored-by: Moritz E. 
Beber --- subworkflows/local/shortread_adapterremoval.nf | 18 ++++++++---------- subworkflows/local/shortread_preprocessing.nf | 4 ++-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index b467a64..3701f7e 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -17,14 +17,16 @@ workflow SHORTREAD_ADAPTERREMOVAL { ch_input_for_adapterremoval = reads .branch{ - single: it[0]['single_end'] == true - paired: it[0]['single_end'] == false + single: it[0].single_end + paired: !it[0].single_end } ADAPTERREMOVAL_SINGLE ( ch_input_for_adapterremoval.single, [] ) ADAPTERREMOVAL_PAIRED ( ch_input_for_adapterremoval.paired, [] ) if ( params.shortread_clipmerge_mergepairs && !params.shortread_clipmerge_excludeunmerged ) { + // due to the slightly ugly output implementation of the current AdapterRemoval2 module, each file + // has to be exported in a separate channel, and we must manually recombine when necessary ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed .mix( ADAPTERREMOVAL_PAIRED.out.collapsed_truncated, @@ -35,16 +37,14 @@ workflow SHORTREAD_ADAPTERREMOVAL { .map { meta, reads -> def meta_new = meta.clone() - meta_new['single_end'] = true + meta_new.single_end = true [ meta_new, reads ] } - .groupTuple(by: 0) + .groupTuple() - ch_adapterremoval_reads_prepped_pe = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads - - ch_adapterremoval_reads_prepped = ch_adapterremoval_reads_prepped_pe.mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) + ch_adapterremoval_reads_prepped = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads.mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) } else if ( params.shortread_clipmerge_mergepairs && params.shortread_clipmerge_excludeunmerged ) { ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed @@ -58,9 +58,7 @@ workflow SHORTREAD_ADAPTERREMOVAL { } .groupTuple(by: 0) - ch_adapterremoval_reads_prepped_pe = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads - - ch_adapterremoval_reads_prepped = ch_adapterremoval_reads_prepped_pe.mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) + ch_adapterremoval_reads_prepped = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads.mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) } else { diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index f0a1ed4..1d0caac 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -21,8 +21,8 @@ workflow SHORTREAD_PREPROCESSING { ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc ) } else if ( params.shortread_clipmerge_tool == "adapterremoval" ) { ch_processed_reads = SHORTREAD_ADAPTERREMOVAL ( reads ).reads - ch_versions = ch_versions.mix( SHORTREAD_ADAPTERREMOVAL.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_ADAPTERREMOVAL.out.mqc ) + ch_versions = ch_versions.mix( SHORTREAD_ADAPTERREMOVAL.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_ADAPTERREMOVAL.out.mqc ) } else { ch_processed_reads = reads } From 03e832954f89e43a641033dcae059c1870c08b2e Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 1 Apr 2022 20:55:33 +0200 Subject: [PATCH 074/214] Update subworkflows/local/shortread_adapterremoval.nf Co-authored-by: Moritz E. 
Beber --- subworkflows/local/shortread_adapterremoval.nf | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index 3701f7e..ac8ff0f 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -62,15 +62,12 @@ workflow SHORTREAD_ADAPTERREMOVAL { } else { - ch_adapterremoval_reads_prepped_pe = ADAPTERREMOVAL_PAIRED.out.pair1_truncated + ch_adapterremoval_reads_prepped = ADAPTERREMOVAL_PAIRED.out.pair1_truncated .join( ADAPTERREMOVAL_PAIRED.out.pair2_truncated ) - .groupTuple(by: 0) + .groupTuple() .map { meta, pair1, pair2 -> [ meta, [ pair1, pair2 ].flatten() ] } - - - ch_adapterremoval_reads_prepped = ch_adapterremoval_reads_prepped_pe .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) } From 93aaa9eeacbf6211646853be0f4e18a94574e2f1 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 1 Apr 2022 21:36:22 +0200 Subject: [PATCH 075/214] Clean up AdapterRemoval and remove all debugging dumps --- subworkflows/local/db_check.nf | 2 -- subworkflows/local/input_check.nf | 4 --- subworkflows/local/longread_preprocessing.nf | 1 - .../local/shortread_adapterremoval.nf | 32 ++++++++++--------- workflows/taxprofiler.nf | 6 +--- 5 files changed, 18 insertions(+), 27 deletions(-) diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf index bda5cfe..356915a 100644 --- a/subworkflows/local/db_check.nf +++ b/subworkflows/local/db_check.nf @@ -18,9 +18,7 @@ workflow DB_CHECK { parsed_samplesheet = DATABASE_CHECK ( dbsheet ) .csv .splitCsv ( header:true, sep:',' ) - .dump(tag: "db_split_csv_out") .map { create_db_channels(it) } - .dump(tag: "db_channel_prepped") ch_dbs_for_untar = parsed_samplesheet .branch { diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 4501386..ec404c2 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -12,7 +12,6 @@ workflow INPUT_CHECK { parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) - .dump(tag: "input_split_csv_out") .branch { fasta: it['fasta'] != '' nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE' @@ -21,17 +20,14 @@ workflow INPUT_CHECK { parsed_samplesheet.fastq .map { create_fastq_channel(it) } - .dump(tag: "fastq_channel_init") .set { fastq } parsed_samplesheet.nanopore .map { create_fastq_channel(it) } - .dump(tag: "fastq_nanopore_channel_init") .set { nanopore } parsed_samplesheet.fasta .map { create_fasta_channel(it) } - .dump(tag: "fasta_channel_init") .set { fasta } emit: diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index 7c7c24b..a1515c7 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -16,7 +16,6 @@ workflow LONGREAD_PREPROCESSING { PORECHOP ( reads ) ch_processed_reads = PORECHOP.out.reads - .dump(tag: "pre_fastqc_check") .map { meta, reads -> def meta_new = meta.clone() diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index ac8ff0f..5e005db 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -24,9 +24,10 @@ workflow SHORTREAD_ADAPTERREMOVAL { ADAPTERREMOVAL_SINGLE ( ch_input_for_adapterremoval.single, [] ) ADAPTERREMOVAL_PAIRED ( ch_input_for_adapterremoval.paired, [] ) + // due to the 
slightly ugly output implementation of the current AdapterRemoval2 version, each file + // has to be exported in a separate channel, and we must manually recombine when necessary + if ( params.shortread_clipmerge_mergepairs && !params.shortread_clipmerge_excludeunmerged ) { ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed .mix( ADAPTERREMOVAL_PAIRED.out.collapsed_truncated, @@ -43,22 +44,23 @@ workflow SHORTREAD_ADAPTERREMOVAL { } .groupTuple() - - ch_adapterremoval_reads_prepped = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads.mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) + ch_adapterremoval_reads_prepped = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads + .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) } else if ( params.shortread_clipmerge_mergepairs && params.shortread_clipmerge_excludeunmerged ) { - ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed - .mix( ADAPTERREMOVAL_PAIRED.out.collapsed_truncated ) - .map { - meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = true + ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed + .mix( ADAPTERREMOVAL_PAIRED.out.collapsed_truncated ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = true - [ meta_new, reads ] - } - .groupTuple(by: 0) + [ meta_new, reads ] + } + .groupTuple(by: 0) - ch_adapterremoval_reads_prepped = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads.mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) + ch_adapterremoval_reads_prepped = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads + .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) } else { @@ -68,7 +70,7 @@ workflow SHORTREAD_ADAPTERREMOVAL { .map { meta, pair1, pair2 -> [ meta, [ pair1, pair2 ].flatten() ] } - .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) + .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) } ch_processed_reads = ch_adapterremoval_reads_prepped diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index af5c54d..ce89a91 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -18,6 +18,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." +if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "[nf-core/taxprofiler] error: cannot include unmerged reads when merging is not turned on.
Please specify --shortread_clipmerge_mergepairs" /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -126,7 +127,6 @@ workflow TAXPROFILER { ch_input_for_profiling = ch_shortreads_preprocessed .mix( ch_longreads_preprocessed ) .combine(DB_CHECK.out.dbs) - .dump(tag: "reads_plus_db_clean") .branch { malt: it[2]['tool'] == 'malt' kraken2: it[2]['tool'] == 'kraken2' @@ -141,9 +141,7 @@ workflow TAXPROFILER { // loading takes a long time, so we only want to run it once per database // TODO document somewhere we only accept illumina short reads for MALT? ch_input_for_malt = ch_input_for_profiling.malt - .dump(tag: "input_to_malt_prefilter") .filter { it[0]['instrument_platform'] == 'ILLUMINA' } - .dump(tag: "input_to_malt_postfilter") .map { it -> def temp_meta = [ id: it[2]['db_name']] + it[2] @@ -151,7 +149,6 @@ workflow TAXPROFILER { [ temp_meta, it[1], db ] } .groupTuple(by: [0,2]) - .dump(tag: "input_to_malt") .multiMap { it -> reads: [ it[0], it[1].flatten() ] @@ -160,7 +157,6 @@ workflow TAXPROFILER { // We can run Kraken2 one-by-one sample-wise ch_input_for_kraken2 = ch_input_for_profiling.kraken2 - .dump(tag: "input_to_kraken") .multiMap { it -> reads: [ it[0] + it[2], it[1] ] From 4d2960c06fedfb052ce38b70c90197ea8ae8f9fc Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Fri, 18 Mar 2022 15:38:19 +0100 Subject: [PATCH 076/214] chore: add metaphlan3 module --- modules.json | 3 ++ modules/nf-core/modules/metaphlan3/main.nf | 45 ++++++++++++++++++ modules/nf-core/modules/metaphlan3/meta.yml | 52 +++++++++++++++++++++ 3 files changed, 100 insertions(+) create mode 100644 modules/nf-core/modules/metaphlan3/main.nf create mode 100644 modules/nf-core/modules/metaphlan3/meta.yml diff --git a/modules.json b/modules.json index dcfbd3f..9953edb 100644 --- a/modules.json +++ b/modules.json @@ -24,6 +24,9 @@ "malt/run": { "git_sha": "72b96f4e504eef673f2b5c13560a9d90b669129b" }, + "metaphlan3": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + }, "multiqc": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, diff --git a/modules/nf-core/modules/metaphlan3/main.nf b/modules/nf-core/modules/metaphlan3/main.nf new file mode 100644 index 0000000..3fc6b27 --- /dev/null +++ b/modules/nf-core/modules/metaphlan3/main.nf @@ -0,0 +1,45 @@ +process METAPHLAN3 { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? 'bioconda::metaphlan=3.0.12' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/metaphlan:3.0.12--pyhb7b1952_0' : + 'quay.io/biocontainers/metaphlan:3.0.12--pyhb7b1952_0' }" + + input: + tuple val(meta), path(input) + path metaphlan_db + + output: + tuple val(meta), path("*_profile.txt") , emit: profile + tuple val(meta), path("*.biom") , emit: biom + tuple val(meta), path('*.bowtie2out.txt'), optional:true, emit: bt2out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input_type = ("$input".endsWith(".fastq.gz")) ? "--input_type fastq" : ("$input".contains(".fasta")) ? "--input_type fasta" : ("$input".endsWith(".bowtie2out.txt")) ? "--input_type bowtie2out" : "--input_type sam" + def input_data = ("$input_type".contains("fastq")) && !meta.single_end ? 
"${input[0]},${input[1]}" : "$input" + def bowtie2_out = "$input_type" == "--input_type bowtie2out" || "$input_type" == "--input_type sam" ? '' : "--bowtie2out ${prefix}.bowtie2out.txt" + + """ + metaphlan \\ + --nproc $task.cpus \\ + $input_type \\ + $input_data \\ + $args \\ + $bowtie2_out \\ + --bowtie2db ${metaphlan_db} \\ + --biom ${prefix}.biom \\ + --output_file ${prefix}_profile.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + metaphlan3: \$(metaphlan --version 2>&1 | awk '{print \$3}') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/metaphlan3/meta.yml b/modules/nf-core/modules/metaphlan3/meta.yml new file mode 100644 index 0000000..d10a27d --- /dev/null +++ b/modules/nf-core/modules/metaphlan3/meta.yml @@ -0,0 +1,52 @@ +name: metaphlan3 +description: MetaPhlAn is a tool for profiling the composition of microbial communities from metagenomic shotgun sequencing data. +keywords: + - metagenomics + - classification + - fastq + - bam + - fasta +tools: + - metaphlan3: + description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance + homepage: https://huttenhower.sph.harvard.edu/metaphlan/ + documentation: https://github.com/biobakery/MetaPhlAn + doi: "10.7554/eLife.65088" + licence: ["MIT License"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: Metaphlan 3.0 can classify the metagenome from a variety of input data types, including FASTQ files (single-end and paired-end), FASTA, bowtie2-produced SAM files (produced from alignments to the MetaPHlAn marker database) and intermediate bowtie2 alignment files (bowtie2out) + pattern: "*.{fastq.gz, fasta, fasta.gz, sam, bowtie2out.txt}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - profile: + type: file + description: Tab-separated output file of the predicted taxon relative abundances + pattern: "*.{txt}" + - biom: + type: file + description: General-use format for representing biological sample by observation contingency tables + pattern: "*.{biom}" + - bowtie2out: + type: file + description: Intermediate Bowtie2 output produced from mapping the metagenome against the MetaPHlAn marker database ( not compatible with `bowtie2out` files generated with MetaPhlAn versions below 3 ) + pattern: "*.{bowtie2out.txt}" + +authors: + - "@MGordon09" From 69db9206108c7926a25b45aba7401e406e36b4c3 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Fri, 18 Mar 2022 16:01:33 +0100 Subject: [PATCH 077/214] feat: include metaphlan3 in workflow --- workflows/taxprofiler.nf | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index ce89a91..a61580c 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -60,7 +60,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/ include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' - +include { METAPHLAN3 } from '../modules/nf-core/modules/metaphlan3/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -130,6 +130,7 @@ workflow TAXPROFILER { .branch { malt: it[2]['tool'] == 'malt' kraken2: it[2]['tool'] == 'kraken2' + metaphlan3: it[2]['tool'] == 'metaphlan3' unknown: true } @@ -163,6 +164,13 @@ workflow TAXPROFILER { db: it[3] } + ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + /* MODULE: RUN PROFILING */ @@ -174,6 +182,10 @@ workflow TAXPROFILER { KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) } + if ( params.run_metaphlan3 ) { + METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) + } + /* MODULE: MultiQC */ From a425a322f294f49f0588123a555533e292bacfc7 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Fri, 18 Mar 2022 20:50:18 +0100 Subject: [PATCH 078/214] refactor: make metaphlan3 run in workflow --- nextflow.config | 3 +++ subworkflows/local/input_check.nf | 6 +++--- workflows/taxprofiler.nf | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/nextflow.config b/nextflow.config index 7be36a6..9f3b103 100644 --- a/nextflow.config +++ b/nextflow.config @@ -71,6 +71,9 @@ params { // kraken2 run_kraken2 = false + + // metaphlan3 + run_metaphlan3 = false } // Load base.config by default for all pipelines diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index ec404c2..e8669b3 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -31,9 +31,9 @@ workflow INPUT_CHECK { .set { fasta } emit: - fastq // channel: [ val(meta), [ reads ] ] - nanopore // channel: [ val(meta), [ reads ] ] - fasta // channel: [ val(meta), fasta ] + fastq = fastq ?: [] // channel: [ val(meta), [ reads ] ] + nanopore = nanopore ?: [] // channel: [ val(meta), [ reads ] ] + fasta = fasta ?: [] // channel: [ val(meta), fasta ] versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] } diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index a61580c..99a1685 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -165,9 +165,10 @@ workflow TAXPROFILER { } ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 + .dump(tag: "input_metaphlan3") .multiMap { it -> - reads: [ it[0] + it[2], it[1] ] + reads: [it[0] + it[2], it[1][0]] db: it[3] } From fb9afa013afa33c08554a740b73bd18ee6a1d57e Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Fri, 1 Apr 2022 16:33:22 +0200 Subject: [PATCH 079/214] fix: add metaphlan parameter to schema --- nextflow_schema.json | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index fb2ca31..358ba64 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -282,6 +292,10 @@ "run_kraken2": { "type": "boolean" }, + "run_metaphlan3": { + "type": "boolean", + "description": "Enable MetaPhlAn for taxonomic profiling" + }, "shortread_clipmerge_tool": { "type": "string", "default": "fastp", @@ -306,4 +320,4 @@ "default": 15 } } -} +} \ No newline at end of file From e579f9f5f33899336ea449eea41302ec0e8657f0 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 2 Apr 2022 00:28:54 +0200 Subject: [PATCH 080/214] style: apply prettier --- nextflow_schema.json | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 358ba64..1efdea8 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -320,4 +310,4 @@ "default": 15 } } -} \ No newline at end of file +} From 95ed86516b445b42e1703cd2c7c020c45fabbfb3 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 2 Apr 2022 13:59:10 +0200 Subject: [PATCH 081/214] chore: add citation --- CITATIONS.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CITATIONS.md b/CITATIONS.md index 8f286b0..0b35cce 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -34,6 +34,12 @@ > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. 
“Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico.” Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6. +- [Porechop](https://github.com/rrwick/Porechop) + +- [MetaPhlAn3](https://doi.org/10.7554/eLife.65088) + + > Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088. + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) From b0b5936fe847fd416d4f223f9f1051425fddac73 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 2 Apr 2022 14:44:27 +0200 Subject: [PATCH 082/214] chore: configure output for metaphlan3 --- conf/modules.config | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index dc8b138..5dfaa82 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -170,6 +170,15 @@ process { ] } + withName: METAPHLAN3 { + publishDir = [ + path: { "${params.outdir}/metaphlan3/${metaphlan_db.simpleName}" }, + mode: params.publish_dir_mode, + pattern: '*.{biom,txt}' + ] + ext.prefix = { "${meta.id}-${meta.run_accession}-${metaphlan_db.simpleName}" } + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, From 996b3f49126b68e273dbc94dd46302f0490a8168 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 2 Apr 2022 14:44:48 +0200 Subject: [PATCH 083/214] fix: correct the input shape to metaphlan3 --- workflows/taxprofiler.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 99a1685..54ede13 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -168,7 +168,7 @@ workflow TAXPROFILER { .dump(tag: "input_metaphlan3") .multiMap { it -> - reads: [it[0] + it[2], it[1][0]] + reads: [it[0] + it[2], it[1]] db: it[3] } @@ -217,6 +217,7 @@ workflow TAXPROFILER { // TODO MALT results overwriting per database? // TODO Versions for Karken/MALT not report? + // TODO create multiQC module for metaphlan MULTIQC ( ch_multiqc_files.collect() ) From 817678d870389a5b88b99bcf4dc7290c71dd416b Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 2 Apr 2022 14:45:07 +0200 Subject: [PATCH 084/214] chore: add metaphlan3 run to test profile --- conf/test.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/test.config b/conf/test.config index 92a10e4..b8cd47c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -26,6 +26,7 @@ params { databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' run_kraken2 = true run_malt = true + run_metaphlan3 = true shortread_clipmerge = true } From 70862570e0334e0af3a039fe790158f57c88b86d Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Sat, 2 Apr 2022 14:58:07 +0200 Subject: [PATCH 085/214] fix: use same naming scheme --- conf/modules.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 5dfaa82..37b1f2f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -172,11 +172,11 @@ process { withName: METAPHLAN3 { publishDir = [ - path: { "${params.outdir}/metaphlan3/${metaphlan_db.simpleName}" }, + path: { "${params.outdir}/metaphlan3/${meta.db_name}" }, mode: params.publish_dir_mode, pattern: '*.{biom,txt}' ] - ext.prefix = { "${meta.id}-${meta.run_accession}-${metaphlan_db.simpleName}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } withName: CUSTOM_DUMPSOFTWAREVERSIONS { From 562fa426962da46946bf9411ac3b6f68b4ae8d96 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 2 Apr 2022 15:00:55 +0200 Subject: [PATCH 086/214] fix: remove duplicate porechop entry --- CITATIONS.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 0b35cce..b18f841 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -34,8 +34,6 @@ > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. “Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico.” Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6. -- [Porechop](https://github.com/rrwick/Porechop) - - [MetaPhlAn3](https://doi.org/10.7554/eLife.65088) > Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088. From 4e22bc961084688ce9a0c6f8d1d564b70b7af912 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 2 Apr 2022 15:20:36 +0200 Subject: [PATCH 087/214] fix: force UTF-8 in different locale? --- nextflow.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 9f3b103..588c0cf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -158,7 +158,8 @@ if (!params.igenomes_ignore) { // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. env { - PYTHONNOUSERSITE = 1 + PYTHONNOUSERSITE = '1' + PYTHONUTF8 = '1' // MetaPhlAn3 may fail with a UnicodeDecodeError if the locale is not set appropriately. R_PROFILE_USER = "/.Rprofile" R_ENVIRON_USER = "/.Renviron" JULIA_DEPOT_PATH = "/usr/local/share/julia" From 41526e8a64bdbf40cc28a941e1aaa2430aa66a5d Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Sat, 2 Apr 2022 15:44:39 +0200 Subject: [PATCH 088/214] try to fix locale --- .github/workflows/ci.yml | 8 ++++++++ nextflow.config | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a8078e5..b8975b5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -46,6 +46,14 @@ jobs: wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ + - name: Show current locale + run: locale + + - name: Set UTF-8 enabled locale + run: | + sudo locale-gen en_US.UTF-8 + sudo update-locale LANG=en_US.UTF-8 + - name: Run pipeline with test data # TODO nf-core: You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters diff --git a/nextflow.config b/nextflow.config index 588c0cf..cd6de9c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -159,7 +159,7 @@ if (!params.igenomes_ignore) { env { PYTHONNOUSERSITE = '1' - PYTHONUTF8 = '1' // MetaPhlAn3 may fail with a UnicodeDecodeError if the locale is not set appropriately. + // PYTHONUTF8 = '1' // MetaPhlAn3 may fail with a UnicodeDecodeError if the locale is not set appropriately. R_PROFILE_USER = "/.Rprofile" R_ENVIRON_USER = "/.Renviron" JULIA_DEPOT_PATH = "/usr/local/share/julia" From b055df5ea076e2d82071eba9e3e692ce9d43e976 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 2 Apr 2022 17:02:05 +0200 Subject: [PATCH 089/214] Add bbduk complexity (entropy-based) filtering --- CITATIONS.md | 2 + conf/modules.config | 15 +++-- modules.json | 3 + modules/nf-core/modules/bbmap/bbduk/main.nf | 43 ++++++++++++ modules/nf-core/modules/bbmap/bbduk/meta.yml | 52 +++++++++++++++ nextflow.config | 7 ++ nextflow_schema.json | 21 +++++- subworkflows/local/longread_preprocessing.nf | 8 +-- .../local/shortread_adapterremoval.nf | 6 +- .../local/shortread_complexityfiltering.nf | 28 ++++++++ subworkflows/local/shortread_fastp.nf | 10 +-- subworkflows/local/shortread_preprocessing.nf | 6 +- workflows/taxprofiler.nf | 66 ++++++++++++------- 13 files changed, 222 insertions(+), 45 deletions(-) create mode 100644 modules/nf-core/modules/bbmap/bbduk/main.nf create mode 100644 modules/nf-core/modules/bbmap/bbduk/meta.yml create mode 100644 subworkflows/local/shortread_complexityfiltering.nf diff --git a/CITATIONS.md b/CITATIONS.md index 8f286b0..0eb8141 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -26,6 +26,8 @@ - [Porechop](https://github.com/rrwick/Porechop) +- [BBTools](http://sourceforge.net/projects/bbmap/) + - [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. “Improved Metagenomic Analysis with Kraken 2.” Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0. diff --git a/conf/modules.config b/conf/modules.config index dc8b138..601f915 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -132,7 +132,6 @@ process { ] } - withName: PORECHOP { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ @@ -142,11 +141,17 @@ process { ] } - withName: CAT_FASTQ { + withName: BBMAP_BBDUK { + ext.args = [ + "entropy=${params.shortread_complexityfilter_bbduk_entropy}", + "entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}", + params.shortread_complexityfilter_bbduk_mask ? 
"entropymask=t" : "entropymask=f" + ].join(' ').trim() + ext.prefix = { "${meta.id}-${meta.run_accession}" } publishDir = [ - path: { "${params.outdir}/prepared_sequences" }, - mode: 'copy', - pattern: '*.fastq.gz' + path: { "${params.outdir}/bbduk/" }, + mode: params.publish_dir_mode, + pattern: '*.{fastq.gz,log}' ] } diff --git a/modules.json b/modules.json index dcfbd3f..64e6c3c 100644 --- a/modules.json +++ b/modules.json @@ -6,6 +6,9 @@ "adapterremoval": { "git_sha": "f0800157544a82ae222931764483331a81812012" }, + "bbmap/bbduk": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + }, "cat/fastq": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, diff --git a/modules/nf-core/modules/bbmap/bbduk/main.nf b/modules/nf-core/modules/bbmap/bbduk/main.nf new file mode 100644 index 0000000..0ae005e --- /dev/null +++ b/modules/nf-core/modules/bbmap/bbduk/main.nf @@ -0,0 +1,43 @@ +process BBMAP_BBDUK { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? "bioconda::bbmap=38.90" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bbmap:38.90--he522d1c_1' : + 'quay.io/biocontainers/bbmap:38.90--he522d1c_1' }" + + input: + tuple val(meta), path(reads) + path contaminants + + output: + tuple val(meta), path('*.fastq.gz'), emit: reads + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def raw = meta.single_end ? "in=${reads[0]}" : "in1=${reads[0]} in2=${reads[1]}" + def trimmed = meta.single_end ? "out=${prefix}.fastq.gz" : "out1=${prefix}_1.fastq.gz out2=${prefix}_2.fastq.gz" + def contaminants_fa = contaminants ? "ref=$contaminants" : '' + """ + maxmem=\$(echo \"$task.memory\"| sed 's/ GB/g/g') + bbduk.sh \\ + -Xmx\$maxmem \\ + $raw \\ + $trimmed \\ + threads=$task.cpus \\ + $args \\ + $contaminants_fa \\ + &> ${prefix}.bbduk.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/bbmap/bbduk/meta.yml b/modules/nf-core/modules/bbmap/bbduk/meta.yml new file mode 100644 index 0000000..6abd3d9 --- /dev/null +++ b/modules/nf-core/modules/bbmap/bbduk/meta.yml @@ -0,0 +1,52 @@ +name: bbmap_bbduk +description: Adapter and quality trimming of sequencing reads +keywords: + - trimming + - adapter trimming + - quality trimming +tools: + - bbmap: + description: BBMap is a short read aligner, as well as various other bioinformatic tools. + homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + tool_dev_url: None + doi: "" + licence: ["UC-LBL license (see package)"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - contaminants: + type: file + description: | + Reference files containing adapter and/or contaminant sequences for sequence kmer matching + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified fastq reads + pattern: "*fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - log: + type: file + description: Bbduk log file + pattern: "*bbduk.log" + +authors: + - "@MGordon09" diff --git a/nextflow.config b/nextflow.config index 7be36a6..d57743a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -65,6 +65,13 @@ params { shortread_clipmerge_minlength = 15 longread_clip = false + // Complexity filtering + shortread_complexityfilter = false + shortread_complexityfilter_tool = 'bbduk' + shortread_complexityfilter_bbduk_entropy = 0.3 + shortread_complexityfilter_bbduk_windowsize = 50 + shortread_complexityfilter_bbduk_mask = false + // MALT run_malt = false malt_mode = 'BlastN' diff --git a/nextflow_schema.json b/nextflow_schema.json index fb2ca31..bd8b438 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -266,8 +266,7 @@ "type": "boolean" }, "shortread_clipmerge_excludeunmerged": { - "type": "boolean", - "default": false + "type": "boolean" }, "longread_clip": { "type": "boolean" }, @@ -304,6 +303,24 @@ "shortread_clipmerge_minlength": { "type": "integer", "default": 15 + }, + "shortread_complexityfilter_tool": { + "type": "string", + "default": "bbduk" + }, + "shortread_complexityfilter_bbduk_entropy": { + "type": "number", + "default": 0.3 + }, + "shortread_complexityfilter_bbduk_windowsize": { + "type": "integer", + "default": 50 + }, + "shortread_complexityfilter_bbduk_mask": { + "type": "boolean" + }, + "shortread_complexityfilter": { + "type": "boolean" + } } } diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index a1515c7..2fa5f3b 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -1,6 +1,6 @@ -/* -Process long raw reads with porechop -*/ +// +// Process long raw reads with porechop +// include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main' include { PORECHOP } from '../../modules/nf-core/modules/porechop/main' @@ -25,7 +25,7 @@ workflow LONGREAD_PREPROCESSING { FASTQC_PROCESSED ( PORECHOP.out.reads ) ch_versions = ch_versions.mix(PORECHOP.out.versions.first()) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) emit: diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index 5e005db..b173e4c 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -1,6 +1,6 @@ -/* -Process short raw reads with AdapterRemoval -*/ +// +// Process short raw reads with AdapterRemoval +// include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE } from '../../modules/nf-core/modules/adapterremoval/main' include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED } from '../../modules/nf-core/modules/adapterremoval/main' diff --git a/subworkflows/local/shortread_complexityfiltering.nf b/subworkflows/local/shortread_complexityfiltering.nf new file mode 100644 index 0000000..3d69ed6 --- /dev/null +++ b/subworkflows/local/shortread_complexityfiltering.nf @@ -0,0 +1,28 @@ +// +// Perform read complexity filtering +// + +include { BBMAP_BBDUK } from '../../modules/nf-core/modules/bbmap/bbduk/main' + +workflow SHORTREAD_COMPLEXITYFILTERING { + take: + reads // [ [ meta ], [ reads ] ] + 
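+    // Reads are routed on params.shortread_complexityfilter_tool: 'bbduk' applies the entropy-based BBDuk filter configured in conf/modules.config (entropy, entropywindow, entropymask); any other value passes reads through unfiltered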
main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( params.shortread_complexityfilter_tool == 'bbduk' ) { + ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads + ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( BBMAP_BBDUK.out.log ) + } else { + ch_filtered_reads = reads + } + + emit: + reads = ch_filtered_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index 48817db..18baf17 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -1,6 +1,6 @@ -/* -Process short raw reads with FastP -*/ +// +// Process short raw reads with FastP +// include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main' include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main' @@ -44,8 +44,8 @@ workflow SHORTREAD_FASTP { ch_processed_reads = ch_fastp_reads_prepped - ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) - ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json ) emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index 1d0caac..b0ac25e 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -1,5 +1,5 @@ // -// Check input samplesheet and get read channels +// Perform read trimming and merging // @@ -9,7 +9,7 @@ include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules workflow SHORTREAD_PREPROCESSING { take: - reads // file: /path/to/samplesheet.csv + reads // [ [ meta ], [ reads ] ] main: ch_versions = Channel.empty() @@ -29,7 +29,7 @@ workflow SHORTREAD_PREPROCESSING { FASTQC_PROCESSED ( ch_processed_reads ) ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index ce89a91..a403589 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -40,9 +40,10 @@ ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath(params.multi // include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { DB_CHECK } from '../subworkflows/local/db_check' -include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' -include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' +include { DB_CHECK } from '../subworkflows/local/db_check' +include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' +include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' +include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -61,7 +62,6 @@ include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fas include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -98,10 +98,6 @@ workflow TAXPROFILER { ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - CUSTOM_DUMPSOFTWAREVERSIONS ( - ch_versions.unique().collectFile(name: 'collated_versions.yml') - ) - /* SUBWORKFLOW: PERFORM PREPROCESSING */ @@ -114,17 +110,26 @@ workflow TAXPROFILER { if ( params.longread_clip ) { ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads .map { it -> [ it[0], [it[1]] ] } - ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first()) } else { ch_longreads_preprocessed = INPUT_CHECK.out.nanopore } + /* + SUBWORKFLOW: COMPLEXITY FILTERING + */ + + if ( params.shortread_complexityfilter ) { + ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads + } else { + ch_shortreads_filtered = ch_shortreads_preprocessed + } + /* COMBINE READS WITH POSSIBLE DATABASES */ // e.g. 
output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] - ch_input_for_profiling = ch_shortreads_preprocessed + ch_input_for_profiling = ch_shortreads_filtered .mix( ch_longreads_preprocessed ) .combine(DB_CHECK.out.dbs) .branch { @@ -177,6 +182,12 @@ workflow TAXPROFILER { /* MODULE: MultiQC */ + + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) + + workflow_summary = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) @@ -188,21 +199,30 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) if (params.shortread_clipmerge) { - ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_PREPROCESSING.out.mqc) - } - if (params.longread_clip) { - ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc) - } - if (params.run_kraken2) { - ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])) - ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first()) - } - if (params.run_malt) { - ch_multiqc_files = ch_multiqc_files.mix(MALT_RUN.out.log.collect{it[1]}.ifEmpty([])) - ch_versions = ch_versions.mix(MALT_RUN.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_shortclipmerge") + ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) + } + + if (params.longread_clip) { + ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_longclipmerge") + ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions ) + } + + if (params.shortread_complexityfilter){ + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_complexity") + ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) + } + + if (params.run_kraken2) { + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_kraken") + ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + } + + if (params.run_malt) { + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_malt") + ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) } - // TODO MALT results overwriting per database? // TODO Versions for Kraken/MALT not reported? MULTIQC ( ch_multiqc_files.collect() From 4eab257d68a36b333bc1c5e087e6a79721b5b87e Mon Sep 17 00:00:00 2001 From: "Moritz E.
Beber" Date: Sat, 2 Apr 2022 17:05:46 +0200 Subject: [PATCH 090/214] style: prettify comment --- subworkflows/local/shortread_adapterremoval.nf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index 5e005db..3598e3e 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -24,8 +24,10 @@ workflow SHORTREAD_ADAPTERREMOVAL { ADAPTERREMOVAL_SINGLE ( ch_input_for_adapterremoval.single, [] ) ADAPTERREMOVAL_PAIRED ( ch_input_for_adapterremoval.paired, [] ) - // due to the slightly ugly output implementation of the current AdapterRemoval2 version, each file - // has to be exported in a separate channel, and we must manually recombine when necessary + /* + * Due to the ~slightly~ very ugly output implementation of the current AdapterRemoval2 version, each file + * has to be exported in a separate channel and we must manually recombine when necessary. + */ if ( params.shortread_clipmerge_mergepairs && !params.shortread_clipmerge_excludeunmerged ) { ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed From 7f1a7fb4e7be661e3a01ab0ffb0a61fe5de314ae Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 2 Apr 2022 17:05:54 +0200 Subject: [PATCH 091/214] Add complexity filtering to test profile --- conf/test.config | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/conf/test.config b/conf/test.config index 92a10e4..bce5918 100644 --- a/conf/test.config +++ b/conf/test.config @@ -24,8 +24,9 @@ params { // TODO nf-core: Give any required params for the test so that command line flags are not needed input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' - run_kraken2 = true - run_malt = true - shortread_clipmerge = true + run_kraken2 = true + run_malt = true + shortread_clipmerge = true + shortread_complexityfiltering = true } From 469e4f36825d86bf83a341d8c95bb4768f682bfd Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 2 Apr 2022 17:07:30 +0200 Subject: [PATCH 092/214] Remove remaining debugging .dumps( --- workflows/taxprofiler.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index a403589..1c86edb 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -90,7 +90,7 @@ workflow TAXPROFILER { /* MODULE: Run FastQC */ - ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq") + ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ) FASTQC ( ch_input_for_fastqc @@ -199,27 +199,27 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) if (params.shortread_clipmerge) { - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_shortclipmerge") + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) } if (params.longread_clip) { - ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_longclipmerge") + ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) 
) ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions ) } if (params.shortread_complexityfilter){ - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_complexity") + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } if (params.run_kraken2) { - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_kraken") + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) } if (params.run_malt) { - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_malt") + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) } From 02d950c58c84f4e6c571cdb055154016fc78a22d Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 2 Apr 2022 17:18:20 +0200 Subject: [PATCH 093/214] Fix test so complexity filter actually executes --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index bce5918..3e7530c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -27,6 +27,6 @@ params { run_kraken2 = true run_malt = true shortread_clipmerge = true - shortread_complexityfiltering = true + shortread_complexityfilter = true } From be93eae64057d21c7c5da34162ac2bc51fd86922 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 2 Apr 2022 21:50:13 +0200 Subject: [PATCH 094/214] fix: insert `.fastq` file extension The MetaPhlAn3 module computes the input type from the file extension without the possibility to configure an override. Since AdapterRemoval2 creates files without `.fastq` extensions, MetaPhlAn3 input was determined to be SAM. --- modules/local/ensure_fastq_extension.nf | 31 +++++ .../local/shortread_adapterremoval.nf | 126 ++++++++++++------ 2 files changed, 118 insertions(+), 39 deletions(-) create mode 100644 modules/local/ensure_fastq_extension.nf diff --git a/modules/local/ensure_fastq_extension.nf b/modules/local/ensure_fastq_extension.nf new file mode 100644 index 0000000..6de223b --- /dev/null +++ b/modules/local/ensure_fastq_extension.nf @@ -0,0 +1,31 @@ +process ENSURE_FASTQ_EXTENSION { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "conda-forge::bash=5.0" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv2/biocontainers_v1.2.0_cv2.img' : + 'biocontainers/biocontainers:v1.2.0_cv2' }" + + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path('*.fastq.gz'), emit: reads + + script: + if (meta.single_end) { + fastq = "${reads.baseName}.fastq.gz" + """ + ln -s '${reads}' '${fastq}' + """ + } else { + first = "${reads[0].baseName}.fastq.gz" + second = "${reads[1].baseName}.fastq.gz" + """ + ln -s '${reads[0]}' '${first}' + ln -s '${reads[1]}' '${second}' + """ + } +} diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index 3598e3e..661062d 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -5,6 +5,11 @@ Process short raw reads with AdapterRemoval include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE } from '../../modules/nf-core/modules/adapterremoval/main' include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED } from '../../modules/nf-core/modules/adapterremoval/main' include { CAT_FASTQ } from '../../modules/nf-core/modules/cat/fastq/main' +include { + ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION1; + ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION2; + ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION3; +} from '../../modules/local/ensure_fastq_extension' workflow SHORTREAD_ADAPTERREMOVAL { @@ -30,59 +35,102 @@ workflow SHORTREAD_ADAPTERREMOVAL { */ if ( params.shortread_clipmerge_mergepairs && !params.shortread_clipmerge_excludeunmerged ) { - ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed - .mix( - ADAPTERREMOVAL_PAIRED.out.collapsed_truncated, - ADAPTERREMOVAL_PAIRED.out.singles_truncated, - ADAPTERREMOVAL_PAIRED.out.pair1_truncated, - ADAPTERREMOVAL_PAIRED.out.pair2_truncated - ) - .map { - meta, reads -> - def meta_new = meta.clone() - meta_new.single_end = true - [ meta_new, reads ] - } - .groupTuple() + ENSURE_FASTQ_EXTENSION1( + Channel.empty().mix( + ADAPTERREMOVAL_PAIRED.out.collapsed, + ADAPTERREMOVAL_PAIRED.out.collapsed_truncated, + ADAPTERREMOVAL_PAIRED.out.singles_truncated, + ADAPTERREMOVAL_PAIRED.out.pair1_truncated, + ADAPTERREMOVAL_PAIRED.out.pair2_truncated + ) + .map { meta, reads -> + meta.single_end = true + [meta, reads] + } + ) - ch_adapterremoval_reads_prepped = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads - .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) + CAT_FASTQ( + ENSURE_FASTQ_EXTENSION1.out.reads + .groupTuple() + ) + + ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated) + + ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads + .mix(ENSURE_FASTQ_EXTENSION2.out.reads) } else if ( params.shortread_clipmerge_mergepairs && params.shortread_clipmerge_excludeunmerged ) { - ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed - .mix( ADAPTERREMOVAL_PAIRED.out.collapsed_truncated ) - .map { - meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = true - [ meta_new, reads ] - } - .groupTuple(by: 0) + ENSURE_FASTQ_EXTENSION1( + Channel.empty().mix( + ADAPTERREMOVAL_PAIRED.out.collapsed, + ADAPTERREMOVAL_PAIRED.out.collapsed_truncated + ) + .map { meta, reads -> + meta.single_end = true + [meta, reads] + } + ) - ch_adapterremoval_reads_prepped = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads - .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) + CAT_FASTQ( + ENSURE_FASTQ_EXTENSION1.out.reads + .groupTuple() + ) + + ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated) + + 
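// CAT_FASTQ has concatenated the collapsed read sets per sample; the renamed single-end reads are mixed back in below so that one uniform *.fastq.gz channel is emitted. + 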
ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads + .mix(ENSURE_FASTQ_EXTENSION2.out.reads) } else { - ch_adapterremoval_reads_prepped = ADAPTERREMOVAL_PAIRED.out.pair1_truncated - .join( ADAPTERREMOVAL_PAIRED.out.pair2_truncated ) - .groupTuple() - .map { meta, pair1, pair2 -> - [ meta, [ pair1, pair2 ].flatten() ] - } - .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated ) - } + ENSURE_FASTQ_EXTENSION1( + ADAPTERREMOVAL_PAIRED.out.pair1_truncated + .map { meta, reads -> + meta.single_end = true + [meta, reads] + } + ) - ch_processed_reads = ch_adapterremoval_reads_prepped + ENSURE_FASTQ_EXTENSION2( + ADAPTERREMOVAL_PAIRED.out.pair2_truncated + .map { meta, reads -> + meta.single_end = true + [meta, reads] + } + ) + + ENSURE_FASTQ_EXTENSION3(ADAPTERREMOVAL_SINGLE.out.singles_truncated) + + ch_adapterremoval_reads_prepped = ENSURE_FASTQ_EXTENSION1.out.reads + .join(ENSURE_FASTQ_EXTENSION2.out.reads) + .groupTuple() + .map { meta, pair1, pair2 -> + meta.single_end = false + [ meta, [ pair1, pair2 ].flatten() ] + } + .mix(ENSURE_FASTQ_EXTENSION3.out.reads) + + } ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() ) ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() ) - ch_multiqc_files = ch_multiqc_files.mix( ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]}, ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( + ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]}, + ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]} + ) emit: - reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] - versions = ch_versions // channel: [ versions.yml ] + reads = ch_adapterremoval_reads_prepped // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] mqc = ch_multiqc_files } + +def ensureFastQExtension(row) { + def (meta, read) = row + def filename = file(read.parent).resolve("${read.baseName}.fastq.gz") + + read.renameTo(filename.toString()) + return [meta, read] +} From f5baf910be4a7a6207c6070fef407873201d3ce4 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sun, 3 Apr 2022 00:21:09 +0200 Subject: [PATCH 095/214] Apply suggestions from code review Co-authored-by: James A. Fellows Yates --- nextflow.config | 1 - subworkflows/local/shortread_adapterremoval.nf | 7 ------- 2 files changed, 8 deletions(-) diff --git a/nextflow.config b/nextflow.config index cd6de9c..bc66734 100644 --- a/nextflow.config +++ b/nextflow.config @@ -159,7 +159,6 @@ if (!params.igenomes_ignore) { env { PYTHONNOUSERSITE = '1' - // PYTHONUTF8 = '1' // MetaPhlAn3 may fail with a UnicodeDecodeError if the locale is not set appropriately. 
R_PROFILE_USER = "/.Rprofile" R_ENVIRON_USER = "/.Renviron" JULIA_DEPOT_PATH = "/usr/local/share/julia" diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index 661062d..e522a1a 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -127,10 +127,3 @@ workflow SHORTREAD_ADAPTERREMOVAL { mqc = ch_multiqc_files } -def ensureFastQExtension(row) { - def (meta, read) = row - def filename = file(read.parent).resolve("${read.baseName}.fastq.gz") - - read.renameTo(filename.toString()) - return [meta, read] -} From d3572e18787eee40ed4e856bd6caf2f573246094 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 3 Apr 2022 07:28:01 +0200 Subject: [PATCH 096/214] Final MQC fix for AR2 (remove too-early collect) --- subworkflows/local/shortread_adapterremoval.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index b173e4c..bbb108e 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -77,7 +77,7 @@ workflow SHORTREAD_ADAPTERREMOVAL { ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() ) ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() ) - ch_multiqc_files = ch_multiqc_files.mix( ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]}, ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( ADAPTERREMOVAL_PAIRED.out.log, ADAPTERREMOVAL_SINGLE.out.log ) emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] From 482112bb42ddd314de8e92b6a3c1a94e530828f7 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 3 Apr 2022 07:58:40 +0200 Subject: [PATCH 097/214] Start work on host removal --- modules.json | 11 ++- modules/nf-core/modules/bowtie2/align/main.nf | 77 +++++++++++++++++++ .../nf-core/modules/bowtie2/align/meta.yml | 51 ++++++++++++ modules/nf-core/modules/bowtie2/build/main.nf | 30 ++++++++ .../nf-core/modules/bowtie2/build/meta.yml | 33 ++++++++ .../nf-core/modules/samtools/flagstat/main.nf | 34 ++++++++ .../modules/samtools/flagstat/meta.yml | 49 ++++++++++++ nextflow.config | 4 + subworkflows/local/shortread_hostremoval.nf | 39 ++++++++++ workflows/taxprofiler.nf | 12 ++- 10 files changed, 336 insertions(+), 4 deletions(-) create mode 100644 modules/nf-core/modules/bowtie2/align/main.nf create mode 100644 modules/nf-core/modules/bowtie2/align/meta.yml create mode 100644 modules/nf-core/modules/bowtie2/build/main.nf create mode 100644 modules/nf-core/modules/bowtie2/build/meta.yml create mode 100644 modules/nf-core/modules/samtools/flagstat/main.nf create mode 100644 modules/nf-core/modules/samtools/flagstat/meta.yml create mode 100644 subworkflows/local/shortread_hostremoval.nf diff --git a/modules.json b/modules.json index dcfbd3f..7c3facc 100644 --- a/modules.json +++ b/modules.json @@ -6,6 +6,12 @@ "adapterremoval": { "git_sha": "f0800157544a82ae222931764483331a81812012" }, + "bowtie2/align": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + }, + "bowtie2/build": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + }, "cat/fastq": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, @@ -30,9 +36,12 @@ "porechop": { "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046" }, + "samtools/flagstat": { + "git_sha": "1ad73f1b2abdea9398680d6d20014838135c9a35" + }, "untar": { "git_sha": 
"e080f4c8acf5760039ed12ec1f206170f3f9a918" } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/bowtie2/align/main.nf b/modules/nf-core/modules/bowtie2/align/main.nf new file mode 100644 index 0000000..7e8a965 --- /dev/null +++ b/modules/nf-core/modules/bowtie2/align/main.nf @@ -0,0 +1,77 @@ +process BOWTIE2_ALIGN { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? 'bioconda::bowtie2=2.4.4 bioconda::samtools=1.14 conda-forge::pigz=2.6' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:4d235f41348a00533f18e47c9669f1ecb327f629-0' : + 'quay.io/biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:4d235f41348a00533f18e47c9669f1ecb327f629-0' }" + + input: + tuple val(meta), path(reads) + path index + val save_unaligned + + output: + tuple val(meta), path('*.bam') , emit: bam + tuple val(meta), path('*.log') , emit: log + tuple val(meta), path('*fastq.gz'), emit: fastq, optional:true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + def unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' + """ + INDEX=`find -L ./ -name "*.rev.1.bt2" | sed 's/.rev.1.bt2//'` + bowtie2 \\ + -x \$INDEX \\ + -U $reads \\ + --threads $task.cpus \\ + $unaligned \\ + $args \\ + 2> ${prefix}.bowtie2.log \\ + | samtools view -@ $task.cpus $args2 -bhS -o ${prefix}.bam - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + } else { + def unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' + """ + INDEX=`find -L ./ -name "*.rev.1.bt2" | sed 's/.rev.1.bt2//'` + bowtie2 \\ + -x \$INDEX \\ + -1 ${reads[0]} \\ + -2 ${reads[1]} \\ + --threads $task.cpus \\ + $unaligned \\ + $args \\ + 2> ${prefix}.bowtie2.log \\ + | samtools view -@ $task.cpus $args2 -bhS -o ${prefix}.bam - + + if [ -f ${prefix}.unmapped.fastq.1.gz ]; then + mv ${prefix}.unmapped.fastq.1.gz ${prefix}.unmapped_1.fastq.gz + fi + if [ -f ${prefix}.unmapped.fastq.2.gz ]; then + mv ${prefix}.unmapped.fastq.2.gz ${prefix}.unmapped_2.fastq.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/modules/bowtie2/align/meta.yml b/modules/nf-core/modules/bowtie2/align/meta.yml new file mode 100644 index 0000000..f80421e --- /dev/null +++ b/modules/nf-core/modules/bowtie2/align/meta.yml @@ -0,0 +1,51 @@ +name: bowtie2_align +description: Align reads to a reference genome using bowtie2 +keywords: + - align + - fasta + - genome + - reference +tools: + - bowtie2: + description: | + Bowtie 2 is an ultrafast and memory-efficient tool for aligning + sequencing reads to long reference sequences. 
+ homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml + doi: 10.1038/nmeth.1923 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - index: + type: file + description: Bowtie2 genome index files + pattern: "*.ebwt" +output: + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: Unaligned FastQ files + pattern: "*.fastq.gz" + - log: + type: file + description: Alignment log + pattern: "*.log" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/bowtie2/build/main.nf b/modules/nf-core/modules/bowtie2/build/main.nf new file mode 100644 index 0000000..a4da62d --- /dev/null +++ b/modules/nf-core/modules/bowtie2/build/main.nf @@ -0,0 +1,30 @@ +process BOWTIE2_BUILD { + tag "$fasta" + label 'process_high' + + conda (params.enable_conda ? 'bioconda::bowtie2=2.4.4' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bowtie2:2.4.4--py39hbb4e92a_0' : + 'quay.io/biocontainers/bowtie2:2.4.4--py39hbb4e92a_0' }" + + input: + path fasta + + output: + path 'bowtie2' , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + mkdir bowtie2 + bowtie2-build $args --threads $task.cpus $fasta bowtie2/${fasta.baseName} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/bowtie2/build/meta.yml b/modules/nf-core/modules/bowtie2/build/meta.yml new file mode 100644 index 0000000..2da9a21 --- /dev/null +++ b/modules/nf-core/modules/bowtie2/build/meta.yml @@ -0,0 +1,33 @@ +name: bowtie2_build +description: Builds bowtie index for reference genome +keywords: + - build + - index + - fasta + - genome + - reference +tools: + - bowtie2: + description: | + Bowtie 2 is an ultrafast and memory-efficient tool for aligning + sequencing reads to long reference sequences. + homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml + doi: 10.1038/nmeth.1923 + licence: ["GPL-3.0-or-later"] +input: + - fasta: + type: file + description: Input genome fasta file +output: + - index: + type: file + description: Bowtie2 genome index files + pattern: "*.bt2" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/samtools/flagstat/main.nf b/modules/nf-core/modules/samtools/flagstat/main.nf new file mode 100644 index 0000000..9e3440a --- /dev/null +++ b/modules/nf-core/modules/samtools/flagstat/main.nf @@ -0,0 +1,34 @@ +process SAMTOOLS_FLAGSTAT { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ?
"bioconda::samtools=1.15" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.15--h1170115_1' : + 'quay.io/biocontainers/samtools:1.15--h1170115_1' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.flagstat"), emit: flagstat + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + flagstat \\ + --threads ${task.cpus-1} \\ + $bam \\ + > ${bam}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/samtools/flagstat/meta.yml b/modules/nf-core/modules/samtools/flagstat/meta.yml new file mode 100644 index 0000000..9526906 --- /dev/null +++ b/modules/nf-core/modules/samtools/flagstat/meta.yml @@ -0,0 +1,49 @@ +name: samtools_flagstat +description: Counts the number of alignments in a BAM/CRAM/SAM file for each FLAG type +keywords: + - stats + - mapping + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: hhttp://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - flagstat: + type: file + description: File containing samtools flagstat output + pattern: "*.{flagstat}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" diff --git a/nextflow.config b/nextflow.config index 7be36a6..8aa8722 100644 --- a/nextflow.config +++ b/nextflow.config @@ -65,6 +65,10 @@ params { shortread_clipmerge_minlength = 15 longread_clip = false + // Host Removal + shortread_hostremoval_reference = null + shortread_hostremoval_index = null + // MALT run_malt = false malt_mode = 'BlastN' diff --git a/subworkflows/local/shortread_hostremoval.nf b/subworkflows/local/shortread_hostremoval.nf new file mode 100644 index 0000000..4b0861c --- /dev/null +++ b/subworkflows/local/shortread_hostremoval.nf @@ -0,0 +1,39 @@ +// +// Remove host reads via alignment and export off-target reads +// + +include { BOWTIE2_ALIGN } from '../../../modules/nf-core/modules/bowtie2/align/main' +include { BOWTIE2_BUILD } from '../../../modules/nf-core/modules/bowtie2/build/main' +include { SAMTOOLS_VIEW } from '../../../modules/nf-core/modules/samtools/view/main' +include { SAMTOOLS_FASTQ } from '../../../modules/nf-core/modules/samtools/fastq/main' +include { SAMTOOLS_FLAGSTAT } from '../../../modules/nf-core/modules/samtools/flagstat/main' + +workflow SHORTREAD_PREPROCESSING { + take: + reads // [ [ meta ], [ reads ] ] + reference // /path/to/fasta + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( !params.shortread_hostremoval_index ) { + file( , checkIfExists: true ) + BOWTIE2_BUILD ( reference ) + ch_versions = ch_versions.mix( BOWTIE2_BUILD.out.versions ) + } + + BOWTIE2_ALIGN ( reads, BOWTIE2_BUILD.out.index ) + ch_versions = ch_versions.mix( BOWTIE2_BUILD.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT.out.log ) + + SAMTOOLS_FLAGSTAT ( BOWTIE2_ALIGN.out.bam ) + ch_versions = ch_versions.mix( SAMTOOLS_FLAGSTAT.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT.out.flagstat ) + + emit: + reads = BOWTIE2_ALIGN.out.fastq // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index ce89a91..c5678f1 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -11,15 +11,21 @@ WorkflowTaxprofiler.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.databases, params.multiqc_config ] +def checkPathParamList = [ params.input, params.databases, params.shortread_hostremoval_reference, + params.shortread_hostremoval_index, params.multiqc_config + ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters -if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } -if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } +if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.databases ) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' 
} if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not accept uncollapsed paired reads. Pairs will be profiled as separate files." if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "[nf-core/taxprofiler] error: cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" +// TODO Add check if index but no reference exit 1 +if (params.shortread_hostremoval_reference ) { ch_reference = file(params.shortread_hostremoval_reference) } else { } +if (params.shortread_hostremoval_index) { ch_reference_index = file(params.shortread_hostremoval_index ) } else { ch_reference_index = [] } + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES From 066ceb2bcaeb21c0a2a07fd1a5359e30277869ce Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 3 Apr 2022 17:23:14 +0200 Subject: [PATCH 098/214] Remove flagstat as bowtie2 reports this itself --- conf/modules.config | 19 ++++++- conf/test.config | 12 +++-- modules.json | 3 -- .../nf-core/modules/samtools/flagstat/main.nf | 34 ------------- .../modules/samtools/flagstat/meta.yml | 49 ------------------- nextflow.config | 1 + nextflow_schema.json | 14 +++++- subworkflows/local/shortread_hostremoval.nf | 29 +++++------ workflows/taxprofiler.nf | 27 ++++++++-- 9 files changed, 72 insertions(+), 116 deletions(-) delete mode 100644 modules/nf-core/modules/samtools/flagstat/main.nf delete mode 100644 modules/nf-core/modules/samtools/flagstat/meta.yml diff --git a/conf/modules.config b/conf/modules.config index dc8b138..41faa62 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -132,7 +132,6 @@ process { ] } - withName: PORECHOP { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ @@ -142,6 +141,24 @@ process { ] } + withName: BOWTIE2_BUILD { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/bowtie2/build" }, + mode: 'copy', + pattern: '*.bt2' + ] + } + + withName: BOWTIE2_ALIGN { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/bowtie2/align" }, + mode: 'copy', + pattern: '*.{fastq.gz,bam}' + ] + } + withName: CAT_FASTQ { publishDir = [ path: { "${params.outdir}/prepared_sequences" }, diff --git a/conf/test.config b/conf/test.config index 92a10e4..90ea241 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,10 +22,12 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' - run_kraken2 = true - run_malt = true - shortread_clipmerge = true + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + run_kraken2 = true + run_malt = true + shortread_clipmerge = true + shortread_hostremoval = true + shortread_hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' } diff --git a/modules.json b/modules.json index 7c3facc..7395d68 100644 --- a/modules.json +++ b/modules.json @@
-36,9 +36,6 @@ "porechop": { "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046" }, - "samtools/flagstat": { - "git_sha": "1ad73f1b2abdea9398680d6d20014838135c9a35" - }, "untar": { "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" } diff --git a/modules/nf-core/modules/samtools/flagstat/main.nf b/modules/nf-core/modules/samtools/flagstat/main.nf deleted file mode 100644 index 9e3440a..0000000 --- a/modules/nf-core/modules/samtools/flagstat/main.nf +++ /dev/null @@ -1,34 +0,0 @@ -process SAMTOOLS_FLAGSTAT { - tag "$meta.id" - label 'process_low' - - conda (params.enable_conda ? "bioconda::samtools=1.15" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.15--h1170115_1' : - 'quay.io/biocontainers/samtools:1.15--h1170115_1' }" - - input: - tuple val(meta), path(bam), path(bai) - - output: - tuple val(meta), path("*.flagstat"), emit: flagstat - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - """ - samtools \\ - flagstat \\ - --threads ${task.cpus-1} \\ - $bam \\ - > ${bam}.flagstat - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/modules/samtools/flagstat/meta.yml b/modules/nf-core/modules/samtools/flagstat/meta.yml deleted file mode 100644 index 9526906..0000000 --- a/modules/nf-core/modules/samtools/flagstat/meta.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: samtools_flagstat -description: Counts the number of alignments in a BAM/CRAM/SAM file for each FLAG type -keywords: - - stats - - mapping - - counts - - bam - - sam - - cram -tools: - - samtools: - description: | - SAMtools is a set of utilities for interacting with and post-processing - short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. - These files are generated as output by short read aligners like BWA. - homepage: http://www.htslib.org/ - documentation: hhttp://www.htslib.org/doc/samtools.html - doi: 10.1093/bioinformatics/btp352 - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - bai: - type: file - description: Index for BAM/CRAM/SAM file - pattern: "*.{bai,crai,sai}" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - flagstat: - type: file - description: File containing samtools flagstat output - pattern: "*.{flagstat}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@drpatelh" diff --git a/nextflow.config b/nextflow.config index 8aa8722..e559a85 100644 --- a/nextflow.config +++ b/nextflow.config @@ -66,6 +66,7 @@ params { longread_clip = false // Host Removal + shortread_hostremoval = false shortread_hostremoval_reference = null shortread_hostremoval_index = null diff --git a/nextflow_schema.json b/nextflow_schema.json index fb2ca31..0b5162b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -266,8 +266,7 @@ "type": "boolean" }, "shortread_clipmerge_excludeunmerged": { - "type": "boolean", - "default": false + "type": "boolean" }, "longread_clip": { "type": "boolean" @@ -304,6 +303,17 @@ "shortread_clipmerge_minlength": { "type": "integer", "default": 15 + }, + "shortread_hostremoval": { + "type": "boolean" + }, + "shortread_hostremoval_reference": { + "type": "string", + "default": null + }, + "shortread_hostremoval_index": { + "type": "string", + "default": null } } } diff --git a/subworkflows/local/shortread_hostremoval.nf b/subworkflows/local/shortread_hostremoval.nf index 4b0861c..505f989 100644 --- a/subworkflows/local/shortread_hostremoval.nf +++ b/subworkflows/local/shortread_hostremoval.nf @@ -2,38 +2,33 @@ // Remove host reads via alignment and export off-target reads // -include { BOWTIE2_ALIGN } from '../../../modules/nf-core/modules/bowtie2/align/main' -include { BOWTIE2_BUILD } from '../../../modules/nf-core/modules/bowtie2/build/main' -include { SAMTOOLS_VIEW } from '../../../modules/nf-core/modules/samtools/view/main' -include { SAMTOOLS_FASTQ } from '../../../modules/nf-core/modules/samtools/fastq/main' -include { SAMTOOLS_FLAGSTAT } from '../../../modules/nf-core/modules/samtools/flagstat/main' +include { BOWTIE2_BUILD } from '../../modules/nf-core/modules/bowtie2/build/main' +include { BOWTIE2_ALIGN } from '../../modules/nf-core/modules/bowtie2/align/main' -workflow SHORTREAD_PREPROCESSING { +workflow SHORTREAD_HOSTREMOVAL { take: reads // [ [ meta ], [ reads ] ] reference // /path/to/fasta + index // /path/to/index main: ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() if ( !params.shortread_hostremoval_index ) { - file( , checkIfExists: true ) - BOWTIE2_BUILD ( reference ) - ch_versions = ch_versions.mix( BOWTIE2_BUILD.out.versions ) + ch_bowtie2_index = BOWTIE2_BUILD ( reference ).index + ch_versions = ch_versions.mix( BOWTIE2_BUILD.out.versions ) + } else { + ch_bowtie2_index = index.first() } - BOWTIE2_ALIGN ( reads, BOWTIE2_BUILD.out.index ) - ch_versions = ch_versions.mix( BOWTIE2_BUILD.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT.out.log ) - - SAMTOOLS_FLAGSTAT ( BOWTIE2_ALIGN.out.bam ) - ch_versions = ch_versions.mix( SAMTOOLS_FLAGSTAT.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT.out.flagstat ) + BOWTIE2_ALIGN ( reads, ch_bowtie2_index, true ) + ch_versions = ch_versions.mix( BOWTIE2_ALIGN.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( BOWTIE2_ALIGN.out.log ) emit: reads = BOWTIE2_ALIGN.out.fastq // channel: [ val(meta), [ reads ] ] - versions = ch_versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] mqc = ch_multiqc_files } diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index c5678f1..631aee6 100644 --- 
a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -23,8 +23,11 @@ if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-cor if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "[nf-core/taxprofiler] error: cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" // TODO Add check if index but no reference exit 1 -if (params.shortread_hostremoval_reference ) { ch_reference = file(params.shortread_hostremoval_reference) } else { } -if (params.shortread_hostremoval_index) { ch_reference_index = file(params.shortread_hostremoval_index ) } else { ch_reference_index = [] } +if (params.shortread_hostremoval && !params.shortread_hostremoval_reference) { exit 1, "[nf-core/taxprofiler] error: --shortread_hostremoval requested but no --shortread_hostremoval_reference FASTA supplied. Check input." } +if (!params.shortread_hostremoval_reference && params.shortread_hostremoval_reference_index) { exit 1, "[nf-core/taxprofiler] error: --shortread_hostremoval_index provided but no --shortread_hostremoval_reference FASTA supplied. Check input." } + +if (params.shortread_hostremoval_reference ) { ch_reference = file(params.shortread_hostremoval_reference) } else { ch_reference = [] } +if (params.shortread_hostremoval_index ) { ch_reference_index = file(params.shortread_hostremoval_index ) } else { ch_reference_index = [] } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -49,6 +52,7 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' include { DB_CHECK } from '../subworkflows/local/db_check' include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' +include { SHORTREAD_HOSTREMOVAL } from '../subworkflows/local/shortread_hostremoval' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -120,17 +124,24 @@ workflow TAXPROFILER { if ( params.longread_clip ) { ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads .map { it -> [ it[0], [it[1]] ] } - ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first()) - } else { + ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first()) + } else {SHORTREAD_HOSTREMOVAL ch_longreads_preprocessed = INPUT_CHECK.out.nanopore } + if ( params.shortread_hostremoval ) { + ch_shortreads_hostremoved = SHORTREAD_HOSTREMOVAL ( ch_shortreads_preprocessed, ch_reference, ch_reference_index ).reads + ch_versions = ch_versions.mix(SHORTREAD_HOSTREMOVAL.out.versions.first()) + } else { + ch_shortreads_hostremoved = ch_shortreads_preprocessed + } + /* COMBINE READS WITH POSSIBLE DATABASES */ // e.g. 
output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] - ch_input_for_profiling = ch_shortreads_preprocessed + ch_input_for_profiling = ch_shortreads_hostremoved .mix( ch_longreads_preprocessed ) .combine(DB_CHECK.out.dbs) .branch { @@ -196,9 +207,15 @@ workflow TAXPROFILER { if (params.shortread_clipmerge) { ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_PREPROCESSING.out.mqc) } + + if (params.shortread_hostremoval) { + ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_HOSTREMOVAL.out.mqc.collect{it[1]}.ifEmpty([])) + } + if (params.longread_clip) { ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc) } + if (params.run_kraken2) { ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])) ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first()) From a76576c16b44be09548f8f8c6bede81aef2a4d99 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 3 Apr 2022 17:24:50 +0200 Subject: [PATCH 099/214] Prettier --- modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 398c351..cd1a0a9 100644 --- a/modules.json +++ b/modules.json @@ -44,4 +44,4 @@ } } } -} \ No newline at end of file +} From 3a562065138a4ef9035e43b558c844f58a36611d Mon Sep 17 00:00:00 2001 From: sofstam Date: Mon, 4 Apr 2022 16:55:48 +0200 Subject: [PATCH 100/214] Centrifuge classification --- nextflow.config | 2 +- nextflow_schema.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 725f892..a99481c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -74,7 +74,7 @@ params { // centrifuge run_centrifuge = false - centrifuge_db_name = false + centrifuge_db_name = 'minigut_cf' centrifuge_save_unaligned = false centrifuge_save_aligned = false centrifuge_sam_format = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 3ec2aae..777b82f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -336,7 +336,7 @@ }, "centrifuge_db_name": { "type": "string", - "default": "false" + "default": null } } } \ No newline at end of file From a384162810cb1fe1d800d7c3f2053f01c25083da Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 4 Apr 2022 21:16:51 +0200 Subject: [PATCH 101/214] Add prinseq as alternative complexity filtering --- CITATIONS.md | 12 ++-- conf/modules.config | 15 ++++- modules.json | 5 +- .../nf-core/modules/prinseqplusplus/main.nf | 61 +++++++++++++++++++ .../nf-core/modules/prinseqplusplus/meta.yml | 60 ++++++++++++++++++ nextflow.config | 13 ++-- nextflow_schema.json | 37 ++++++++--- .../local/shortread_complexityfiltering.nf | 6 +- 8 files changed, 189 insertions(+), 20 deletions(-) create mode 100644 modules/nf-core/modules/prinseqplusplus/main.nf create mode 100644 modules/nf-core/modules/prinseqplusplus/meta.yml diff --git a/CITATIONS.md b/CITATIONS.md index 0eb8141..ec424b1 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -18,23 +18,27 @@ - [fastp](https://doi.org/10.1093/bioinformatics/bty560) - > Chen, Shifu, Yanqing Zhou, Yaru Chen, and Jia Gu. 2018. “Fastp: An Ultra-Fast All-in-One FASTQ Preprocessor.” Bioinformatics 34 (17): i884-90. 10.1093/bioinformatics/bty560. + > Chen, Shifu, Yanqing Zhou, Yaru Chen, and Jia Gu. 2018. Fastp: An Ultra-Fast All-in-One FASTQ Preprocessor. Bioinformatics 34 (17): i884-90. 10.1093/bioinformatics/bty560. 
- [AdapterRemoval2](https://doi.org/10.1186/s13104-016-1900-2) - > Schubert, Mikkel, Stinus Lindgreen, and Ludovic Orlando. 2016. “AdapterRemoval v2: Rapid Adapter Trimming, Identification, and Read Merging.” BMC Research Notes 9 (February): 88. doi:10.1186/s13104-016-1900-2. + > Schubert, Mikkel, Stinus Lindgreen, and Ludovic Orlando. 2016. AdapterRemoval v2: Rapid Adapter Trimming, Identification, and Read Merging. BMC Research Notes 9 (February): 88. doi:10.1186/s13104-016-1900-2. - [Porechop](https://github.com/rrwick/Porechop) - [BBTools](http://sourceforge.net/projects/bbmap/) +- [PRINSEQ++](https://doi.org/10.7287/peerj.preprints.27553v1) + + > Cantu, Vito Adrian, Jeffrey Sadural, and Robert Edwards. 2019. PRINSEQ++, a Multi-Threaded Tool for Fast and Efficient Quality Control and Preprocessing of Sequencing Datasets. e27553v1. PeerJ Preprints. doi: 10.7287/peerj.preprints.27553v1. + - [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) - > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. “Improved Metagenomic Analysis with Kraken 2.” Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0. + > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0. - [MALT](https://doi.org/10.1038/s41559-017-0446-6) - > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. “Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico.” Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6. + > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico. Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6. ## Software packaging/containerisation tools diff --git a/conf/modules.config b/conf/modules.config index 601f915..9d2edc3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -143,7 +143,7 @@ process { withName: BBMAP_BBDUK { ext.args = [ - "entropy=${params.shortread_complexityfilter_bbduk_entropy}", + "entropy=${params.shortread_complexityfilter_entropy}", "entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}", params.shortread_complexityfilter_bbduk_mask ? "entropymask=t" : "entropymask=f" ].join(' ').trim() @@ -155,6 +155,19 @@ process { ] } + withName: PRINSEQPLUSPLUS { + ext.args = [ + params.shortread_complexityfilter_prinseqplusplus_mode == 'dust' ? 
"-lc_dust=${params.shortread_complexityfilter_prinseqplusplus_dustscore}" : "-lc_entropy=${params.shortread_complexityfilter_entropy}", + "-trim_qual_left=0 -trim_qual_left=0 -trim_qual_window=0 -trim_qual_step=0" + ].join(' ').trim() + ext.prefix = { "${meta.id}-${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/prinseqplusplus/" }, + mode: params.publish_dir_mode, + pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz,log}' + ] + } + withName: MALT_RUN { ext.args = { "${meta.db_params}" } ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } diff --git a/modules.json b/modules.json index 64e6c3c..355dc9e 100644 --- a/modules.json +++ b/modules.json @@ -33,9 +33,12 @@ "porechop": { "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046" }, + "prinseqplusplus": { + "git_sha": "f1c5384c31e985591716afdd732cf8c2ae29d05b" + }, "untar": { "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/prinseqplusplus/main.nf b/modules/nf-core/modules/prinseqplusplus/main.nf new file mode 100644 index 0000000..ebd8c58 --- /dev/null +++ b/modules/nf-core/modules/prinseqplusplus/main.nf @@ -0,0 +1,61 @@ +process PRINSEQPLUSPLUS { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::prinseq-plus-plus=1.2.3" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/prinseq-plus-plus:1.2.3--hc90279e_1': + 'quay.io/biocontainers/prinseq-plus-plus:1.2.3--hc90279e_1' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*_good_out*.fastq.gz") , emit: good_reads + tuple val(meta), path("*_single_out*.fastq.gz"), optional: true, emit: single_reads + tuple val(meta), path("*_bad_out*.fastq.gz") , optional: true, emit: bad_reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (meta.single_end) { + """ + prinseq++ \\ + -threads $task.cpus \\ + -fastq ${reads} \\ + -out_name ${prefix} \\ + -out_gz \\ + -VERBOSE 1 \\ + $args \\ + | tee ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prinseqplusplus: \$(echo \$(prinseq++ --version | cut -f 2 -d ' ' )) + END_VERSIONS + """ + } else { + """ + prinseq++ \\ + -threads $task.cpus \\ + -fastq ${reads[0]} \\ + -fastq2 ${reads[1]} \\ + -out_name ${prefix} \\ + -out_gz \\ + -VERBOSE 1 \\ + $args \\ + | tee ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prinseqplusplus: \$(echo \$(prinseq++ --version | cut -f 2 -d ' ' )) + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/modules/prinseqplusplus/meta.yml b/modules/nf-core/modules/prinseqplusplus/meta.yml new file mode 100644 index 0000000..8155df9 --- /dev/null +++ b/modules/nf-core/modules/prinseqplusplus/meta.yml @@ -0,0 +1,60 @@ +name: "prinseqplusplus" +description: PRINSEQ++ is a C++ implementation of the prinseq-lite.pl program. 
It can be used to filter, reformat or trim genomic and metagenomic sequence data +keywords: + - fastq + - fasta + - filter + - trim +tools: + - "prinseqplusplus": + description: "PRINSEQ++ - Multi-threaded C++ sequence cleaning" + homepage: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus" + documentation: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus" + tool_dev_url: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus" + doi: "10.7287/peerj.preprints.27553v1" + licence: "['GPL v2']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end + data, respectively. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - good_reads: + type: file + description: Reads passing filter(s) in gzipped FASTQ format + pattern: "*_good_out_{R1,R2}.fastq.gz" + - single_reads: + type: file + description: | + Single reads without the pair passing filter(s) in gzipped FASTQ format + pattern: "*_single_out_{R1,R2}.fastq.gz" + - bad_reads: + type: file + description: | + Reads not passing filter(s) in gzipped FASTQ format + pattern: "*_bad_out_{R1,R2}.fastq.gz" + - log: + type: file + description: | + Verbose level 2 STDOUT information in a log file + pattern: "*.log" + +authors: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index d57743a..03dfbb9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -66,11 +66,14 @@ params { longread_clip = false // Complexity filtering - shortread_complexityfilter = false - shortread_complexityfilter_tool = 'bbduk' - shortread_complexityfilter_bbduk_entropy = 0.3 - shortread_complexityfilter_bbduk_windowsize = 50 - shortread_complexityfilter_bbduk_mask = false + shortread_complexityfilter = false + shortread_complexityfilter_tool = 'bbduk' + shortread_complexityfilter_entropy = 0.3 + shortread_complexityfilter_bbduk_windowsize = 50 + shortread_complexityfilter_bbduk_mask = false + shortread_complexityfilter_prinseqplusplus_mode = 'entropy' + shortread_complexityfilter_prinseqplusplus_dustscore = 0.5 + // MALT run_malt = false diff --git a/nextflow_schema.json b/nextflow_schema.json index bd8b438..6dee045 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files.
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -284,7 +294,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -308,10 +321,6 @@ "type": "string", "default": "bbduk" }, - "shortread_complexityfilter_bbduk_entropy": { - "type": "number", - "default": 0.3 - }, "shortread_complexityfilter_bbduk_windowsize": { "type": "integer", "default": 50 @@ -321,6 +330,18 @@ }, "shortread_complexityfilter": { "type": "boolean" + }, + "shortread_complexityfilter_entropy": { + "type": "number", + "default": 0.3 + }, + "shortread_complexityfilter_prinseqplusplus_mode": { + "type": "string", + "default": "entropy" + }, + "shortread_complexityfilter_prinseqplusplus_dustscore": { + "type": "number", + "default": 0.5 } } -} +} \ No newline at end of file diff --git a/subworkflows/local/shortread_complexityfiltering.nf b/subworkflows/local/shortread_complexityfiltering.nf index 3d69ed6..12686d7 100644 --- a/subworkflows/local/shortread_complexityfiltering.nf +++ b/subworkflows/local/shortread_complexityfiltering.nf @@ -2,7 +2,8 @@ // Check input samplesheet and get read channels // -include { BBMAP_BBDUK } from '../../modules/nf-core/modules/bbmap/bbduk/main' +include { BBMAP_BBDUK } from '../../modules/nf-core/modules/bbmap/bbduk/main' +include { PRINSEQPLUSPLUS } from '../../modules/nf-core/modules/prinseqplusplus/main' workflow SHORTREAD_COMPLEXITYFILTERING { take: @@ -16,6 +17,9 @@ workflow SHORTREAD_COMPLEXITYFILTERING { ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( BBMAP_BBDUK.out.log ) + } else if ( params.shortread_complexityfilter_tool == 'prinseqplusplus' ) { + ch_filtered_reads = PRINSEQPLUSPLUS ( reads ).good_reads + ch_versions = ch_versions.mix( PRINSEQPLUSPLUS.out.versions.first() ) } else { ch_filtered_reads = reads } From 613f6a5565dbaaf6bef06ba980a6b017506490f7 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 4 Apr 2022 21:21:29 +0200 Subject: [PATCH 102/214] Prettier --- modules.json | 2 +- nextflow_schema.json | 21 ++++----------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/modules.json b/modules.json index 355dc9e..d8ac3db 100644 --- a/modules.json +++ b/modules.json @@ -41,4 +41,4 @@ } } } -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 6dee045..75e1abe 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -294,10 +284,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -344,4 +331,4 @@ "default": 0.5 } } -} \ No newline at end of file +} From c54c32c4947df993231c364ac3fe19f2abb85eed Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 5 Apr 2022 10:18:34 +0200 Subject: [PATCH 103/214] chore: update adapterremoval --- modules.json | 4 +- .../nf-core/modules/adapterremoval/main.nf | 44 ++++++++++++++----- .../nf-core/modules/adapterremoval/meta.yml | 14 +++--- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/modules.json b/modules.json index 9953edb..51f13ea 100644 --- a/modules.json +++ b/modules.json @@ -4,7 +4,7 @@ "repos": { "nf-core/modules": { "adapterremoval": { - "git_sha": "f0800157544a82ae222931764483331a81812012" + "git_sha": "879d42c5e28661fe0a5e744c9e2c515868f9e08a" }, "cat/fastq": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" @@ -38,4 +38,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/adapterremoval/main.nf b/modules/nf-core/modules/adapterremoval/main.nf index 9d16b9c..0e17c05 100644 --- a/modules/nf-core/modules/adapterremoval/main.nf +++ b/modules/nf-core/modules/adapterremoval/main.nf @@ -12,15 +12,14 @@ process ADAPTERREMOVAL { path(adapterlist) output: - tuple val(meta), path("${prefix}.truncated.gz") , optional: true, emit: singles_truncated - tuple val(meta), path("${prefix}.discarded.gz") , optional: true, emit: discarded - tuple val(meta), path("${prefix}.pair1.truncated.gz") , optional: true, emit: pair1_truncated - tuple val(meta), path("${prefix}.pair2.truncated.gz") , optional: true, emit: pair2_truncated - tuple val(meta), path("${prefix}.collapsed.gz") , optional: true, emit: collapsed - tuple val(meta), path("${prefix}.collapsed.truncated.gz") , optional: true, emit: collapsed_truncated - tuple val(meta), path("${prefix}.paired.gz") , optional: true, emit: paired_interleaved - tuple val(meta), path('*.log') , emit: log - path "versions.yml" , emit: versions + tuple val(meta), path("${prefix}.truncated.fastq.gz") , optional: true, emit: singles_truncated + tuple val(meta), path("${prefix}.discarded.fastq.gz") , optional: true, emit: discarded + tuple val(meta), path("${prefix}.pair{1,2}.truncated.fastq.gz") , optional: true, emit: paired_truncated + tuple val(meta), path("${prefix}.collapsed.fastq.gz") , optional: true, emit: collapsed + tuple val(meta), path("${prefix}.collapsed.truncated.fastq.gz") , optional: true, emit: collapsed_truncated + tuple val(meta), path("${prefix}.paired.fastq.gz") , optional: true, emit: paired_interleaved + tuple val(meta), path('*.settings') , emit: settings + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -38,10 +37,19 @@ process ADAPTERREMOVAL { $adapterlist \\ --basename ${prefix} \\ --threads ${task.cpus} \\ - --settings ${prefix}.log \\ --seed 42 \\ --gzip + ensure_fastq() { + if [ -f "\${1}" ]; then + mv "\${1}" "\${1::-3}.fastq.gz" + fi + + } + + ensure_fastq '${prefix}.truncated.gz' + ensure_fastq '${prefix}.discarded.gz' + cat 
<<-END_VERSIONS > versions.yml "${task.process}": adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g") @@ -56,10 +64,24 @@ process ADAPTERREMOVAL { $adapterlist \\ --basename ${prefix} \\ --threads $task.cpus \\ - --settings ${prefix}.log \\ --seed 42 \\ --gzip + ensure_fastq() { + if [ -f "\${1}" ]; then + mv "\${1}" "\${1::-3}.fastq.gz" + fi + + } + + ensure_fastq '${prefix}.truncated.gz' + ensure_fastq '${prefix}.discarded.gz' + ensure_fastq '${prefix}.pair1.truncated.gz' + ensure_fastq '${prefix}.pair2.truncated.gz' + ensure_fastq '${prefix}.collapsed.gz' + ensure_fastq '${prefix}.collapsed.truncated.gz' + ensure_fastq '${prefix}.paired.gz' + cat <<-END_VERSIONS > versions.yml "${task.process}": adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g") diff --git a/modules/nf-core/modules/adapterremoval/meta.yml b/modules/nf-core/modules/adapterremoval/meta.yml index 5faad04..77273f6 100644 --- a/modules/nf-core/modules/adapterremoval/meta.yml +++ b/modules/nf-core/modules/adapterremoval/meta.yml @@ -43,43 +43,43 @@ output: Adapter trimmed FastQ files of either single-end reads, or singleton 'orphaned' reads from merging of paired-end data (i.e., one of the pair was lost due to filtering thresholds). - pattern: "*.truncated.gz" + pattern: "*.truncated.fastq.gz" - discarded: type: file description: | Adapter trimmed FastQ files of reads that did not pass filtering thresholds. - pattern: "*.discarded.gz" + pattern: "*.discarded.fastq.gz" - pair1_truncated: type: file description: | Adapter trimmed R1 FastQ files of paired-end reads that did not merge with their respective R2 pair due to long templates. The respective pair is stored in 'pair2_truncated'. - pattern: "*.pair1.truncated.gz" + pattern: "*.pair1.truncated.fastq.gz" - pair2_truncated: type: file description: | Adapter trimmed R2 FastQ files of paired-end reads that did not merge with their respective R1 pair due to long templates. The respective pair is stored in 'pair1_truncated'. - pattern: "*.pair2.truncated.gz" + pattern: "*.pair2.truncated.fastq.gz" - collapsed: type: file description: | Collapsed FastQ of paired-end reads that successfully merged with their respective R1 pair but were not trimmed. - pattern: "*.collapsed.gz" + pattern: "*.collapsed.fastq.gz" - collapsed_truncated: type: file description: | Collapsed FastQ of paired-end reads that successfully merged with their respective R1 pair and were trimmed of adapter due to sufficient overlap. - pattern: "*.collapsed.truncated.gz" + pattern: "*.collapsed.truncated.fastq.gz" - log: type: file description: AdapterRemoval log file - pattern: "*.log" + pattern: "*.settings" - versions: type: file description: File containing software versions From 0e9b2989e3fd59b6e05a77a8faf6ed89d6ddea03 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Tue, 5 Apr 2022 10:53:04 +0200 Subject: [PATCH 104/214] refactor: remove superfluous ENSURE_FASTQ_EXTENSION --- .../local/shortread_adapterremoval.nf | 75 ++++--------------- 1 file changed, 16 insertions(+), 59 deletions(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index e522a1a..8c02f6f 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -5,11 +5,6 @@ Process short raw reads with AdapterRemoval include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE } from '../../modules/nf-core/modules/adapterremoval/main' include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED } from '../../modules/nf-core/modules/adapterremoval/main' include { CAT_FASTQ } from '../../modules/nf-core/modules/cat/fastq/main' -include { - ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION1; - ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION2; - ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION3; -} from '../../modules/local/ensure_fastq_extension' workflow SHORTREAD_ADAPTERREMOVAL { @@ -36,34 +31,25 @@ workflow SHORTREAD_ADAPTERREMOVAL { if ( params.shortread_clipmerge_mergepairs && !params.shortread_clipmerge_excludeunmerged ) { - ENSURE_FASTQ_EXTENSION1( - Channel.empty().mix( + ch_concat_fastq = Channel.empty() + .mix( ADAPTERREMOVAL_PAIRED.out.collapsed, ADAPTERREMOVAL_PAIRED.out.collapsed_truncated, ADAPTERREMOVAL_PAIRED.out.singles_truncated, - ADAPTERREMOVAL_PAIRED.out.pair1_truncated, - ADAPTERREMOVAL_PAIRED.out.pair2_truncated + ADAPTERREMOVAL_PAIRED.out.paired_truncated ) - .map { meta, reads -> - meta.single_end = true - [meta, reads] - } - ) + .groupTuple() + .map { [it.head(), it.tail().flatten()] } - CAT_FASTQ( - ENSURE_FASTQ_EXTENSION1.out.reads - .groupTuple() - ) - - ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated) + CAT_FASTQ(ch_concat_fastq) ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads - .mix(ENSURE_FASTQ_EXTENSION2.out.reads) + .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated) } else if ( params.shortread_clipmerge_mergepairs && params.shortread_clipmerge_excludeunmerged ) { - ENSURE_FASTQ_EXTENSION1( - Channel.empty().mix( + ch_concat_fastq = Channel.empty() + .mix( ADAPTERREMOVAL_PAIRED.out.collapsed, ADAPTERREMOVAL_PAIRED.out.collapsed_truncated ) @@ -71,54 +57,25 @@ workflow SHORTREAD_ADAPTERREMOVAL { meta.single_end = true [meta, reads] } - ) + .groupTuple() - CAT_FASTQ( - ENSURE_FASTQ_EXTENSION1.out.reads - .groupTuple() - ) - - ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated) + CAT_FASTQ(ch_concat_fastq) ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads - .mix(ENSURE_FASTQ_EXTENSION2.out.reads) + .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated) } else { - ENSURE_FASTQ_EXTENSION1( - ADAPTERREMOVAL_PAIRED.out.pair1_truncated - .map { meta, reads -> - meta.single_end = true - [meta, reads] - } - ) - - ENSURE_FASTQ_EXTENSION2( - ADAPTERREMOVAL_PAIRED.out.pair2_truncated - .map { meta, reads -> - meta.single_end = true - [meta, reads] - } - ) - - ENSURE_FASTQ_EXTENSION3(ADAPTERREMOVAL_SINGLE.out.singles_truncated) - - ch_adapterremoval_reads_prepped = ENSURE_FASTQ_EXTENSION1.out.reads - .join(ENSURE_FASTQ_EXTENSION2.out.reads) - .groupTuple() - .map { meta, pair1, pair2 -> - meta.single_end = false - [ meta, [ pair1, pair2 ].flatten() ] - } - .mix(ENSURE_FASTQ_EXTENSION3.out.reads) + ch_adapterremoval_reads_prepped = ADAPTERREMOVAL_PAIRED.out.paired_truncated + .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated) } 
ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() ) ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( - ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]}, - ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]} + ADAPTERREMOVAL_PAIRED.out.settings.collect{it[1]}, + ADAPTERREMOVAL_SINGLE.out.settings.collect{it[1]} ) emit: From d46ddd972c2fc2c3ce0e7fe03b3ad42a4821332a Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 5 Apr 2022 11:04:30 +0200 Subject: [PATCH 105/214] Expand tests and fix schema after review --- .github/workflows/ci.yml | 8 ++++++++ conf/test.config | 13 +++++++------ nextflow.config | 2 +- nextflow_schema.json | 3 ++- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b8975b5..79148f0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,8 +29,16 @@ jobs: - NXF_VER: "" NXF_EDGE: "1" parameters: + - "--longread_clip false" + - "--shortread_clip false" - "--shortread_clipmerge_tool fastp" + - "--shortread_clipmerge_tool fastp --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged" + - "--shortread_clipmerge_tool fastp --shortread_clipmerge_mergepairs" - "--shortread_clipmerge_tool adapterremoval" + - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged" + - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs" + - "--shortread_complexityfilter_tool bbduk" + - "--shortread_complexityfilter_tool prinseq" steps: - name: Check out pipeline code diff --git a/conf/test.config b/conf/test.config index ff92fe7..bdad2c1 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,11 +22,12 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' - run_kraken2 = true - run_malt = true - run_metaphlan3 = true - shortread_clipmerge = true + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + run_kraken2 = true + run_malt = true + run_metaphlan3 = true + shortread_clipmerge = true + longread_clip = false shortread_complexityfilter = true } diff --git a/nextflow.config b/nextflow.config index cd81f08..e6d6307 100644 --- a/nextflow.config +++ b/nextflow.config @@ -51,7 +51,7 @@ params { max_cpus = 16 max_time = '240.h' - // Databaess + // Databases databases = null // FASTQ preprocessing diff --git a/nextflow_schema.json b/nextflow_schema.json index bec9ea4..6f6125e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -328,7 +328,8 @@ }, "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", - "default": "entropy" + "default": "entropy", + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", From 59b2088aa183a58523f712ec5ee85cbdab8e1647 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Tue, 5 Apr 2022 11:16:43 +0200 Subject: [PATCH 106/214] style: apply prettier --- modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 51f13ea..becb501 100644 --- a/modules.json +++ b/modules.json @@ -38,4 +38,4 @@ } } } -} \ No newline at end of file +} From 39e5e802c7af4e7e36927afa471801ff3b8aab6e Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 5 Apr 2022 11:29:39 +0200 Subject: [PATCH 107/214] refactor: make code more explicit, add comment --- subworkflows/local/shortread_adapterremoval.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index 8c02f6f..906a33f 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -39,7 +39,9 @@ workflow SHORTREAD_ADAPTERREMOVAL { ADAPTERREMOVAL_PAIRED.out.paired_truncated ) .groupTuple() - .map { [it.head(), it.tail().flatten()] } + // Paired-end reads cause a nested tuple during grouping. + // We want to present a flat list of files to `CAT_FASTQ`. + .map { meta, fastq -> [meta, fastq.flatten()] } CAT_FASTQ(ch_concat_fastq) From 245a4d1f5dae7e29ec76aa1265154edbbe3eb3db Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 5 Apr 2022 13:08:44 +0200 Subject: [PATCH 108/214] Fix MQC staging and remove debugging dump --- subworkflows/local/shortread_adapterremoval.nf | 4 ++-- workflows/taxprofiler.nf | 13 ++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index 9d49b10..473a05f 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -117,8 +117,8 @@ workflow SHORTREAD_ADAPTERREMOVAL { ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() ) ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( - ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]}, - ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]} + ADAPTERREMOVAL_PAIRED.out.log, + ADAPTERREMOVAL_SINGLE.out.log ) emit: diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index ddd5914..eb2b461 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -171,7 +171,6 @@ workflow TAXPROFILER { } ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 - .dump(tag: "input_metaphlan3") .multiMap { it -> reads: [it[0] + it[2], it[1]] @@ -213,34 +212,34 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) if (params.shortread_clipmerge) { - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]).dump(tag: "clipmerge") ) ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) } if (params.longread_clip) { - ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) + ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]).dump(tag: "clip") ) ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions ) } if (params.shortread_complexityfilter){ - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) + ch_multiqc_files 
= ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]).dump(tag: "complex") ) ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } if (params.run_kraken2) { - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]).dump(tag: "kraken") ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) } if (params.run_malt) { - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]).dump(tag: "malt") ) ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) } // TODO Versions for Karken/MALT not report? // TODO create multiQC module for metaphlan MULTIQC ( - ch_multiqc_files.collect() + ch_multiqc_files.collect().dump(tag: "input_to_mqc") ) multiqc_report = MULTIQC.out.report.toList() ch_versions = ch_versions.mix(MULTIQC.out.versions) From 5b9355725dea7af56745a28e6807ae5768798e8b Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 5 Apr 2022 13:14:06 +0200 Subject: [PATCH 109/214] Whoops --- workflows/taxprofiler.nf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index eb2b461..8dfe996 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -17,7 +17,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } -if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not except uncollapsed paired-reads. Pairs will be profiled as separate files." +if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "[nf-core/taxprofiler] error: cannot include unmerged reads when merging not turned on. 
Please specify --shortread_clipmerge_mergepairs" /* @@ -212,34 +212,34 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) if (params.shortread_clipmerge) { - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]).dump(tag: "clipmerge") ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) } if (params.longread_clip) { - ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]).dump(tag: "clip") ) + ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions ) } if (params.shortread_complexityfilter){ - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]).dump(tag: "complex") ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } if (params.run_kraken2) { - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]).dump(tag: "kraken") ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) } if (params.run_malt) { - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]).dump(tag: "malt") ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) } // TODO Versions for Karken/MALT not report? 
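    // Editor's note — a hedged sketch, not part of this patch: the repeated
    // `.collect{ it[1] }.ifEmpty([])` idiom in the hunk above drops the meta
    // map from each [ meta, file ] tuple, gathers all report files into a
    // single emission for MultiQC, and falls back to an empty list when a
    // step was skipped, so the mixed channel still provides a value. Roughly:
    //
    //   Channel.empty().collect{ it[1] }.ifEmpty([]).view()   // -> []
    //
    // Without ifEmpty([]), a skipped step's channel would emit nothing at all
    // rather than an empty list.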
// TODO create multiQC module for metaphlan MULTIQC ( - ch_multiqc_files.collect().dump(tag: "input_to_mqc") + ch_multiqc_files.collect() ) multiqc_report = MULTIQC.out.report.toList() ch_versions = ch_versions.mix(MULTIQC.out.versions) From 98f082d7b6df9e7ef8da41b98bafab63770d3582 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 5 Apr 2022 13:17:54 +0200 Subject: [PATCH 110/214] Fix mistake in previous upstream merge with AR2 output channel for settings file --- subworkflows/local/shortread_adapterremoval.nf | 5 +++-- workflows/taxprofiler.nf | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index 6512596..bfed76a 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -75,9 +75,10 @@ workflow SHORTREAD_ADAPTERREMOVAL { ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() ) ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( - ADAPTERREMOVAL_PAIRED.out.log, - ADAPTERREMOVAL_SINGLE.out.log + ADAPTERREMOVAL_PAIRED.out.settings, + ADAPTERREMOVAL_SINGLE.out.settings ) emit: diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 8dfe996..3b08402 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -222,7 +222,7 @@ workflow TAXPROFILER { } if (params.shortread_complexityfilter){ - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } From 82aa89ad63d8769989b533da4695dc9387d81355 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 5 Apr 2022 13:55:11 +0200 Subject: [PATCH 111/214] re add missing switch meta of merged reads to true --- subworkflows/local/shortread_adapterremoval.nf | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index bfed76a..b573be9 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -38,11 +38,17 @@ workflow SHORTREAD_ADAPTERREMOVAL { ADAPTERREMOVAL_PAIRED.out.singles_truncated, ADAPTERREMOVAL_PAIRED.out.paired_truncated ) + .map { meta, reads -> + def meta_new = meta.clone() + meta_new.single_end = true + [meta_new, reads] + } .groupTuple() // Paired-end reads cause a nested tuple during grouping. // We want to present a flat list of files to `CAT_FASTQ`. .map { meta, fastq -> [meta, fastq.flatten()] } + CAT_FASTQ(ch_concat_fastq) ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads @@ -56,10 +62,13 @@ workflow SHORTREAD_ADAPTERREMOVAL { ADAPTERREMOVAL_PAIRED.out.collapsed_truncated ) .map { meta, reads -> - meta.single_end = true - [meta, reads] + def meta_new = meta.clone() + meta_new.single_end = true + [meta_new, reads] } .groupTuple() + .map { meta, fastq -> [meta, fastq.flatten()] } + CAT_FASTQ(ch_concat_fastq) From c40a9a00113c3e178bf20a4f1e0870d0994d9383 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Tue, 5 Apr 2022 12:13:45 +0200 Subject: [PATCH 112/214] refactor: avoid publishing unpacked databases --- conf/modules.config | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 2a8789f..22db8d0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,11 +35,7 @@ process { } withName: UNTAR { - publishDir = [ - path: { "${params.outdir}/databases" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + publishDir = null } withName: FASTQC { From af12435fa5e3adf240d48bf9f764a063c62593b8 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 5 Apr 2022 12:16:26 +0200 Subject: [PATCH 113/214] fix: use parameter for publish mode everywhere --- conf/modules.config | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 22db8d0..a910b8e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -43,7 +43,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } publishDir = [ path: { "${params.outdir}/fastqc/raw" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.html' ] } @@ -53,7 +53,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } publishDir = [ path: { "${params.outdir}/fastqc/processed" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.html' ] } @@ -69,7 +69,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ path: { "${params.outdir}/fastp" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.fastq.gz' ] } @@ -88,7 +88,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ path: { "${params.outdir}/fastp" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.fastq.gz' ] } @@ -104,7 +104,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ path: { "${params.outdir}/adapterremoval" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.fastq.gz' ] } @@ -123,7 +123,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ path: { "${params.outdir}/adapterremoval" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.fastq.gz' ] } @@ -132,7 +132,7 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ path: { "${params.outdir}/porechop" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.fastq.gz' ] } @@ -169,7 +169,7 @@ process { ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/malt/${meta.db_name}" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.{rma6,tab,text,sam,log}' ] } @@ -179,7 +179,7 @@ process { ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/kraken2/${meta.db_name}" }, - mode: 'copy', + mode: params.publish_dir_mode, pattern: '*.{fastq.gz,txt}' ] } From 8827b143a22b07c21c7cbe5776fb112015f5ae8b Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Tue, 5 Apr 2022 12:30:06 +0200 Subject: [PATCH 114/214] fix: disable publishing correctly --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index a910b8e..ce9597b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,7 +35,7 @@ process { } withName: UNTAR { - publishDir = null + publishDir = [enabled: false] } withName: FASTQC { From 018b9a8ea60afd0c5d123e4420fdccaecafbc9f7 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 6 Apr 2022 10:21:43 +0200 Subject: [PATCH 115/214] refactor: do not generally publish output --- conf/modules.config | 6 ------ 1 file changed, 6 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ce9597b..f9cfe3b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,12 +12,6 @@ process { - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - withName: SAMPLESHEET_CHECK { publishDir = [ path: { "${params.outdir}/pipeline_info" }, From 7ab3aa546b8e1d0388981757d259284bf67f5fc8 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 6 Apr 2022 10:27:11 +0200 Subject: [PATCH 116/214] refactor: use flags to publish reads --- conf/modules.config | 25 ++++++++++++++----------- nextflow.config | 2 ++ nextflow_schema.json | 8 ++++++++ 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f9cfe3b..b59850f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -28,10 +28,6 @@ process { ] } - withName: UNTAR { - publishDir = [enabled: false] - } - withName: FASTQC { ext.args = '--quiet' ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } @@ -64,7 +60,8 @@ process { publishDir = [ path: { "${params.outdir}/fastp" }, mode: params.publish_dir_mode, - pattern: '*.fastq.gz' + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads ] } @@ -83,7 +80,8 @@ process { publishDir = [ path: { "${params.outdir}/fastp" }, mode: params.publish_dir_mode, - pattern: '*.fastq.gz' + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads ] } @@ -99,7 +97,8 @@ process { publishDir = [ path: { "${params.outdir}/adapterremoval" }, mode: params.publish_dir_mode, - pattern: '*.fastq.gz' + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads ] } @@ -118,7 +117,8 @@ process { publishDir = [ path: { "${params.outdir}/adapterremoval" }, mode: params.publish_dir_mode, - pattern: '*.fastq.gz' + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads ] } @@ -127,7 +127,8 @@ process { publishDir = [ path: { "${params.outdir}/porechop" }, mode: params.publish_dir_mode, - pattern: '*.fastq.gz' + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads ] } @@ -141,7 +142,8 @@ process { publishDir = [ path: { "${params.outdir}/bbduk/" }, mode: params.publish_dir_mode, - pattern: '*.{fastq.gz,log}' + pattern: '*.{fastq.gz,log}', + enabled: params.save_complexityfiltered_reads ] } @@ -154,7 +156,8 @@ process { publishDir = [ path: { "${params.outdir}/prinseqplusplus/" }, mode: params.publish_dir_mode, - pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz,log}' + pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz,log}', + enabled: params.save_complexityfiltered_reads ] } diff --git a/nextflow.config b/nextflow.config index 
e6d6307..19cc823 100644 --- a/nextflow.config +++ b/nextflow.config @@ -64,6 +64,7 @@ params { shortread_clipmerge_adapter2 = null shortread_clipmerge_minlength = 15 longread_clip = false + save_preprocessed_reads = false // Complexity filtering shortread_complexityfilter = false @@ -73,6 +74,7 @@ params { shortread_complexityfilter_bbduk_mask = false shortread_complexityfilter_prinseqplusplus_mode = 'entropy' shortread_complexityfilter_prinseqplusplus_dustscore = 0.5 + save_complexityfiltered_reads = false // MALT diff --git a/nextflow_schema.json b/nextflow_schema.json index 6f6125e..6858409 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -308,6 +308,10 @@ "type": "integer", "default": 15 }, + "save_preprocessed_reads": { + "type": "boolean", + "default": false + }, "shortread_complexityfilter_tool": { "type": "string", "default": "bbduk" @@ -334,6 +338,10 @@ "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", "default": 0.5 + }, + "save_complexityfiltered_reads": { + "type": "boolean", + "default": false } } } From 26779a4420f5d71d55a7bb99ff85ee1863740401 Mon Sep 17 00:00:00 2001 From: sofstam Date: Thu, 7 Apr 2022 16:27:19 +0200 Subject: [PATCH 117/214] Remove db_name from nextflow.config --- conf/test.config | 4 ++-- nextflow.config | 1 - nextflow_schema.json | 7 +++++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/conf/test.config b/conf/test.config index 6e82300..d392306 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,10 +25,10 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' run_kraken2 = true - run_malt = true + run_malt = false run_metaphlan3 = true run_centrifuge = true shortread_clipmerge = true longread_clip = false - shortread_complexityfilter = true + shortread_complexityfilter = false } diff --git a/nextflow.config b/nextflow.config index 37f886f..b4a8d91 100644 --- a/nextflow.config +++ b/nextflow.config @@ -86,7 +86,6 @@ params { // centrifuge run_centrifuge = false - centrifuge_db_name = 'minigut_cf' centrifuge_save_unaligned = false centrifuge_save_aligned = false centrifuge_sam_format = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 2ed80ed..23eb83b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -358,7 +358,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -369,4 +372,4 @@ "default": false } } -} +} \ No newline at end of file From 3d45ac57aead82baa98a8f0ee8aa668775bf4021 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Thu, 7 Apr 2022 16:42:22 +0200 Subject: [PATCH 118/214] Prettier --- modules.json | 2 +- nextflow_schema.json | 26 +++++--------------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/modules.json b/modules.json index 5abebf6..7fbc65c 100644 --- a/modules.json +++ b/modules.json @@ -47,4 +47,4 @@ } } } -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 23eb83b..2b115eb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + 
"required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -310,10 +300,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -358,10 +345,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -372,4 +356,4 @@ "default": false } } -} \ No newline at end of file +} From 2dfe3b3cc1d3a08aace2f521e84fa2c9820c3d98 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli <91951607+sofstam@users.noreply.github.com> Date: Fri, 8 Apr 2022 10:00:04 +0200 Subject: [PATCH 119/214] Update conf/test.config Co-authored-by: James A. Fellows Yates --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index d392306..270ad95 100644 --- a/conf/test.config +++ b/conf/test.config @@ -30,5 +30,5 @@ params { run_centrifuge = true shortread_clipmerge = true longread_clip = false - shortread_complexityfilter = false + shortread_complexityfilter = true } From 48d28cf8d4ac889b80736c86b3ed97eb46554af2 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli <91951607+sofstam@users.noreply.github.com> Date: Fri, 8 Apr 2022 10:00:10 +0200 Subject: [PATCH 120/214] Update conf/test.config Co-authored-by: James A. 
Fellows Yates --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index 270ad95..6e82300 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,7 +25,7 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' run_kraken2 = true - run_malt = false + run_malt = true run_metaphlan3 = true run_centrifuge = true shortread_clipmerge = true From fd5ebea9a697b7cc2e97cc2c056f131b64848bae Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Fri, 8 Apr 2022 10:57:58 +0200 Subject: [PATCH 121/214] Remove old centrifuge module --- modules.json | 2 +- .../nf-core/modules/centrifuge/main.nf | 63 ---------------- .../nf-core/modules/centrifuge/meta.yml | 73 ------------------- 3 files changed, 1 insertion(+), 137 deletions(-) delete mode 100644 modules/nf-core/modules/nf-core/modules/centrifuge/main.nf delete mode 100644 modules/nf-core/modules/nf-core/modules/centrifuge/meta.yml diff --git a/modules.json b/modules.json index 7fbc65c..5abebf6 100644 --- a/modules.json +++ b/modules.json @@ -47,4 +47,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/nf-core/modules/centrifuge/main.nf b/modules/nf-core/modules/nf-core/modules/centrifuge/main.nf deleted file mode 100644 index 7eb566d..0000000 --- a/modules/nf-core/modules/nf-core/modules/centrifuge/main.nf +++ /dev/null @@ -1,63 +0,0 @@ -process CENTRIFUGE { - tag "$meta.id" - label 'process_high' - - conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' : - 'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }" - - input: - tuple val(meta), path(reads) - path db - val save_unaligned - val save_aligned - val sam_format - - output: - tuple val(meta), path('*report.txt') , emit: report - tuple val(meta), path('*results.txt') , emit: results - tuple val(meta), path('*kreport.txt') , emit: kreport - tuple val(meta), path('*.sam') , optional: true, emit: sam - tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_mapped - tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" - def db_name = db.toString().replace(".tar.gz","") - def unaligned = '' - def aligned = '' - if (meta.single_end) { - unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' - aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : '' - } else { - unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' - aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : '' - } - def sam_output = sam_format ? 
"--out-fmt 'sam'" : '' - """ - tar -xf $db - centrifuge \\ - -x $db_name \\ - -p $task.cpus \\ - $paired \\ - --report-file ${prefix}.report.txt \\ - -S ${prefix}.results.txt \\ - $unaligned \\ - $aligned \\ - $sam_output \\ - $args - centrifuge-kreport -x $db_name ${prefix}.results.txt > ${prefix}.kreport.txt - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') - END_VERSIONS - """ -} diff --git a/modules/nf-core/modules/nf-core/modules/centrifuge/meta.yml b/modules/nf-core/modules/nf-core/modules/centrifuge/meta.yml deleted file mode 100644 index 3adf0e2..0000000 --- a/modules/nf-core/modules/nf-core/modules/centrifuge/meta.yml +++ /dev/null @@ -1,73 +0,0 @@ -name: centrifuge -description: Classifies metagenomic sequence data -keywords: - - classify - - metagenomics - - fastq - - db -tools: - - centrifuge: - description: Centrifuge is a classifier for metagenomic sequences. - homepage: https://ccb.jhu.edu/software/centrifuge/ - documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml - doi: 10.1101/gr.210641.116 - licence: ["GPL v3"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. - - db: - type: directory - description: Centrifuge database in .tar.gz format - pattern: "*.tar.gz" - - save_unaligned: - type: value - description: If true unmapped fastq files are saved - - save_aligned: - type: value - description: If true mapped fastq files are saved -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - report: - type: file - description: | - File containing a classification summary - pattern: "*.{report.txt}" - - results: - type: file - description: | - File containing classification results - pattern: "*.{results.txt}" - - kreport: - type: file - description: | - File containing kraken-style report from centrifuge - out files. 
- pattern: "*.{kreport.txt}" - - fastq_unmapped: - type: file - description: Unmapped fastq files - pattern: "*.unmapped.fastq.gz" - - fastq_mapped: - type: file - description: Mapped fastq files - pattern: "*.mapped.fastq.gz" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@sofstam" - - "@jfy133" - - "@sateeshperi" From 63bc597daf008a4de9ed5e34c0b8d682f349a34a Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Fri, 8 Apr 2022 11:01:45 +0200 Subject: [PATCH 122/214] Prettier --- modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 5abebf6..7fbc65c 100644 --- a/modules.json +++ b/modules.json @@ -47,4 +47,4 @@ } } } -} \ No newline at end of file +} From 7b08c49cd6cb6471384a26c1202733cad0fe58ae Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 8 Apr 2022 11:54:54 +0200 Subject: [PATCH 123/214] Re-add run merging and gonna let GHA see if it works >.> --- .github/workflows/ci.yml | 1 + nextflow.config | 2 ++ workflows/taxprofiler.nf | 25 +++++++++++++++++++++++-- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 79148f0..7678645 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,6 +39,7 @@ jobs: - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs" - "--shortread_complexityfilter_tool bbduk" - "--shortread_complexityfilter_tool prinseq" + - "--run_merging" steps: - name: Check out pipeline code diff --git a/nextflow.config b/nextflow.config index 19cc823..1c69d36 100644 --- a/nextflow.config +++ b/nextflow.config @@ -76,6 +76,8 @@ params { shortread_complexityfilter_prinseqplusplus_dustscore = 0.5 save_complexityfiltered_reads = false + // run merging + run_merging = false // MALT run_malt = false diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 3b08402..61eda6e 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -125,13 +125,34 @@ workflow TAXPROFILER { ch_shortreads_filtered = ch_shortreads_preprocessed } + /* + STEP: Run merging + */ + + if ( params.run_merging ) { + ch_reads_for_cat = ch_shortreads_filtered + .mix( ch_longreads_preprocessed ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['run_accession'].remove() + [ meta_new, reads ] + } + .groupTuple() + + ch_reads_runmerged = CAT_FASTQ ( ch_reads_for_cat ) + + } else { + ch_reads_runmerged = ch_shortreads_filtered + .mix( ch_longreads_preprocessed ) + } + /* COMBINE READS WITH POSSIBLE DATABASES */ // e.g. 
output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] - ch_input_for_profiling = ch_shortreads_filtered - .mix( ch_longreads_preprocessed ) + ch_input_for_profiling = ch_reads_runmerged .combine(DB_CHECK.out.dbs) .branch { malt: it[2]['tool'] == 'malt' From 74c496f6af04e15e1625d7311791004a645b3a21 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 8 Apr 2022 11:58:16 +0200 Subject: [PATCH 124/214] Fix CAT_FASTQ output --- workflows/taxprofiler.nf | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 61eda6e..2d0c17d 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -140,8 +140,7 @@ workflow TAXPROFILER { } .groupTuple() - ch_reads_runmerged = CAT_FASTQ ( ch_reads_for_cat ) - + ch_reads_runmerged = CAT_FASTQ ( ch_reads_for_cat ).reads } else { ch_reads_runmerged = ch_shortreads_filtered .mix( ch_longreads_preprocessed ) @@ -247,6 +246,10 @@ workflow TAXPROFILER { ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } + if (params.run_merging){ + ch_versions = ch_versions.mix(CAT_FASTQ.out.versions) + } + if (params.run_kraken2) { ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) From 4d726a87e98f56bf1a4e0d52259d9e83f89539ce Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 8 Apr 2022 12:01:07 +0200 Subject: [PATCH 125/214] Fix metadata removal --- workflows/taxprofiler.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 2d0c17d..33d9725 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -135,7 +135,7 @@ workflow TAXPROFILER { .map { meta, reads -> def meta_new = meta.clone() - meta_new['run_accession'].remove() + meta_new.remove('run_accession') [ meta_new, reads ] } .groupTuple() From d130a72d74899c3fd85db3c1c751b7b6848fd031 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 8 Apr 2022 13:09:23 +0200 Subject: [PATCH 126/214] Get this working --- .github/workflows/ci.yml | 1 + conf/modules.config | 6 +++--- subworkflows/local/shortread_fastp.nf | 4 ++-- workflows/taxprofiler.nf | 18 ++++++++++++++++-- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7678645..53423cb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,6 +40,7 @@ jobs: - "--shortread_complexityfilter_tool bbduk" - "--shortread_complexityfilter_tool prinseq" - "--run_merging" + - "--run_merging --shortread_clipmerge_mergepairs" steps: - name: Check out pipeline code diff --git a/conf/modules.config b/conf/modules.config index b59850f..7602e3f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -163,7 +163,7 @@ process { withName: MALT_RUN { ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + ext.prefix = if params.run_merging : { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/malt/${meta.db_name}" }, mode: params.publish_dir_mode, @@ -173,7 +173,7 @@ process { withName: KRAKEN2_KRAKEN2 { ext.args = { "${meta.db_params}" } - ext.prefix = { 
"${meta.id}-${meta.run_accession}-${meta.db_name}" } + ext.prefix = if params.run_merging : { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/kraken2/${meta.db_name}" }, mode: params.publish_dir_mode, @@ -183,7 +183,7 @@ process { withName: METAPHLAN3 { publishDir = [ - path: { "${params.outdir}/metaphlan3/${meta.db_name}" }, + ext.prefix = if params.run_merging : { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } mode: params.publish_dir_mode, pattern: '*.{biom,txt}' ] diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index 18baf17..04057b1 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -28,8 +28,8 @@ workflow SHORTREAD_FASTP { .map { meta, reads -> def meta_new = meta.clone() - meta_new['single_end'] = 1 - [ meta_new, reads ] + meta_new['single_end'] = true + [ meta_new, reads.flatten() ] } ch_fastp_reads_prepped = ch_fastp_reads_prepped_pe.mix( FASTP_SINGLE.out.reads ) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 33d9725..e04d4d6 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -130,7 +130,8 @@ workflow TAXPROFILER { */ if ( params.run_merging ) { - ch_reads_for_cat = ch_shortreads_filtered + + ch_reads_for_cat_branch = ch_shortreads_filtered .mix( ch_longreads_preprocessed ) .map { meta, reads -> @@ -139,8 +140,21 @@ workflow TAXPROFILER { [ meta_new, reads ] } .groupTuple() + .map { + meta, reads -> + [ meta, reads.flatten() ] + } + .branch { + // we can't concate files if there is not a second run, we branch + // here to separate them out, and mix after + cat: ( it[0]['single_end'] && it[1].size() > 1 ) || ( !it[0]['single_end'] && it[1].size() > 2 ) + skip: true + } + + ch_reads_for_cat_branch.cat.dump(tag: "for_catting") + + ch_reads_runmerged = CAT_FASTQ ( ch_reads_for_cat_branch.cat ).reads.mix( ch_reads_for_cat_branch.skip ) - ch_reads_runmerged = CAT_FASTQ ( ch_reads_for_cat ).reads } else { ch_reads_runmerged = ch_shortreads_filtered .mix( ch_longreads_preprocessed ) From ca011ccc5b363ea9d6c1eaf44713a9c09e471f39 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 8 Apr 2022 13:28:37 +0200 Subject: [PATCH 127/214] Fix cat_fastq naming logic? --- conf/modules.config | 19 +++++++++++++++---- nextflow.config | 1 + nextflow_schema.json | 38 +++++++++++++++++++++++++++++--------- workflows/taxprofiler.nf | 4 ---- 4 files changed, 45 insertions(+), 17 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 7602e3f..97e9510 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -161,9 +161,19 @@ process { ] } + withName: CAT_FASTQ { + ext.prefix = { "${meta.id}-${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/run_merging/" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_runmerged_reads + ] + } + withName: MALT_RUN { ext.args = { "${meta.db_params}" } - ext.prefix = if params.run_merging : { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + ext.prefix = params.run_merging ? 
{ "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/malt/${meta.db_name}" }, mode: params.publish_dir_mode, @@ -173,7 +183,7 @@ process { withName: KRAKEN2_KRAKEN2 { ext.args = { "${meta.db_params}" } - ext.prefix = if params.run_merging : { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + ext.prefix = params.run_merging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/kraken2/${meta.db_name}" }, mode: params.publish_dir_mode, @@ -182,12 +192,13 @@ process { } withName: METAPHLAN3 { + ext.args = { "${meta.db_params}" } + ext.prefix = params.run_merging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ - ext.prefix = if params.run_merging : { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + path: { "${params.outdir}/metaphlan3/${meta.db_name}" }, mode: params.publish_dir_mode, pattern: '*.{biom,txt}' ] - ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } withName: CUSTOM_DUMPSOFTWAREVERSIONS { diff --git a/nextflow.config b/nextflow.config index 1c69d36..d969ed9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -78,6 +78,7 @@ params { // run merging run_merging = false + save_runmerged_reads = false // MALT run_malt = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 6858409..0b4b4fb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -288,7 +298,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -309,8 +322,7 @@ "default": 15 }, "save_preprocessed_reads": { - "type": "boolean", - "default": false + "type": "boolean" }, "shortread_complexityfilter_tool": { "type": "string", @@ -333,15 +345,23 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", "default": 0.5 }, "save_complexityfiltered_reads": { - "type": "boolean", - "default": false + "type": "boolean" + }, + "run_merging": { + "type": "boolean" + }, + "save_runmerged_reads": { + "type": "boolean" } } -} +} \ No newline at end of file diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index e04d4d6..bdb93ab 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -140,10 +140,6 @@ workflow TAXPROFILER { [ meta_new, reads ] } .groupTuple() - .map { - meta, reads -> - [ meta, reads.flatten() ] - } .branch { // we can't concate files if there is not a second run, we branch // here to separate them out, and mix after From a634814d848a68252d81b231e34942f5fa616c83 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 8 Apr 2022 13:34:00 +0200 Subject: [PATCH 128/214] Formatting and fix fastp output --- nextflow_schema.json | 26 ++++--------------- .../local/shortread_adapterremoval.nf | 2 +- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 0b4b4fb..64836df 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -298,10 +288,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -345,10 +332,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -364,4 +348,4 @@ "type": "boolean" } } -} \ No newline at end of file +} diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index b573be9..a7948e7 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -67,7 +67,7 @@ workflow SHORTREAD_ADAPTERREMOVAL { [meta_new, reads] } .groupTuple() - .map { meta, fastq -> [meta, fastq.flatten()] } + .map { meta, fastq -> [meta, [ fastq ].flatten()] } CAT_FASTQ(ch_concat_fastq) From 6c14f2b230a6c6df99e89c44ed8904cb0a5a7b59 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 8 Apr 2022 13:44:52 +0200 Subject: [PATCH 129/214] Remove the flattening? --- subworkflows/local/shortread_adapterremoval.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index a7948e7..735d3b8 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -67,7 +67,7 @@ workflow SHORTREAD_ADAPTERREMOVAL { [meta_new, reads] } .groupTuple() - .map { meta, fastq -> [meta, [ fastq ].flatten()] } + .map { meta, fastq -> [meta, fastq] } CAT_FASTQ(ch_concat_fastq) From 35cb6e042acf3fbe34c9664bebb9c27ef0d96179 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 8 Apr 2022 13:46:12 +0200 Subject: [PATCH 130/214] Flatten the right thing --- subworkflows/local/shortread_adapterremoval.nf | 2 +- subworkflows/local/shortread_fastp.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf index 735d3b8..b573be9 100644 --- a/subworkflows/local/shortread_adapterremoval.nf +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -67,7 +67,7 @@ workflow SHORTREAD_ADAPTERREMOVAL { [meta_new, reads] } .groupTuple() - .map { meta, fastq -> [meta, fastq] } + .map { meta, fastq -> [meta, fastq.flatten()] } CAT_FASTQ(ch_concat_fastq) diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index 04057b1..4626691 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -29,7 +29,7 @@ workflow SHORTREAD_FASTP { meta, reads -> def meta_new = meta.clone() meta_new['single_end'] = true - [ meta_new, reads.flatten() ] + [ meta_new, [ reads ].flatten() ] } ch_fastp_reads_prepped = ch_fastp_reads_prepped_pe.mix( FASTP_SINGLE.out.reads ) From 8839fe22b8856fb1d69e11e2e1e0a4367cb97dc3 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 8 Apr 2022 20:02:22 +0200 Subject: [PATCH 131/214] Fix output tuple for reads --- 
workflows/taxprofiler.nf | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index bdb93ab..7d5f60f 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -149,7 +149,15 @@ workflow TAXPROFILER { ch_reads_for_cat_branch.cat.dump(tag: "for_catting") - ch_reads_runmerged = CAT_FASTQ ( ch_reads_for_cat_branch.cat ).reads.mix( ch_reads_for_cat_branch.skip ) + ch_reads_runmerged = CAT_FASTQ ( ch_reads_for_cat_branch.cat ).reads + .mix( ch_reads_for_cat_branch.skip ) + .map { + meta, reads -> + + [ meta, [ reads ].flatten() ] + } + + ch_reads_runmerged.dump(tag: "ch_reads_runmerged" ) } else { ch_reads_runmerged = ch_shortreads_filtered From afb66e445fcb2f9c60f4152e1184f5f8600a843c Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 8 Apr 2022 21:36:59 +0200 Subject: [PATCH 132/214] Append pairment to ID at profiling to prevent multiqc-level filename crash --- workflows/taxprofiler.nf | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 7d5f60f..1da812d 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -147,8 +147,6 @@ workflow TAXPROFILER { skip: true } - ch_reads_for_cat_branch.cat.dump(tag: "for_catting") - ch_reads_runmerged = CAT_FASTQ ( ch_reads_for_cat_branch.cat ).reads .mix( ch_reads_for_cat_branch.skip ) .map { @@ -157,8 +155,6 @@ workflow TAXPROFILER { [ meta, [ reads ].flatten() ] } - ch_reads_runmerged.dump(tag: "ch_reads_runmerged" ) - } else { ch_reads_runmerged = ch_shortreads_filtered .mix( ch_longreads_preprocessed ) @@ -168,8 +164,15 @@ workflow TAXPROFILER { COMBINE READS WITH POSSIBLE DATABASES */ - // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] + // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], [ /2612.merged.fastq.gz ], ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] ch_input_for_profiling = ch_reads_runmerged + .map { + meta, reads -> + def meta_new = meta.clone() + pairtype = meta_new['single_end'] ? 
'_se' : '_pe' + meta_new['id'] = meta_new['id'] + pairtype + [meta_new, reads] + } .combine(DB_CHECK.out.dbs) .branch { malt: it[2]['tool'] == 'malt' From ecf0eea4f99f0601124661557b41c5092d507ca6 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 10 Apr 2022 06:43:30 +0200 Subject: [PATCH 133/214] Move profiling to subworkflow and standardise outputs --- conf/modules.config | 24 +++---- subworkflows/local/profiling.nf | 120 ++++++++++++++++++++++++++++++++ workflows/taxprofiler.nf | 97 ++------------------------ 3 files changed, 136 insertions(+), 105 deletions(-) create mode 100644 subworkflows/local/profiling.nf diff --git a/conf/modules.config b/conf/modules.config index de41e69..531bd5a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -167,7 +167,7 @@ process { publishDir = [ path: { "${params.outdir}/malt/${meta.db_name}" }, mode: params.publish_dir_mode, - pattern: '*.{rma6,tab,text,sam,log}' + pattern: '*.{log}' ] } @@ -177,7 +177,7 @@ process { publishDir = [ path: { "${params.outdir}/kraken2/${meta.db_name}" }, mode: params.publish_dir_mode, - pattern: '*.{fastq.gz,txt}' + pattern: '*.{txt}' ] } @@ -190,6 +190,16 @@ process { ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } + withName: CENTRIFUGE_CENTRIFUGE { + publishDir = [ + path: { "${params.outdir}/centrifuge/${meta.db_name}" }, + mode: params.publish_dir_mode, + pattern: '*.txt' + ] + ext.args = { "${meta.db_params}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -198,14 +208,4 @@ process { ] } - withName: CENTRIFUGE_CENTRIFUGE { - publishDir = [ - path: { "${params.outdir}/centrifuge/${meta.db_name}" }, - mode: params.publish_dir_mode, - pattern: '*.{fastq.gz,txt}' - ] - ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } - } - } diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf new file mode 100644 index 0000000..ac48d77 --- /dev/null +++ b/subworkflows/local/profiling.nf @@ -0,0 +1,120 @@ +// +// Run profiling +// + +include { MALT_RUN } from '../../modules/nf-core/modules/malt/run/main' +include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/modules/kraken2/kraken2/main' +include { CENTRIFUGE_CENTRIFUGE } from '../../modules/nf-core/modules/centrifuge/centrifuge/main' +include { METAPHLAN3 } from '../../modules/nf-core/modules/metaphlan3/main' + +workflow PROFILING { + take: + shortreads // [ [ meta ], [ reads ] ] + longreads // [ [ meta ], [ reads ] ] + databases // [ [ meta ], path ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + +/* + COMBINE READS WITH POSSIBLE DATABASES + */ + + // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] + ch_input_for_profiling = shortreads + .mix( longreads ) + .combine(databases) + .branch { + malt: it[2]['tool'] == 'malt' + kraken2: it[2]['tool'] == 'kraken2' + metaphlan3: it[2]['tool'] == 'metaphlan3' + centrifuge: it[2]['tool'] == 'centrifuge' + unknown: true + } + + /* + PREPARE PROFILER INPUT CHANNELS + */ + + // Each tool as a slightly different input structure and generally separate + // input channels for reads vs databases. 
We restructure the channel tuple + // for each tool and make liberal use of multiMap to keep reads/databases + // channel element order in sync with each other + + // MALT: We groupTuple to have all samples in one channel for MALT as database + // loading takes a long time, so we only want to run it once per database + // TODO document somewhere we only accept illumina short reads for MALT? + ch_input_for_malt = ch_input_for_profiling.malt + .filter { it[0]['instrument_platform'] == 'ILLUMINA' } + .map { + it -> + def temp_meta = [ id: it[2]['db_name']] + it[2] + def db = it[3] + [ temp_meta, it[1], db ] + } + .groupTuple(by: [0,2]) + .multiMap { + it -> + reads: [ it[0], it[1].flatten() ] + db: it[2] + } + + // All subsequent tools can easily run on a per-sample basis + + ch_input_for_kraken2 = ch_input_for_profiling.kraken2 + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + ch_input_for_centrifuge = ch_input_for_profiling.centrifuge + .dump(tag: "input for centrifuge") + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + + /* + RUN PROFILING + */ + + if ( params.run_malt ) { + MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) + } + + if ( params.run_kraken2 ) { + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + } + + if ( params.run_centrifuge ) { + CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format ) + ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) + } + + if ( params.run_metaphlan3 ) { + METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) + ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() ) + } + + + emit: + // TODO work out if there is enough standardisation of output to export as one? 
+ //output = ch_filtered_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 6b89c66..38afdda 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -44,6 +44,7 @@ include { DB_CHECK } from '../subworkflows/local/db_check' include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering' +include { PROFILING } from '../subworkflows/local/profiling' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -59,10 +60,6 @@ include { MULTIQC } from '../modules/nf-core/modules/multiqc include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' -include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' -include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' -include { CENTRIFUGE_CENTRIFUGE } from '../modules/nf-core/modules/centrifuge/centrifuge/main' -include { METAPHLAN3 } from '../modules/nf-core/modules/metaphlan3/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -127,88 +124,10 @@ workflow TAXPROFILER { } /* - COMBINE READS WITH POSSIBLE DATABASES + SUBWORKFLOW: PROFILING */ - // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] - ch_input_for_profiling = ch_shortreads_filtered - .mix( ch_longreads_preprocessed ) - .combine(DB_CHECK.out.dbs) - .branch { - malt: it[2]['tool'] == 'malt' - kraken2: it[2]['tool'] == 'kraken2' - metaphlan3: it[2]['tool'] == 'metaphlan3' - centrifuge: it[2]['tool'] == 'centrifuge' - unknown: true - } - - /* - PREPARE PROFILER INPUT CHANNELS - */ - - // We groupTuple to have all samples in one channel for MALT as database - // loading takes a long time, so we only want to run it once per database - // TODO document somewhere we only accept illumina short reads for MALT? 
- ch_input_for_malt = ch_input_for_profiling.malt - .filter { it[0]['instrument_platform'] == 'ILLUMINA' } - .map { - it -> - def temp_meta = [ id: it[2]['db_name']] + it[2] - def db = it[3] - [ temp_meta, it[1], db ] - } - .groupTuple(by: [0,2]) - .multiMap { - it -> - reads: [ it[0], it[1].flatten() ] - db: it[2] - } - - // We can run Kraken2 one-by-one sample-wise - ch_input_for_kraken2 = ch_input_for_profiling.kraken2 - .multiMap { - it -> - reads: [ it[0] + it[2], it[1] ] - db: it[3] - } - - // We can run centrifuge one-by-one sample-wise - ch_input_for_centrifuge = ch_input_for_profiling.centrifuge - .dump(tag: "input for centrifuge") - .multiMap { - it -> - reads: [ it[0] + it[2], it[1] ] - db: it[3] - } - - // - // RUN PROFILING - // - ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 - .multiMap { - it -> - reads: [it[0] + it[2], it[1]] - db: it[3] - } - - /* - MODULE: RUN PROFILING - */ - if ( params.run_malt ) { - MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) - } - - if ( params.run_kraken2 ) { - KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) - } - - if ( params.run_centrifuge ) { - CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format ) - } - - if ( params.run_metaphlan3 ) { - METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) - } + PROFILING ( ch_shortreads_filtered, ch_longreads_preprocessed, DB_CHECK.out.dbs ) /* MODULE: MultiQC @@ -244,17 +163,9 @@ workflow TAXPROFILER { ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } - if (params.run_kraken2) { - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) - } + ch_multiqc_files = ch_multiqc_files.mix( PROFILING.out.mqc ) - if (params.run_malt) { - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) - } - // TODO Versions for Karken/MALT not report? // TODO create multiQC module for metaphlan MULTIQC ( ch_multiqc_files.collect() From 082093f3dee43655b4911e5d81ba0eb0c693454c Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 10 Apr 2022 06:44:10 +0200 Subject: [PATCH 134/214] Prettier --- conf/modules.config | 8 ++++++++ workflows/taxprofiler.nf | 1 - 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 531bd5a..2f5710e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -208,4 +208,12 @@ process { ] } + withName: MULTIQC { + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + } diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 38afdda..4afdd6c 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -165,7 +165,6 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix( PROFILING.out.mqc ) - // TODO create multiQC module for metaphlan MULTIQC ( ch_multiqc_files.collect() From 80c08af11b28f89289bc0739fbd13da661b45644 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 10 Apr 2022 06:48:25 +0200 Subject: [PATCH 135/214] Mix in profiling versions into mainline versons channel --- workflows/taxprofiler.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 4afdd6c..237f1ea 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -128,6 +128,7 @@ workflow TAXPROFILER { */ PROFILING ( ch_shortreads_filtered, ch_longreads_preprocessed, DB_CHECK.out.dbs ) + ch_versions = ch_versions.mix( PROFILING.out.versions ) /* MODULE: MultiQC From 030099c559a5eeeee2a65c523a9e946748327b45 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 10 Apr 2022 07:26:20 +0200 Subject: [PATCH 136/214] A bit of clean up --- subworkflows/local/profiling.nf | 1 - subworkflows/local/shortread_fastp.nf | 2 +- workflows/taxprofiler.nf | 2 ++ 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index ac48d77..c74c583 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -70,7 +70,6 @@ workflow PROFILING { } ch_input_for_centrifuge = ch_input_for_profiling.centrifuge - .dump(tag: "input for centrifuge") .multiMap { it -> reads: [ it[0] + it[2], it[1] ] diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index 18baf17..9fb9425 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -10,7 +10,7 @@ workflow SHORTREAD_FASTP { reads // [[meta], [reads]] main: - ch_versions = Channel.empty() + ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() ch_input_for_fastp = reads diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 237f1ea..4b9f927 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -85,6 +85,7 @@ workflow TAXPROFILER { DB_CHECK ( ch_databases ) + ch_versions = ch_versions.mix(DB_CHECK.out.versions) /* MODULE: Run FastQC @@ -101,6 +102,7 @@ workflow TAXPROFILER { SUBWORKFLOW: PERFORM PREPROCESSING */ if ( params.shortread_clipmerge ) { + ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads } else { ch_shortreads_preprocessed = INPUT_CHECK.out.fastq From b892fd0be5b754cc361ab6d9c71f15b732353057 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 10 Apr 2022 22:17:44 +0200 Subject: [PATCH 137/214] Initial attempt at docs --- README.md | 32 ++++++++-------- assets/samplesheet.csv | 9 +++-- docs/usage.md | 84 +++++++++++++++++++++++++++++------------- 3 files changed, 80 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index e976f73..88f643b 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ -**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling against multiple profiling tools and databases and produces standardised output tables. 
+**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling with multiple profiling tools against multiple databases, and produces standardised output tables.

 The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!

@@ -32,20 +32,20 @@ On release, automated continuous integration tests run the pipeline on a full-si

 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
 2. Performs optional read pre-processing
-   - Adapter clipping and merging (short, and nanopore reads)
-   - Low complexity filtering
-   - Host read removal
+   - Adapter clipping and merging (short read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long read: [porechop](https://github.com/rrwick/Porechop))
+   - Low complexity filtering ([bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus))
+   - Host read removal ([BowTie2](http://bowtie-bio.sourceforge.net/bowtie2/))
    - Run merging
-3. Performs taxonomic profiling a choice of:
-   - Kraken2
-   - MetaPhlAn3
-   - MALT
-   - DIAMOND
-   - Centrifuge
-   - Kaiju
-   - mOTUs
+3. Performs taxonomic profiling via a choice of any or all of:
+   - [Kraken2](https://ccb.jhu.edu/software/kraken2/)
+   - [MetaPhlAn3](https://huttenhower.sph.harvard.edu/metaphlan/)
+   - [MALT](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/malt/)
+   - [DIAMOND](https://github.com/bbuchfink/diamond)
+   - [Centrifuge](https://ccb.jhu.edu/software/centrifuge/)
+   - [Kaiju](https://kaiju.binf.ku.dk/)
+   - [mOTUs](https://motu-tool.org/)
 4. Perform optional post-processing with:
-   - bracken
+   - [bracken](https://ccb.jhu.edu/software/bracken/)
 5. Standardises output tables
 6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))

@@ -70,10 +70,8 @@ On release, automated continuous integration tests run the pipeline on a full-si

 4. Start running your own analysis!

-
-
    ```console
-   nextflow run nf-core/taxprofiler --input samplesheet.csv --outdir --genome GRCh37 -profile
+   nextflow run nf-core/taxprofiler --input samplesheet.csv --databases database.csv --outdir --run_ --run_ -profile
    ```

 ## Documentation

@@ -86,7 +84,7 @@

 nf-core/taxprofiler was originally written by nf-core community.

 We thank the following people for their extensive assistance in the development of this pipeline:

-
+[James A. Fellows Yates](https://github.com/jfy133), [Moritz Beber](https://github.com/Midnighter), [Lauri Mesilaakso](https://github.com/ljmesi), [Sofia Stamouli](https://github.com/sofsam), [Maxime Borry](https://github.com/maxibor).
## Contributions and Support

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
index 5f653ab..82565b1 100644
--- a/assets/samplesheet.csv
+++ b/assets/samplesheet.csv
@@ -1,3 +1,6 @@
-sample,fastq_1,fastq_2
-SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz
-SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,
+sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
+2611,ERR5766174,ILLUMINA,,,///fasta/ERX5474930_ERR5766174_1.fa.gz
+2612,ERR5766176,ILLUMINA,///fastq/ERX5474932_ERR5766176_1.fastq.gz,///fastq/ERX5474932_ERR5766176_2.fastq.gz,
+2612,ERR5766180,ILLUMINA,///fastq/ERX5474936_ERR5766180_1.fastq.gz,,
+2613,ERR5766181,ILLUMINA,///fastq/ERX5474937_ERR5766181_1.fastq.gz,///fastq/ERX5474937_ERR5766181_2.fastq.gz,
+ERR3201952,ERR3201952,OXFORD_NANOPORE,///fastq/ERR3201952.fastq.gz,,
diff --git a/docs/usage.md b/docs/usage.md
index bd840a4..0091d57 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -8,56 +8,90 @@

-## Samplesheet input
+## Samplesheet inputs

-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below. Furthermore, nf-core/taxprofiler also requires a second comma-separated file of 4 columns with a header row as in the examples below.
+
+This samplesheet is then specified on the command line as follows:

 ```console
---input '[path to samplesheet file]'
+--input '[path to samplesheet file]' --databases '[path to database sheet file]'
 ```

 ### Multiple runs of the same sample

-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
+The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will processed reads before performing profiling. Below is an example for the same sample sequenced across 3 lanes:

 ```console
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
+sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
+2612,run1,ILLUMINA,2612_run1_R1.fq.gz,,
+2612,run2,ILLUMINA,2612_run2_R1.fq.gz,,
+2612,run3,ILLUMINA,2612_run3_R1.fq.gz,2612_run3_R2.fq.gz,
+
 ```

+> ⚠️ Runs of the sample sample sequenced on Illumina platforms with a combination of single and paired-end data will **not** be run-wise concatenated, unless pair-merging is specified. In the example above, `run3` will be profiled independently of `run1` and `run2` if pairs not merged.
+
 ### Full samplesheet

-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet.
The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
+The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 6 columns to match those defined in the table below.

-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
+A final samplesheet file consisting of both single- and paired-end data, as well as long-read FASTA files may look something like the one below. This is for 4 samples, where `2612` has been sequenced twice.

 ```console
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
-TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
+sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
+2611,ERR5766174,ILLUMINA,,,///fasta/ERX5474930_ERR5766174_1.fa.gz
+2612,ERR5766176,ILLUMINA,///fastq/ERX5474932_ERR5766176_1.fastq.gz,///fastq/ERX5474932_ERR5766176_2.fastq.gz,
+2612,ERR5766180,ILLUMINA,///fastq/ERX5474936_ERR5766180_1.fastq.gz,,
+2613,ERR5766181,ILLUMINA,///fastq/ERX5474937_ERR5766181_1.fastq.gz,///fastq/ERX5474937_ERR5766181_2.fastq.gz,
+ERR3201952,ERR3201952,OXFORD_NANOPORE,///fastq/ERR3201952.fastq.gz,,
 ```

-| Column | Description |
-| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1 or Nanopore reads. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
+| Column | Description |
+| --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample` | Unique sample name [required]. |
+| `run_accession` | Run ID or name unique for each (pairs of) file(s). Can also supply sample name again here, if only a single run was generated [required]. |
+| `instrument_platform` | Sequencing platform reads generated on, selected from the EBI ENA [controlled vocabulary](https://www.ebi.ac.uk/ena/portal/api/controlledVocab?field=instrument_platform) [required]. |
+| `fastq_1` | Path or URL to sequencing reads or, for Illumina paired-end data, R1 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if data in FASTA is specified. Cannot be combined with `fasta`. |
+| `fastq_2` | Path or URL to Illumina R2 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty for single-end data. Cannot be combined with `fasta`.
|
+| `fasta` | Path or URL to long reads or contigs in FASTA format. GZipped compressed files accepted. Can be left empty if data in FASTQ is specified. Cannot be combined with `fastq_1` or `fastq_2`. |

 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

+### Full database sheet
+
+nf-core/taxprofiler supports multiple databases being profiled in parallel for each tool. These databases, and specific parameters for each, can be specified in a 4 column comma-separated sheet.
+
+> ⚠️ nf-core/taxprofiler does not provide any databases by default, nor currently generates them for you. This must be performed manually by the user.
+
+An example database sheet can look as follows, where 4 tools are being used, and `malt` and `kraken2` will be used against two databases each.
+
+```console
+tool,db_name,db_params,db_path
+malt,malt85,-id 85,///malt/testdb-malt/
+malt,malt95,-id 90,///malt/testdb-malt.tar.gz
+kraken2,db1,,///kraken2/testdb-kraken2.tar.gz
+kraken2,db2,--quick,///kraken2/testdb-kraken2.tar.gz
+centrifuge,db1,,///centrifuge/minigut_cf.tar.gz
+metaphlan3,db1,,///metaphlan3/metaphlan_database/
+```
+
+Column specifications are as follows:
+
+| Column | Description |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tool` | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. |
+| `db_name` | A unique name of the particular database [required]. |
+| `db_params` | Any parameters of the given taxonomic profiler that you wish the tool to use when profiling against this specific database. Can be empty to use taxonomic profiler defaults. Must not be surrounded by quotes [required]. |
+| `db_path` | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required]. |
+
+> 💡 You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during profiling.
+
 ## Running the pipeline

 The typical command for running the pipeline is as follows:

 ```console
-nextflow run nf-core/taxprofiler --input samplesheet.csv --outdir --genome GRCh37 -profile docker
+nextflow run nf-core/taxprofiler --input samplesheet.csv --databases databases.csv --outdir -profile docker --run_ --run_
 ```

 This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.

 Note that the pipeline will create the following files in your working directory:

 ```console
 work            # Directory containing the nextflow working files
-                # Finished results in specified location (defined with --outdir)
+                # Finished results in specified location (defined with --outdir)
 .nextflow_log   # Log file from Nextflow
 # Other nextflow hidden files, eg. history of pipeline runs and old logs.
 ```

From b8b11fd065ee04ced28f586c505c2e180109b30e Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Mon, 11 Apr 2022 10:30:29 +0200
Subject: [PATCH 138/214] Apply suggestions from code review

Co-authored-by: Moritz E.
Beber --- README.md | 2 +- docs/usage.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 88f643b..f1d59d5 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ On release, automated continuous integration tests run the pipeline on a full-si - Low complexity filtering ([bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus)) - Host read removal ([BowTie2](http://bowtie-bio.sourceforge.net/bowtie2/)) - Run merging -3. Performs taxonomic profiling via a choice of any or all of: +3. Performs taxonomic profiling using one or more of: - [Kraken2](https://ccb.jhu.edu/software/kraken2/) - [MetaPhlAn3](https://huttenhower.sph.harvard.edu/metaphlan/) - [MALT](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/malt/) diff --git a/docs/usage.md b/docs/usage.md index 0091d57..239a55d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -20,7 +20,7 @@ This samplesheet is then specified on the command line as follows: ### Multiple runs of the same sample -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will processed reads before performing profiling. Below is an example for the same sample sequenced across 3 lanes: +The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will process reads before performing profiling. Below is an example for the same sample sequenced across 3 lanes: ```console sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta @@ -30,7 +30,7 @@ sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta ``` -> ⚠️ Runs of the sample sample sequenced on Illumina platforms with a combination of single and paired-end data will **not** be run-wise concatenated, unless pair-merging is specified. In the example above, `run3` will be profiled independently of `run1` and `run2` if pairs not merged. +> ⚠️ Runs of the same sample sequenced on Illumina platforms with a combination of single and paired-end data will **not** be run-wise concatenated, unless pair-merging is specified. In the example above, `run3` will be profiled independently of `run1` and `run2` if pairs are not merged. ### Full samplesheet @@ -61,7 +61,7 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p nf-core/taxprofiler supports multiple databases being profiled in parallel for each tool. These databases, and specific parameters for each, can be specified in a 4 column comma-separated sheet. -> ⚠️ nf-core/taxprofiler does not provide any databases by default, nor currently generates them for you. This must be performed manually by the user. +> ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user. An example database sheet can look as follows, where 4 tools are being used, and `malt` and `kraken2` will be used against two databases each. 
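The database sheet documented in the patch above is what ultimately populates the `DB_CHECK.out.dbs` channel that the `PROFILING` subworkflow combines with the reads (see the `.combine(databases)` and `.branch` logic in `subworkflows/local/profiling.nf`, introduced earlier in this series). Below is a minimal sketch of how such a sheet could be parsed into `[ db_meta, db_path ]` tuples of that shape. This is an illustrative assumption only: the pipeline's actual `DB_CHECK` implementation is not shown in these patches, and the `ch_databases` channel name here is hypothetical.

```nextflow
// Hypothetical sketch: parse the database sheet into [ db_meta, db_path ] tuples.
// db_meta mirrors the keys PROFILING branches on: tool, db_name, db_params.
workflow {
    ch_databases = Channel
        .fromPath(params.databases)                 // the sheet passed via --databases
        .splitCsv(header: true, sep: ',')           // one map per row, keyed by the header
        .map { row ->
            def db_meta = [ tool: row.tool, db_name: row.db_name, db_params: row.db_params ]
            [ db_meta, file(row.db_path) ]          // db_path may be a directory or a .tar.gz archive
        }

    ch_databases.view()
}
```

With tuples of this shape, `reads.combine(ch_databases)` emits `[ meta, reads, db_meta, db_path ]`, which is why the branch conditions in `PROFILING` test `it[2]['tool']` and pass `it[3]` on as the database.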
From 29073e3c93e1ea5a4221ca8ec2080547716b503c Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 11 Apr 2022 13:38:42 +0200 Subject: [PATCH 139/214] Update CI tests --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 79148f0..5e57889 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,6 +39,7 @@ jobs: - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs" - "--shortread_complexityfilter_tool bbduk" - "--shortread_complexityfilter_tool prinseq" + - "--shortread_complexityfilter false --shortread_hostremoval" steps: - name: Check out pipeline code From 78182c2a8ca72d17b3a1004f100180d44439db42 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 11 Apr 2022 13:40:24 +0200 Subject: [PATCH 140/214] Add comma --- workflows/taxprofiler.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index abb330d..95e3588 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -125,7 +125,7 @@ workflow TAXPROFILER { } else { ch_longreads_preprocessed = INPUT_CHECK.out.nanopore } - + /* SUBWORKFLOW: COMPLEXITY FILTERING */ @@ -146,12 +146,12 @@ workflow TAXPROFILER { } else { ch_shortreads_hostremoved = ch_shortreads_filtered } - + /* SUBWORKFLOW: PROFILING */ - PROFILING ( ch_shortreads_hostremoved ch_longreads_preprocessed, DB_CHECK.out.dbs ) + PROFILING ( ch_shortreads_hostremoved, ch_longreads_preprocessed, DB_CHECK.out.dbs ) ch_versions = ch_versions.mix( PROFILING.out.versions ) /* @@ -191,7 +191,7 @@ workflow TAXPROFILER { if (params.shortread_hostremoval) { ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_HOSTREMOVAL.out.mqc.collect{it[1]}.ifEmpty([])) } - + ch_multiqc_files = ch_multiqc_files.mix( PROFILING.out.mqc ) // TODO create multiQC module for metaphlan From 989dc55ce107ae01331727eb9678a0d41e22de03 Mon Sep 17 00:00:00 2001 From: sofstam Date: Mon, 11 Apr 2022 17:42:38 +0200 Subject: [PATCH 141/214] Delete extra nf-core directory --- nf-core/modules/centrifuge/centrifuge/main.nf | 61 ----------------- .../modules/centrifuge/centrifuge/meta.yml | 66 ------------------- 2 files changed, 127 deletions(-) delete mode 100644 nf-core/modules/centrifuge/centrifuge/main.nf delete mode 100644 nf-core/modules/centrifuge/centrifuge/meta.yml diff --git a/nf-core/modules/centrifuge/centrifuge/main.nf b/nf-core/modules/centrifuge/centrifuge/main.nf deleted file mode 100644 index 3d23fc9..0000000 --- a/nf-core/modules/centrifuge/centrifuge/main.nf +++ /dev/null @@ -1,61 +0,0 @@ -process CENTRIFUGE_CENTRIFUGE { - tag "$meta.id" - label 'process_high' - - conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' : - 'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }" - - input: - tuple val(meta), path(reads) - path db - val save_unaligned - val save_aligned - val sam_format - - output: - tuple val(meta), path('*report.txt') , emit: report - tuple val(meta), path('*results.txt') , emit: results - tuple val(meta), path('*.sam') , optional: true, emit: sam - tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_mapped - tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" - def unaligned = '' - def aligned = '' - if (meta.single_end) { - unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' - aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : '' - } else { - unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' - aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : '' - } - def sam_output = sam_format ? "--out-fmt 'sam'" : '' - """ - ## we add "-no-name ._" to ensure silly Mac OSX metafiles files aren't included - db_name=`find -L ${db} -name "*.1.cf" -not -name "._*" | sed 's/.1.cf//'` - centrifuge \\ - -x \$db_name \\ - -p $task.cpus \\ - $paired \\ - --report-file ${prefix}.report.txt \\ - -S ${prefix}.results.txt \\ - $unaligned \\ - $aligned \\ - $sam_output \\ - $args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') - END_VERSIONS - """ -} diff --git a/nf-core/modules/centrifuge/centrifuge/meta.yml b/nf-core/modules/centrifuge/centrifuge/meta.yml deleted file mode 100644 index a252c00..0000000 --- a/nf-core/modules/centrifuge/centrifuge/meta.yml +++ /dev/null @@ -1,66 +0,0 @@ -name: centrifuge_centrifuge -description: Classifies metagenomic sequence data -keywords: - - classify - - metagenomics - - fastq - - db -tools: - - centrifuge: - description: Centrifuge is a classifier for metagenomic sequences. - homepage: https://ccb.jhu.edu/software/centrifuge/ - documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml - doi: 10.1101/gr.210641.116 - licence: ["GPL v3"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. - - db: - type: directory - description: Path to directory containing centrifuge database files - - save_unaligned: - type: value - description: If true unmapped fastq files are saved - - save_aligned: - type: value - description: If true mapped fastq files are saved -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - report: - type: file - description: | - File containing a classification summary - pattern: "*.{report.txt}" - - results: - type: file - description: | - File containing classification results - pattern: "*.{results.txt}" - - fastq_unmapped: - type: file - description: Unmapped fastq files - pattern: "*.unmapped.fastq.gz" - - fastq_mapped: - type: file - description: Mapped fastq files - pattern: "*.mapped.fastq.gz" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@sofstam" - - "@jfy133" - - "@sateeshperi" From a5f4fc42d53fce99863a3d135b017b9e468722be Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 12 Apr 2022 09:25:27 +0200 Subject: [PATCH 142/214] Fix run merging for unmerged PE data --- conf/modules.config | 2 +- workflows/taxprofiler.nf | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 97e9510..eb448bb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -162,7 +162,7 @@ process { } withName: CAT_FASTQ { - ext.prefix = { "${meta.id}-${meta.run_accession}" } + ext.prefix = { "${meta.id}" } publishDir = [ path: { "${params.outdir}/run_merging/" }, mode: params.publish_dir_mode, diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 1da812d..2eb7e8c 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -139,7 +139,11 @@ workflow TAXPROFILER { meta_new.remove('run_accession') [ meta_new, reads ] } - .groupTuple() + .groupTuple(by: 0) + .map { + meta, reads -> + [ meta, reads.flatten() ] + } .branch { // we can't concate files if there is not a second run, we branch // here to separate them out, and mix after @@ -151,7 +155,6 @@ workflow TAXPROFILER { .mix( ch_reads_for_cat_branch.skip ) .map { meta, reads -> - [ meta, [ reads ].flatten() ] } From 9f221f84cc66ca121a46d8f68db8f727dec523c9 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 12 Apr 2022 10:12:17 +0200 Subject: [PATCH 143/214] Only supply single input channel to profiling, as these are merged into single input channel at run_merging --- subworkflows/local/profiling.nf | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index c74c583..07b6b72 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -9,8 +9,7 @@ include { METAPHLAN3 } from '../../modules/nf-core/modules/meta workflow PROFILING { take: - shortreads // [ [ meta ], [ reads ] ] - longreads // [ [ meta ], [ reads ] ] + reads // [ [ meta ], [ reads ] ] databases // [ [ meta ], path ] main: @@ -22,9 +21,9 @@ workflow PROFILING { */ // e.g. 
output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] - ch_input_for_profiling = shortreads - .mix( longreads ) + ch_input_for_profiling = reads .combine(databases) + .dump(tag: "combined_withdbs") .branch { malt: it[2]['tool'] == 'malt' kraken2: it[2]['tool'] == 'kraken2' From 2ef21c6ef3e46ae824ad5c99970b8a5250ef8e38 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 12 Apr 2022 10:14:05 +0200 Subject: [PATCH 144/214] Fix input to profiling --- subworkflows/local/profiling.nf | 1 - workflows/taxprofiler.nf | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 07b6b72..ac744aa 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -23,7 +23,6 @@ workflow PROFILING { // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] ch_input_for_profiling = reads .combine(databases) - .dump(tag: "combined_withdbs") .branch { malt: it[2]['tool'] == 'malt' kraken2: it[2]['tool'] == 'kraken2' diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 0916cac..7c02f4c 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -167,7 +167,7 @@ workflow TAXPROFILER { SUBWORKFLOW: PROFILING */ - PROFILING ( ch_reads_runmerged, ch_longreads_preprocessed, DB_CHECK.out.dbs ) + PROFILING ( ch_reads_runmerged, DB_CHECK.out.dbs ) ch_versions = ch_versions.mix( PROFILING.out.versions ) /* From 26399718b2440499f294ebc59d669dd78dd7cdb6 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 12 Apr 2022 10:46:03 +0200 Subject: [PATCH 145/214] Re-add pairment attachment --- subworkflows/local/profiling.nf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index ac744aa..b03b83e 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -23,6 +23,12 @@ workflow PROFILING { // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] ch_input_for_profiling = reads .combine(databases) + meta, reads -> + def meta_new = meta.clone() + pairtype = meta_new['single_end'] ? '_se' : '_pe' + meta_new['id'] = meta_new['id'] + pairtype + [meta_new, reads] + } .branch { malt: it[2]['tool'] == 'malt' kraken2: it[2]['tool'] == 'kraken2' From 8d689141924a52cf72e666e76db1b575d541a341 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 12 Apr 2022 10:47:44 +0200 Subject: [PATCH 146/214] Re-add operator name --- subworkflows/local/profiling.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index b03b83e..8a156c2 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -23,6 +23,7 @@ workflow PROFILING { // e.g. 
output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] ch_input_for_profiling = reads .combine(databases) + .map { meta, reads -> def meta_new = meta.clone() pairtype = meta_new['single_end'] ? '_se' : '_pe' From a15c45b00cd18e2148baf7112f15c340261acdef Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 12 Apr 2022 10:51:27 +0200 Subject: [PATCH 147/214] Put map in the rigt place --- subworkflows/local/profiling.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 8a156c2..18de739 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -22,7 +22,6 @@ workflow PROFILING { // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] ch_input_for_profiling = reads - .combine(databases) .map { meta, reads -> def meta_new = meta.clone() @@ -30,6 +29,7 @@ workflow PROFILING { meta_new['id'] = meta_new['id'] + pairtype [meta_new, reads] } + .combine(databases) .branch { malt: it[2]['tool'] == 'malt' kraken2: it[2]['tool'] == 'kraken2' From 16a3556bfcfddf8195bbcd486653325aaa1de4ee Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 13 Apr 2022 08:26:08 +0200 Subject: [PATCH 148/214] Changes after code review --- workflows/taxprofiler.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 7c02f4c..58671b3 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -139,15 +139,16 @@ workflow TAXPROFILER { meta_new.remove('run_accession') [ meta_new, reads ] } - .groupTuple(by: 0) + .groupTuple() .map { meta, reads -> [ meta, reads.flatten() ] } .branch { + meta, reads -> // we can't concatenate files if there is not a second run, we branch // here to separate them out, and mix back in after for efficiency - cat: ( it[0]['single_end'] && it[1].size() > 1 ) || ( !it[0]['single_end'] && it[1].size() > 2 ) + cat: ( meta.single_end && reads.size() > 1 ) || ( !meta.single_end && reads.size() > 2 ) skip: true } From de5734052601ce6bedd7be2389fef4bacd0affb1 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 13 Apr 2022 11:49:35 +0200 Subject: [PATCH 149/214] Start work --- modules.json | 5 +- .../nf-core/modules/megan/rma2info/main.nf | 38 ++++++++++++++ .../nf-core/modules/megan/rma2info/meta.yml | 51 +++++++++++++++++++ nextflow.config | 1 + subworkflows/local/profiling.nf | 22 ++++---- .../local/shortread_postprocessing.nf | 39 ++++++++++++++ 6 files changed, 146 insertions(+), 10 deletions(-) create mode 100644 modules/nf-core/modules/megan/rma2info/main.nf create mode 100644 modules/nf-core/modules/megan/rma2info/meta.yml create mode 100644 subworkflows/local/shortread_postprocessing.nf diff --git a/modules.json b/modules.json index 7fbc65c..e921454 100644 --- a/modules.json +++ b/modules.json @@ -30,6 +30,9 @@ "malt/run": { "git_sha": "72b96f4e504eef673f2b5c13560a9d90b669129b" }, + "megan/rma2info": { + "git_sha": "2d38566eca4cc15142b2ffa7c11837569b39aece" + }, "metaphlan3": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, @@ -47,4 +50,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/megan/rma2info/main.nf 
b/modules/nf-core/modules/megan/rma2info/main.nf new file mode 100644 index 0000000..80d1975 --- /dev/null +++ b/modules/nf-core/modules/megan/rma2info/main.nf @@ -0,0 +1,38 @@ +process MEGAN_RMA2INFO { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::megan=6.21.7" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/megan:6.21.7--h9ee0642_0': + 'quay.io/biocontainers/megan:6.21.7--h9ee0642_0' }" + + input: + tuple val(meta), path(rma6) + val(megan_summary) + + output: + tuple val(meta), path("*.txt.gz") , emit: txt + tuple val(meta), path("*.megan"), optional: true, emit: megan_summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def summary = megan_summary ? "-es ${prefix}.megan" : "" + """ + rma2info \\ + -i ${rma6} \\ + -o ${prefix}.txt.gz \\ + ${summary} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + megan: \$(echo \$(rma2info 2>&1) | grep version | sed 's/.*version //g;s/, built.*//g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/megan/rma2info/meta.yml b/modules/nf-core/modules/megan/rma2info/meta.yml new file mode 100644 index 0000000..0f2d5a9 --- /dev/null +++ b/modules/nf-core/modules/megan/rma2info/meta.yml @@ -0,0 +1,51 @@ +name: "megan_rma2info" +description: Analyses an RMA file and exports information in text format +keywords: + - megan + - rma6 + - classification + - conversion +tools: + - "megan": + description: "A tool for studying the taxonomic content of a set of DNA reads" + homepage: "https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/" + documentation: "https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html" + tool_dev_url: "https://github.com/husonlab/megan-ce" + doi: "10.1371/journal.pcbi.1004957" + licence: "['GPL >=3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - rma6: + type: file + description: RMA6 file from MEGAN or MALT + pattern: "*.rma6" + - megan_summary: + type: boolean + description: Specify whether to generate an MEGAN summary file + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: Compressed text file + pattern: "*.txt.gz" + - megan_summary: + type: file + description: Optionally generated MEGAN summary file + pattern: "*.megan" + +authors: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index b4a8d91..8ddf365 100644 --- a/nextflow.config +++ b/nextflow.config @@ -89,6 +89,7 @@ params { centrifuge_save_unaligned = false centrifuge_save_aligned = false centrifuge_sam_format = false + // metaphlan3 run_metaphlan3 = false } diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index c74c583..59b0dc0 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -14,8 +14,9 @@ workflow PROFILING { databases // [ [ meta ], path ] main: - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + ch_raw_profiles = Channel.empty() /* COMBINE READS WITH POSSIBLE DATABASES @@ -89,30 +90,33 @@ workflow PROFILING { if ( params.run_malt ) { MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( MALT_RUN.out.rma6 ) } if ( params.run_kraken2 ) { KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.txt ) } if ( params.run_centrifuge ) { CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format ) - ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) + ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( CENTRIFUGE_CENTRIFUGE.out.report ) } if ( params.run_metaphlan3 ) { METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN3.out.biom ) } emit: - // TODO work out if there is enough standardisation of output to export as one? 
- //output = ch_filtered_reads // channel: [ val(meta), [ reads ] ] + profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] versions = ch_versions // channel: [ versions.yml ] mqc = ch_multiqc_files } diff --git a/subworkflows/local/shortread_postprocessing.nf b/subworkflows/local/shortread_postprocessing.nf new file mode 100644 index 0000000..7fb0d70 --- /dev/null +++ b/subworkflows/local/shortread_postprocessing.nf @@ -0,0 +1,39 @@ +// +// Perform read trimming and merging +// + + +include { SHORTREAD_FASTP } from './shortread_fastp' +include { SHORTREAD_ADAPTERREMOVAL } from './shortread_adapterremoval' +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main' + +workflow SHORTREAD_POSTPROCESSING { + take: + input // [ [ meta ], [ taxon_table/file ] ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( params.shortread_clipmerge_tool == "fastp" ) { + ch_processed_reads = SHORTREAD_FASTP ( reads ).reads + ch_versions = ch_versions.mix( SHORTREAD_FASTP.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc ) + } else if ( params.shortread_clipmerge_tool == "adapterremoval" ) { + ch_processed_reads = SHORTREAD_ADAPTERREMOVAL ( reads ).reads + ch_versions = ch_versions.mix( SHORTREAD_ADAPTERREMOVAL.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_ADAPTERREMOVAL.out.mqc ) + } else { + ch_processed_reads = reads + } + + FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + + emit: + output = output // channel: [ val(meta), taxon_table ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + From dfeaa0d1fe79a6f59c27bdb481904b3bbb620234 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 13 Apr 2022 12:00:28 +0200 Subject: [PATCH 150/214] Rename subworkflow parameters for consistency --- .github/workflows/ci.yml | 8 ++++---- conf/modules.config | 6 +++--- conf/test.config | 6 +++--- nextflow.config | 11 ++++++----- nextflow_schema.json | 22 +++++++++++----------- workflows/taxprofiler.nf | 16 ++++++++-------- 6 files changed, 35 insertions(+), 34 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 53423cb..c373bc8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,8 +29,8 @@ jobs: - NXF_VER: "" NXF_EDGE: "1" parameters: - - "--longread_clip false" - - "--shortread_clip false" + - "--perform_longread_clip false" + - "--perform_shortread_clipmerge false" - "--shortread_clipmerge_tool fastp" - "--shortread_clipmerge_tool fastp --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged" - "--shortread_clipmerge_tool fastp --shortread_clipmerge_mergepairs" @@ -39,8 +39,8 @@ jobs: - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs" - "--shortread_complexityfilter_tool bbduk" - "--shortread_complexityfilter_tool prinseq" - - "--run_merging" - - "--run_merging --shortread_clipmerge_mergepairs" + - "--perform_runmerging" + - "--perform_runmerging --shortread_clipmerge_mergepairs" steps: - name: Check out pipeline code diff --git a/conf/modules.config b/conf/modules.config index d93486f..42528de 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -173,7 +173,7 @@ process { withName: MALT_RUN { ext.args = { "${meta.db_params}" } - ext.prefix = params.run_merging ? 
{ "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/malt/${meta.db_name}" }, mode: params.publish_dir_mode, @@ -183,7 +183,7 @@ process { withName: KRAKEN2_KRAKEN2 { ext.args = { "${meta.db_params}" } - ext.prefix = params.run_merging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/kraken2/${meta.db_name}" }, mode: params.publish_dir_mode, @@ -193,7 +193,7 @@ process { withName: METAPHLAN3 { ext.args = { "${meta.db_params}" } - ext.prefix = params.run_merging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/metaphlan3/${meta.db_name}" }, mode: params.publish_dir_mode, diff --git a/conf/test.config b/conf/test.config index 6e82300..923dda7 100644 --- a/conf/test.config +++ b/conf/test.config @@ -28,7 +28,7 @@ params { run_malt = true run_metaphlan3 = true run_centrifuge = true - shortread_clipmerge = true - longread_clip = false - shortread_complexityfilter = true + perform_shortread_clipmerge = true + perform_longread_clip = false + perform_shortread_complexityfilter = true } diff --git a/nextflow.config b/nextflow.config index da8bbdb..b72b4f9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -55,7 +55,7 @@ params { databases = null // FASTQ preprocessing - shortread_clipmerge = false + perform_shortread_clipmerge = false shortread_clipmerge_tool = 'fastp' shortread_clipmerge_skipadaptertrim = false shortread_clipmerge_mergepairs = false @@ -63,11 +63,11 @@ params { shortread_clipmerge_adapter1 = null shortread_clipmerge_adapter2 = null shortread_clipmerge_minlength = 15 - longread_clip = false + perform_longread_clip = false save_preprocessed_reads = false // Complexity filtering - shortread_complexityfilter = false + perform_shortread_complexityfilter = false shortread_complexityfilter_tool = 'bbduk' shortread_complexityfilter_entropy = 0.3 shortread_complexityfilter_bbduk_windowsize = 50 @@ -77,8 +77,8 @@ params { save_complexityfiltered_reads = false // run merging - run_merging = false - save_runmerged_reads = false + perform_runmerging = false + save_runmerged_reads = false // MALT run_malt = false @@ -92,6 +92,7 @@ params { centrifuge_save_unaligned = false centrifuge_save_aligned = false centrifuge_sam_format = false + // metaphlan3 run_metaphlan3 = false } diff --git a/nextflow_schema.json b/nextflow_schema.json index 4db7fa0..06bd94b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -262,15 +262,9 @@ "type": "string", "default": "None" }, - "shortread_clipmerge": { - "type": "boolean" - }, "shortread_clipmerge_excludeunmerged": { "type": "boolean" }, - "longread_clip": { - "type": "boolean" - }, "run_malt": { "type": "boolean" }, @@ -334,9 +328,6 @@ "shortread_complexityfilter_bbduk_mask": { "type": "boolean" }, - "shortread_complexityfilter": { - "type": "boolean" - }, "shortread_complexityfilter_entropy": { "type": "number", "default": 0.3 @@ -353,10 +344,19 @@ "save_complexityfiltered_reads": { "type": "boolean" }, - 
"run_merging": { + "save_runmerged_reads": { "type": "boolean" }, - "save_runmerged_reads": { + "perform_shortread_clipmerge": { + "type": "boolean" + }, + "perform_longread_clip": { + "type": "boolean" + }, + "perform_shortread_complexityfilter": { + "type": "boolean" + }, + "perform_runmerging": { "type": "boolean" } } diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 58671b3..f086557 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -101,14 +101,14 @@ workflow TAXPROFILER { /* SUBWORKFLOW: PERFORM PREPROCESSING */ - if ( params.shortread_clipmerge ) { + if ( params.perform_shortread_clipmerge ) { ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads } else { ch_shortreads_preprocessed = INPUT_CHECK.out.fastq } - if ( params.longread_clip ) { + if ( params.perform_longread_clip ) { ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads .map { it -> [ it[0], [it[1]] ] } } else { @@ -119,7 +119,7 @@ workflow TAXPROFILER { SUBWORKFLOW: COMPLEXITY FILTERING */ - if ( params.shortread_complexityfilter ) { + if ( params.perform_shortread_complexityfilter ) { ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads } else { ch_shortreads_filtered = ch_shortreads_preprocessed @@ -129,7 +129,7 @@ workflow TAXPROFILER { STEP: Run merging */ - if ( params.run_merging ) { + if ( params.perform_runmerging ) { ch_reads_for_cat_branch = ch_shortreads_filtered .mix( ch_longreads_preprocessed ) @@ -190,22 +190,22 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) - if (params.shortread_clipmerge) { + if (params.perform_shortread_clipmerge) { ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) } - if (params.longread_clip) { + if (params.perform_longread_clip) { ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions ) } - if (params.shortread_complexityfilter){ + if (params.perform_shortread_complexityfilter){ ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } - if (params.run_merging){ + if (params.perform_runmerging){ ch_versions = ch_versions.mix(CAT_FASTQ.out.versions) } From aa2d07c42ae4e451711a4147264782d567576e88 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 13 Apr 2022 14:19:19 +0200 Subject: [PATCH 151/214] Fix merge cockup --- .github/workflows/ci.yml | 2 +- conf/test.config | 2 +- nextflow.config | 2 +- nextflow_schema.json | 2 +- workflows/taxprofiler.nf | 13 +++++-------- 5 files changed, 9 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b18e601..a1ece72 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,7 +41,7 @@ jobs: - "--shortread_complexityfilter_tool prinseq" - "--perform_runmerging" - "--perform_runmerging --shortread_clipmerge_mergepairs" - - "--shortread_complexityfilter false --shortread_hostremoval" + - "--shortread_complexityfilter false --perform_shortread_hostremoval" steps: - name: Check out pipeline code diff --git a/conf/test.config 
b/conf/test.config index 1d08d91..8f12312 100644 --- a/conf/test.config +++ b/conf/test.config @@ -31,6 +31,6 @@ params { perform_shortread_clipmerge = true perform_longread_clip = false perform_shortread_complexityfilter = true - shortread_hostremoval = true + perform_shortread_hostremoval = true shortread_hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' } diff --git a/nextflow.config b/nextflow.config index 6b0a79d..94c5837 100644 --- a/nextflow.config +++ b/nextflow.config @@ -81,7 +81,7 @@ params { save_runmerged_reads = false // Host Removal - shortread_hostremoval = false + perform_shortread_hostremoval = false shortread_hostremoval_reference = null shortread_hostremoval_index = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 07295e3..cf0edab 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -359,7 +359,7 @@ "perform_runmerging": { "type": "boolean" }, - "shortread_hostremoval": { + "perform_shortread_hostremoval": { "type": "boolean" }, "shortread_hostremoval_reference": { diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 894a1e1..9fe8cc8 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -22,11 +22,10 @@ if (params.databases) { ch_databases = file(params.databases) } else { exit 1, ' if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "[nf-core/taxprofiler] error: cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" -// TODO Add check if index but no reference exit 1 -if (params.shortread_hostremoval && !params.shortread_hostremoval_reference) { exit 1, "[nf-core/taxprofiler] error: --shortread_hostremoval requested but no --shortread_hostremoval_reference FASTA supplied. Check input." } +if (params.perform_shortread_hostremoval && !params.shortread_hostremoval_reference) { exit 1, "[nf-core/taxprofiler] error: --shortread_hostremoval requested but no --shortread_hostremoval_reference FASTA supplied. Check input." } if (!params.shortread_hostremoval_reference && params.shortread_hostremoval_reference_index) { exit 1, "[nf-core/taxprofiler] error: --shortread_hostremoval_index provided but no --shortread_hostremoval_reference FASTA supplied. Check input." 
} -if (params.shortread_hostremoval_reference ) { ch_reference = file(params.shortread_hostremoval_reference) } else { ch_reference = [] } +if (params.shortread_hostremoval_reference ) { ch_reference = file(params.shortread_hostremoval_reference) } if (params.shortread_hostremoval_index ) { ch_reference_index = file(params.shortread_hostremoval_index ) } else { ch_reference_index = [] } /* @@ -140,15 +139,13 @@ workflow TAXPROFILER { SUBWORKFLOW: HOST REMOVAL */ - if ( params.shortread_hostremoval ) { + if ( params.perform_shortread_hostremoval ) { ch_shortreads_hostremoved = SHORTREAD_HOSTREMOVAL ( ch_shortreads_filtered, ch_reference, ch_reference_index ).reads ch_versions = ch_versions.mix(SHORTREAD_HOSTREMOVAL.out.versions.first()) } else { ch_shortreads_hostremoved = ch_shortreads_filtered } - */ - if ( params.perform_runmerging ) { ch_reads_for_cat_branch = ch_shortreads_hostremoved @@ -225,11 +222,11 @@ workflow TAXPROFILER { ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } - if (params.shortread_hostremoval) { + if (params.perform_shortread_hostremoval) { ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_HOSTREMOVAL.out.mqc.collect{it[1]}.ifEmpty([])) ch_versions = ch_versions.mix(SHORTREAD_HOSTREMOVAL.out.versions) } - + if (params.perform_runmerging){ ch_versions = ch_versions.mix(CAT_FASTQ.out.versions) } From 5e80df0f949557883ea6e961b0ea7ba3214065c5 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 13 Apr 2022 15:27:53 +0200 Subject: [PATCH 152/214] Apply suggestions from code review --- conf/test.config | 4 ++-- nextflow.config | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/test.config b/conf/test.config index 8f12312..9fa5de8 100644 --- a/conf/test.config +++ b/conf/test.config @@ -31,6 +31,6 @@ params { perform_shortread_clipmerge = true perform_longread_clip = false perform_shortread_complexityfilter = true - perform_shortread_hostremoval = true - shortread_hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + perform_shortread_hostremoval = true + shortread_hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' } diff --git a/nextflow.config b/nextflow.config index 94c5837..bf3ca92 100644 --- a/nextflow.config +++ b/nextflow.config @@ -81,7 +81,7 @@ params { save_runmerged_reads = false // Host Removal - perform_shortread_hostremoval = false + perform_shortread_hostremoval = false shortread_hostremoval_reference = null shortread_hostremoval_index = null From 68ce5843a475b6d4a28713a82ecc30816368d0ff Mon Sep 17 00:00:00 2001 From: sofstam Date: Wed, 13 Apr 2022 18:51:56 +0200 Subject: [PATCH 153/214] Add kaiju in taxprofiler --- conf/modules.config | 9 ++++ conf/test.config | 13 ++--- modules/nf-core/modules/kaiju/kaiju/main.nf | 41 +++++++++++++++ modules/nf-core/modules/kaiju/kaiju/meta.yml | 53 ++++++++++++++++++++ nextflow.config | 3 ++ subworkflows/local/profiling.nf | 13 +++++ 6 files changed, 126 insertions(+), 6 deletions(-) create mode 100644 modules/nf-core/modules/kaiju/kaiju/main.nf create mode 100644 modules/nf-core/modules/kaiju/kaiju/meta.yml diff --git a/conf/modules.config b/conf/modules.config index ccd1748..f85df30 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -245,4 +245,13 @@ process { ] } + withName: KAIJU_KAIJU { + publishDir = [ + path: { "${params.outdir}/kaiju/${meta.db_name}" }, + mode: 
params.publish_dir_mode, + pattern: '*.tsv' + ] + ext.args = { "${meta.db_params}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + } } diff --git a/conf/test.config b/conf/test.config index 9fa5de8..884ff32 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,15 +22,16 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' - run_kraken2 = true - run_malt = true - run_metaphlan3 = true - run_centrifuge = true + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + run_kraken2 = true + run_malt = true + run_metaphlan3 = true + run_centrifuge = true perform_shortread_clipmerge = true perform_longread_clip = false perform_shortread_complexityfilter = true perform_shortread_hostremoval = true shortread_hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true } diff --git a/modules/nf-core/modules/kaiju/kaiju/main.nf b/modules/nf-core/modules/kaiju/kaiju/main.nf new file mode 100644 index 0000000..ae8f99e --- /dev/null +++ b/modules/nf-core/modules/kaiju/kaiju/main.nf @@ -0,0 +1,41 @@ +process KAIJU_KAIJU { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::kaiju=1.8.2" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kaiju:1.8.2--h5b5514e_1': + 'quay.io/biocontainers/kaiju:1.8.2--h5b5514e_1' }" + + input: + tuple val(meta), path(reads) + path(db) + + output: + tuple val(meta), path('*.tsv'), emit: results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input = meta.single_end ? 
"-i ${reads}" : "-i ${reads[0]} -j ${reads[1]}" + """ + dbnodes=`find -L ${db} -name "*nodes.dmp"` + dbname=`find -L ${db} -name "*.fmi" -not -name "._*"` + kaiju \\ + $args \\ + -z $task.cpus \\ + -t \$dbnodes \\ + -f \$dbname \\ + -o ${prefix}.tsv \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/kaiju/kaiju/meta.yml b/modules/nf-core/modules/kaiju/kaiju/meta.yml new file mode 100644 index 0000000..e24c8ef --- /dev/null +++ b/modules/nf-core/modules/kaiju/kaiju/meta.yml @@ -0,0 +1,53 @@ +name: kaiju_kaiju +description: Taxonomic classification of metagenomic sequence data using a protein reference database +keywords: + - classify + - metagenomics + - fastq + - taxonomic profiling +tools: + - kaiju: + description: Fast and sensitive taxonomic classification for metagenomics + homepage: https://kaiju.binf.ku.dk/ + documentation: https://github.com/bioinformatics-centre/kaiju/blob/master/README.md + tool_dev_url: https://github.com/bioinformatics-centre/kaiju + doi: "10.1038/ncomms11257" + licence: ["GNU GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input fastq/fasta files of size 1 and 2 for single-end and paired-end data, + respectively. + pattern: "*.{fastq,fq,fasta,fa,fsa,fas,fna,fastq.gz,fq.gz,fasta.gz,fa.gz,fsa.gz,fas.gz,fna.gz}" + - db: + type: files + description: | + List containing the database and nodes files for Kaiju + e.g. [ 'database.fmi', 'nodes.dmp' ] + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - results: + type: file + description: Results with taxonomic classification of each read + pattern: "*.tsv" + +authors: + - "@talnor" + - "@sofstam" + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index bf3ca92..9ad84d5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -100,6 +100,9 @@ params { // metaphlan3 run_metaphlan3 = false + + // kaiju + run_kaiju = false } // Load base.config by default for all pipelines diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 18de739..272b5b4 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -6,6 +6,7 @@ include { MALT_RUN } from '../../modules/nf-core/modules/malt include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/modules/kraken2/kraken2/main' include { CENTRIFUGE_CENTRIFUGE } from '../../modules/nf-core/modules/centrifuge/centrifuge/main' include { METAPHLAN3 } from '../../modules/nf-core/modules/metaphlan3/main' +include { KAIJU_KAIJU } from '../../modules/nf-core/modules/kaiju/kaiju/main' workflow PROFILING { take: @@ -35,6 +36,7 @@ workflow PROFILING { kraken2: it[2]['tool'] == 'kraken2' metaphlan3: it[2]['tool'] == 'metaphlan3' centrifuge: it[2]['tool'] == 'centrifuge' + kaiju: it[2]['tool'] == 'kaiju' unknown: true } @@ -88,6 +90,13 @@ workflow PROFILING { db: it[3] } + ch_input_for_kaiju = ch_input_for_profiling.kaiju + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + /* RUN PROFILING */ @@ -114,6 +123,10 @@ workflow PROFILING { ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() ) } + if ( params.run_kaiju ) { + KAIJU_KAIJU ( ch_input_for_kaiju.reads, ch_input_for_kaiju.db ) + ch_versions = ch_versions.mix( KAIJU_KAIJU.out.versions.first() ) + } emit: // TODO work out if there is enough standardisation of output to export as one? From 6736a2a980edd1244c8f3943d385f07f5b8a69ff Mon Sep 17 00:00:00 2001 From: sofstam Date: Wed, 13 Apr 2022 19:00:52 +0200 Subject: [PATCH 154/214] Update README.md, nextflow_schema.json --- README.md | 1 + modules.json | 3 +++ nextflow_schema.json | 34 +++++++++++++++++++++++++++------- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index f1d59d5..66578e5 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ On release, automated continuous integration tests run the pipeline on a full-si - [Centrifuge](https://ccb.jhu.edu/software/centrifuge/) - [Kaiju](https://kaiju.binf.ku.dk/) - [mOTUs](https://motu-tool.org/) + - [MetaMaps](https://github.com/DiltheyLab/MetaMaps) 4. Perform optional post-processing with: - [bracken](https://ccb.jhu.edu/software/bracken/) 5. 
Standardises output tables diff --git a/modules.json b/modules.json index 2be0b7f..726e701 100644 --- a/modules.json +++ b/modules.json @@ -50,6 +50,9 @@ }, "untar": { "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" + }, + "kaiju/kaiju": { + "git_sha": "8856f127c58f6af479128be8b8df4d42e442ddbe" } } } diff --git a/nextflow_schema.json b/nextflow_schema.json index cf0edab..75854bf 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -294,7 +304,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -335,7 +348,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -364,11 +380,15 @@ }, "shortread_hostremoval_reference": { "type": "string", - "default": null + "default": "None" }, "shortread_hostremoval_index": { "type": "string", - "default": null + "default": "None" + }, + "run_kaiju": { + "type": "string", + "default": "false" } } -} +} \ No newline at end of file From 0e5c9e7bdd4f00323969f5f6d7217cc33ba63f5b Mon Sep 17 00:00:00 2001 From: sofstam Date: Thu, 14 Apr 2022 09:14:05 +0200 Subject: [PATCH 155/214] Type:boolean for kaiju in nextflow_schema.json --- nextflow_schema.json | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 75854bf..5fe371d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -387,8 +387,7 @@ "default": "None" }, "run_kaiju": { - "type": "string", - "default": "false" + "type": "boolean" } } -} \ No newline at end of file +} From cc73cdd51d26330d831ec306f8a5da081fb8665a Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 16 Apr 2022 07:42:30 +0200 Subject: [PATCH 156/214] Add generation of taxon-table like output for MALT --- CITATIONS.md | 10 ++++- conf/modules.config | 15 ++++++- nextflow.config | 1 + nextflow_schema.json | 7 +++- subworkflows/local/profiling.nf | 32 +++++++++++---- .../local/shortread_postprocessing.nf | 39 ------------------- 6 files changed, 52 insertions(+), 52 deletions(-) delete mode 100644 subworkflows/local/shortread_postprocessing.nf diff --git a/CITATIONS.md b/CITATIONS.md index e7bcf47..02621d9 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -40,9 +40,17 @@ > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. 
Spyrou, et al. 2018. Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico. Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6. +- [MEGAN](https://doi.org/10.1371/journal.pcbi.1004957) + + > Huson, Daniel H., Sina Beier, Isabell Flade, Anna Górska, Mohamed El-Hadidi, Suparna Mitra, Hans-Joachim Ruscheweyh, and Rewati Tappu. 2016. “MEGAN Community Edition - Interactive Exploration and Analysis of Large-Scale Microbiome Sequencing Data.” PLoS Computational Biology 12 (6): e1004957. doi: 10.1371/journal.pcbi.1004957. + - [MetaPhlAn3](https://doi.org/10.7554/eLife.65088) - > Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088. + > Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088. doi: 10.7554/eLife.65088 + +- [Centrifuge](https://doi.org/10.1101/gr.210641.116) + + > Kim, Daehwan, Li Song, Florian P. Breitwieser, and Steven L. Salzberg. 2016. “Centrifuge: Rapid and Sensitive Classification of Metagenomic Sequences.” Genome Research 26 (12): 1721-29. doi: 10.1101/gr.210641.116. ## Software packaging/containerisation tools diff --git a/conf/modules.config b/conf/modules.config index ccd1748..23455a4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -191,11 +191,22 @@ process { withName: MALT_RUN { ext.args = { "${meta.db_params}" } - ext.prefix = params.perform_runmerging ? 
{ "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + // one run with multiple samples, so fix ID to just db name to ensure clean log name + ext.prefix = { "${meta.db_name}" } publishDir = [ path: { "${params.outdir}/malt/${meta.db_name}" }, mode: params.publish_dir_mode, - pattern: '*.{log}' + pattern: '*.{rma6,log,sam}' + ] + } + + withName: MEGAN_RMA2INFO { + ext.args = "-c2c Taxonomy" + ext.prefix = { "${meta.id}" } + publishDir = [ + path: { "${params.outdir}/malt/${meta.db_name}" }, + mode: params.publish_dir_mode, + pattern: '*.{txt.gz,megan}' ] } diff --git a/nextflow.config b/nextflow.config index bf3ca92..a618f66 100644 --- a/nextflow.config +++ b/nextflow.config @@ -88,6 +88,7 @@ params { // MALT run_malt = false malt_mode = 'BlastN' + malt_generatemegansummary = false // kraken2 run_kraken2 = false diff --git a/nextflow_schema.json b/nextflow_schema.json index cf0edab..2cbfb0e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -364,11 +364,14 @@ }, "shortread_hostremoval_reference": { "type": "string", - "default": null + "default": "None" }, "shortread_hostremoval_index": { "type": "string", - "default": null + "default": "None" + }, + "malt_generatemegansummary": { + "type": "boolean" } } } diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 23b96e7..d7532b7 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -3,6 +3,7 @@ // include { MALT_RUN } from '../../modules/nf-core/modules/malt/run/main' +include { MEGAN_RMA2INFO } from '../../modules/nf-core/modules/megan/rma2info/main' include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/modules/kraken2/kraken2/main' include { CENTRIFUGE_CENTRIFUGE } from '../../modules/nf-core/modules/centrifuge/centrifuge/main' include { METAPHLAN3 } from '../../modules/nf-core/modules/metaphlan3/main' @@ -95,33 +96,48 @@ workflow PROFILING { if ( params.run_malt ) { MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) - ch_raw_profiles = ch_raw_profiles.mix( MALT_RUN.out.rma6 ) + + ch_maltrun_for_megan = MALT_RUN.out.rma6 + .transpose() + .map{ + meta, rma -> + // re-extract meta from file names, use filename without rma to + // ensure we keep paired-end information in downstream filenames + // when no pair-merging + def meta_new = meta.clone() + meta_new['db_name'] = meta.id + meta_new['id'] = rma.name - ( '.' 
+ rma.extension ) + [ meta_new, rma ] + } + + MEGAN_RMA2INFO (ch_maltrun_for_megan, params.malt_generatemegansummary ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( MEGAN_RMA2INFO.out.txt ) } if ( params.run_kraken2 ) { KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.txt ) } if ( params.run_centrifuge ) { CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format ) - ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) + ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( CENTRIFUGE_CENTRIFUGE.out.report ) } if ( params.run_metaphlan3 ) { METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) - ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() ) + ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN3.out.biom ) } emit: - profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] + profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom versions = ch_versions // channel: [ versions.yml ] mqc = ch_multiqc_files } diff --git a/subworkflows/local/shortread_postprocessing.nf b/subworkflows/local/shortread_postprocessing.nf deleted file mode 100644 index 7fb0d70..0000000 --- a/subworkflows/local/shortread_postprocessing.nf +++ /dev/null @@ -1,39 +0,0 @@ -// -// Perform read trimming and merging -// - - -include { SHORTREAD_FASTP } from './shortread_fastp' -include { SHORTREAD_ADAPTERREMOVAL } from './shortread_adapterremoval' -include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main' - -workflow SHORTREAD_POSTPROCESSING { - take: - input // [ [ meta ], [ taxon_table/file ] ] - - main: - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() - - if ( params.shortread_clipmerge_tool == "fastp" ) { - ch_processed_reads = SHORTREAD_FASTP ( reads ).reads - ch_versions = ch_versions.mix( SHORTREAD_FASTP.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc ) - } else if ( params.shortread_clipmerge_tool == "adapterremoval" ) { - ch_processed_reads = SHORTREAD_ADAPTERREMOVAL ( reads ).reads - ch_versions = ch_versions.mix( SHORTREAD_ADAPTERREMOVAL.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_ADAPTERREMOVAL.out.mqc ) - } else { - ch_processed_reads = reads - } - - FASTQC_PROCESSED ( ch_processed_reads ) - ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) - - emit: - output = output // channel: [ val(meta), taxon_table ] - versions = ch_versions // channel: [ versions.yml ] - mqc = ch_multiqc_files -} - From d0256ec9378eac0cad715fe364d997a84e9b312b Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 16 Apr 2022 07:45:24 +0200 
Subject: [PATCH 157/214] Prettier

---
 modules.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules.json b/modules.json
index 18c0e69..18dea60 100644
--- a/modules.json
+++ b/modules.json
@@ -56,4 +56,4 @@
         }
     }
 }
-}
\ No newline at end of file
+}

From 818d3bd053beb2a33730d91d8556c544ff94b417 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Sun, 17 Apr 2022 17:53:22 +0200
Subject: [PATCH 158/214] Complete documentation for preprocessing subworkflows

---
 docs/usage.md | 107 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 103 insertions(+), 4 deletions(-)

diff --git a/docs/usage.md b/docs/usage.md
index 239a55d..528aeef 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -10,6 +10,8 @@

 ## Samplesheet inputs

+nf-core/taxprofiler can accept as input raw or preprocessed single- or paired-end short-read (e.g. Illumina) FASTQ files, long-read FASTQ files (e.g. Oxford Nanopore), or FASTA sequences (when accepted by a profiler).
+
 You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below. Furthermore, nf-core/taxprofiler also requires a second comma-separated file of 3 columns with a header row as in the examples below.

 This samplesheet is then specified on the command line as follows:

@@ -20,7 +22,7 @@ This samplesheet is then specified on the command line as follows:

 ### Multiple runs of the same sample

-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will process reads before performing profiling. Below is an example for the same sample sequenced across 3 lanes:
+The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the FASTQ files of different runs of the same sample before performing profiling, when `--perform_runmerging` is supplied. Below is an example for the same sample sequenced across 3 lanes:

 ```console
 sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
@@ -36,7 +38,7 @@ sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta

 The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 6 columns to match those defined in the table below.

-A final samplesheet file consisting of both single- and paired-end data, as well as long-read FASTA fies may look something like the one below. This is for 6 samples, where `2612` has been sequenced twice.
+A final samplesheet file consisting of both single- and paired-end data, as well as long-read FASTA files may look something like the one below. This is for 6 samples, where `2612` has been sequenced twice.

 ```console
 2611,ERR5766174,ILLUMINA,,,///fasta/ERX5474930_ERR5766174_1.fa.gz
@@ -59,9 +61,11 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p

 ### Full database sheet

-nf-core/taxprofiler supports multiple databases being profiled in parallel for each tool. These databases, and specific parameters for each, can be specified in a 4 column comma-separated sheet.
+nf-core/taxprofiler supports multiple databases being profiled in parallel for each tool.
+Databases can be supplied either in the form of a compressed `.tar.gz` archive of a folder containing all relevant database files or the path to an (uncompressed) directory.
+The pipeline takes the locations of these databases as input, and specific parameters for each, via a 4 column comma-separated sheet.

-> ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user.
+> ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user. See below for more information on the expected database files.

 An example database sheet can look as follows, where 4 tools are being used, and `malt` and `kraken2` will be used against two databases each.

@@ -86,6 +90,41 @@ Column specifications are as follows:

 > 💡 You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during profiling.

+nf-core/taxprofiler will automatically decompress and extract any compressed archives for you.
+
+Expected (uncompressed) database files for each tool are as follows:
+
+- **MALT** output of `malt-build`. A directory containing:
+  - `ref.idx`
+  - `taxonomy.idx`
+  - `taxonomy.map`
+  - `index0.idx`
+  - `table0.idx`
+  - `table0.db`
+  - `ref.inf`
+  - `ref.db`
+  - `taxonomy.tre`
+- **Kraken2** output of `kraken2-build` command(s). A directory containing:
+  - `opts.k2d`
+  - `hash.k2d`
+  - `taxo.k2d`
+- **Centrifuge** output of `centrifuge-build`. A directory containing:
+  - `<database name>.1.cf`
+  - `<database name>.2.cf`
+  - `<database name>.3.cf`
+  - `<database name>.4.cf`
+- **MetaPhlAn3** generated with `metaphlan --install` or downloaded from links on the [MetaPhlAn3 wiki](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-3.0#customizing-the-database). A directory containing:
+  - `mpa_v30_CHOCOPhlAn_201901.pkl`
+  - `mpa_v30_CHOCOPhlAn_201901.pkl`
+  - `mpa_v30_CHOCOPhlAn_201901.fasta`
+  - `mpa_v30_CHOCOPhlAn_201901.3.bt2`
+  - `mpa_v30_CHOCOPhlAn_201901.4.bt2`
+  - `mpa_v30_CHOCOPhlAn_201901.1.bt2`
+  - `mpa_v30_CHOCOPhlAn_201901.2.bt2`
+  - `mpa_v30_CHOCOPhlAn_201901.rev.1.bt2`
+  - `mpa_v30_CHOCOPhlAn_201901.rev.2.bt2`
+  - `mpa_latest`

 ## Running the pipeline

 The typical command for running the pipeline is as follows:

@@ -105,6 +144,66 @@ work            # Directory containing the nextflow working files
 #                 # Other nextflow hidden files, eg. history of pipeline runs and old logs.
 ```

+### Preprocessing Steps
+
+nf-core/taxprofiler offers four main preprocessing steps:
+
+- Read processing: adapter clipping and pair-merging.
+- Complexity filtering: removal of low-sequence complexity reads.
+- Host read-removal: removal of reads aligning to reference genome(s) of a host.
+- Run merging: concatenation of multiple FASTQ chunks/sequencing runs/libraries of a sample.
+
+#### Read Processing
+
+Raw sequencing read processing in the form of adapter clipping and paired-end read merging can be activated via the `--perform_shortread_clipmerge` or `--perform_longread_clip` flags.
+
+It is highly recommended to run this on raw reads to remove artefacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles.
+
+There are currently two options for short-read preprocessing: `fastp` or `adapterremoval`.
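As a quick orientation, the sketch below shows how these preprocessing options might be combined in a custom Nextflow config supplied via `-c`. It is illustrative only: the parameter names are those introduced in this section, but all values (including the TruSeq-style adapter sequences) are example choices, not recommendations.

```nextflow
// Hypothetical custom.config: enable short-read clipping/merging with fastp.
// All values below are illustrative, not recommended defaults.
params {
    perform_shortread_clipmerge         = true
    shortread_clipmerge_tool            = 'fastp'
    shortread_clipmerge_skipadaptertrim = false
    // Only needed if the tool's default/auto-detected adapters are not suitable:
    shortread_clipmerge_adapter1        = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCA' // example R1 adapter
    shortread_clipmerge_adapter2        = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT' // example R2 adapter
    shortread_clipmerge_mergepairs      = true
    shortread_clipmerge_excludeunmerged = false
    shortread_clipmerge_minlength       = 15
}
```

Such a file would then be used at runtime with, for example, `nextflow run nf-core/taxprofiler -profile docker -c custom.config --input samplesheet.csv --databases database.csv --outdir results`.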
+
+For adapter clipping, you can either rely on tool default adapter sequences, or supply your own adapters (`--shortread_clipmerge_adapter1` and `--shortread_clipmerge_adapter2`).
+By default, paired-end merging is not activated and paired-end profiling is performed where supported; otherwise pairs will be independently profiled. If paired-end merging is activated you can also specify whether to exclude unmerged reads from the reads sent for profiling (`--shortread_clipmerge_mergepairs` and `--shortread_clipmerge_excludeunmerged`).
+You can also turn off clipping and only perform paired-end merging, if requested. This can be useful when processing data downloaded from the ENA, SRA, or DDBJ (`--shortread_clipmerge_skipadaptertrim`).
+Both tools support length filtering of reads and can be tuned with `--shortread_clipmerge_minlength`. Performing length filtering can be useful to remove short (often low sequencing complexity) sequences that result in unspecific classification and therefore slow down runtime during profiling, with minimal gain.
+
+There is currently one option for long-read Oxford Nanopore processing: `porechop`.
+
+For both short-read and long-read preprocessing, you can optionally save the resulting processed reads with `--save_preprocessed_reads`.
+
+#### Complexity Filtering
+
+Complexity filtering can be activated via the `--perform_shortread_complexityfilter` flag.
+
+Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up the run-time of each tool by removing reads with a low diversity of nucleotides (e.g. mono-nucleotide repeats such as `AAAAAAAA`, or di-nucleotide repeats such as `GAGAGAGAGAGAGAG`) that have a low chance of giving an informative taxonomic ID, as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources.
+
+There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus).
+
+The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset.
+
+You can optionally save the FASTQ output of the complexity filtering with `--save_complexityfiltered_reads`.
+
+#### Host Removal
+
+Removal of possible-host reads from FASTQ files prior to profiling can be activated with `--perform_shortread_hostremoval`.
+
+Similarly to complexity filtering, host-removal can be useful for runtime optimisation and reduction in misclassified reads. It is not always necessary to report the classification of reads from a host when you already know the host of the sample; you can therefore gain a run-time and computational advantage by removing them with more efficient methods prior to the typically resource-heavy profiling. Furthermore, particularly with human samples, you can reduce the number of false positives during profiling that occur due to host-sequence contamination in reference genomes on public databases.
+
+nf-core/taxprofiler currently offers host-removal via alignment against a reference genome with Bowtie2, and the use of the unaligned reads for downstream profiling.
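To make this step more concrete, below is a minimal sketch of the wiring inside a subworkflow such as `SHORTREAD_HOSTREMOVAL`, using the `BOWTIE2_BUILD` and `BOWTIE2_ALIGN` modules configured elsewhere in this series. The module signatures are assumed from their usage here; treat this as an outline under those assumptions, not the pipeline's exact implementation.

```nextflow
// Sketch only: build a Bowtie2 index when none is supplied, align reads
// against the host reference, and keep the *unaligned* reads for profiling.
include { BOWTIE2_BUILD } from '../../modules/nf-core/modules/bowtie2/build/main'
include { BOWTIE2_ALIGN } from '../../modules/nf-core/modules/bowtie2/align/main'

workflow SHORTREAD_HOSTREMOVAL_SKETCH {
    take:
    reads     // channel: [ val(meta), [ reads ] ]
    reference // path: host FASTA (--shortread_hostremoval_reference)
    index     // path: pre-built Bowtie2 index directory (--shortread_hostremoval_index), or []

    main:
    ch_versions = Channel.empty()

    if ( !index ) {
        ch_index    = BOWTIE2_BUILD ( reference ).index
        ch_versions = ch_versions.mix( BOWTIE2_BUILD.out.versions )
    } else {
        ch_index = index
    }

    // Third argument: save_unaligned, so non-host reads are emitted as FASTQ.
    BOWTIE2_ALIGN ( reads, ch_index, true )
    ch_versions = ch_versions.mix( BOWTIE2_ALIGN.out.versions.first() )

    emit:
    reads    = BOWTIE2_ALIGN.out.fastq // non-host reads for downstream profiling
    versions = ch_versions
}
```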
+
+You can supply your reference genome in FASTA format with `--shortread_hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index`, however nf-core/taxprofiler will generate this for you if necessary.
+
+> 💡 If you have multiple taxa or sequences you wish to remove (e.g., the host genome and then also PhiX - a common quality-control reagent during sequencing) you can simply concatenate the FASTAs of each taxon or sequence into a single reference file.
+
+#### Run Merging
+
+For samples that may have been sequenced over multiple runs, or for FASTQ files split into multiple chunks, you can activate the ability to merge across all runs or chunks with `--perform_runmerging`.
+
+For more information on how to set up your input samplesheet, see [Multiple runs of the same sample](#multiple-runs-of-the-same-sample).
+
+Activating this functionality will concatenate together the FASTQ files with the same sample name _after_ the optional preprocessing steps and prior to profiling. Note that runs of different pairing types will **not** be merged together; output files will instead be indicated with a `_se` or `_pe` suffix to the sample name accordingly (a condensed sketch of the merging logic is shown below).
+
+You can optionally save the FASTQ output of the run merging with `--save_runmerged_reads`.
+
 ### Updating the pipeline

 When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:

From 457c13e8b74f4fef01f8631bbbdfb9583777760c Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Sun, 17 Apr 2022 17:55:23 +0200
Subject: [PATCH 159/214] Add additional tips for host removal

---
 docs/usage.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/usage.md b/docs/usage.md
index 528aeef..919569c 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -190,7 +190,7 @@ Similarly to complexity filtering, host-removal can be useful for runtime optimi

 nf-core/taxprofiler currently offers host-removal via alignment against a reference genome with Bowtie2, and the use of the unaligned reads for downstream profiling.

-You can supply your reference genome in FASTA format with `--shortread_hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index`, however nf-core/taxprofiler will generate this for you if necessary.
+You can supply your reference genome in FASTA format with `--shortread_hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index`, however nf-core/taxprofiler will generate this for you if necessary. Pre-supplying the directory of index files can greatly speed up the process, and these can be re-used.

 > 💡 If you have multiple taxa or sequences you wish to remove (e.g., the host genome and then also PhiX - a common quality-control reagent during sequencing) you can simply concatenate the FASTAs of each taxon or sequence into a single reference file.
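The run-merging behaviour documented above is implemented with the channel logic introduced in PATCH 148 earlier in this series; a condensed sketch follows, with the surrounding workflow omitted (`ch_preprocessed_reads` here stands in for the mixed short-read/long-read channel after all preprocessing):

```nextflow
// Condensed from workflows/taxprofiler.nf: group all runs of a sample,
// then only send samples that actually have multiple runs to CAT_FASTQ.
ch_reads_for_cat_branch = ch_preprocessed_reads
    .map {
        meta, reads ->
        def meta_new = meta.clone()
        meta_new.remove('run_accession') // merge across all runs of one sample
        [ meta_new, reads ]
    }
    .groupTuple()
    .map { meta, reads -> [ meta, reads.flatten() ] }
    .branch {
        meta, reads ->
        // single-end: >1 file implies multiple runs; paired-end: >2 files
        cat: ( meta.single_end && reads.size() > 1 ) || ( !meta.single_end && reads.size() > 2 )
        skip: true
    }

// Concatenate only where needed, then mix the untouched samples back in.
ch_reads_runmerged = CAT_FASTQ ( ch_reads_for_cat_branch.cat ).reads
    .mix( ch_reads_for_cat_branch.skip.map { meta, reads -> [ meta, [ reads ].flatten() ] } )
```

Because `run_accession` is dropped from the grouping key, output names fall back to the sample `id` (plus the `_se`/`_pe` suffix mentioned above).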
From 2cfcdf2dd2bcc5141b0e7a6474dd27646fa6ee25 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Sun, 17 Apr 2022 22:27:04 +0200
Subject: [PATCH 160/214] Add finer control over hostremoval saving

---
 conf/modules.config  | 20 ++++++++++++++++----
 nextflow.config      |  7 +++++--
 nextflow_schema.json | 39 ++++++++++++++++++++++++++++++-------
 3 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index ccd1748..55119ff 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -133,21 +133,33 @@ process {
     }

     withName: BOWTIE2_BUILD {
-        ext.prefix = { "${meta.id}_${meta.run_accession}" }
         publishDir = [
             path: { "${params.outdir}/bowtie2/build" },
             mode: params.publish_dir_mode,
-            pattern: '*.bt2'
+            enabled: params.save_hostremoval_index,
+            pattern: 'bowtie2/*'
         ]
     }

     withName: BOWTIE2_ALIGN {
         ext.prefix = { "${meta.id}_${meta.run_accession}" }
+        publishDir = [[
             path: { "${params.outdir}/bowtie2/align" },
             mode: params.publish_dir_mode,
             pattern: '*.log'
         ],
         [
             path: { "${params.outdir}/bowtie2/align" },
             mode: params.publish_dir_mode,
             enabled: params.save_hostremoval_mapped,
             pattern: '*.bam'
         ],
         publishDir = [
             path: { "${params.outdir}/bowtie2/align" },
             mode: params.publish_dir_mode,
-            pattern: '*.{fastq.gz,bam}'
-        ]
+            enabled: params.save_hostremoval_unmapped,
+            pattern: '*.fastq.gz'
+        ]]
     }

     withName: BBMAP_BBDUK {

diff --git a/nextflow.config b/nextflow.config
index bf3ca92..5616b0e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -82,8 +82,11 @@ params {

     // Host Removal
     perform_shortread_hostremoval   = false
-    shortread_hostremoval_reference = null
-    shortread_hostremoval_index     = null
+    shortread_hostremoval_reference = null
+    shortread_hostremoval_index     = null
+    save_hostremoval_index          = false
+    save_hostremoval_mapped         = false
+    save_hostremoval_unmapped       = false

     // MALT
     run_malt   = false

diff --git a/nextflow_schema.json b/nextflow_schema.json
index cf0edab..7890517 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,7 +10,10 @@
         "type": "object",
         "fa_icon": "fas fa-terminal",
         "description": "Define where the pipeline should find input data and save output data.",
-        "required": ["input", "outdir"],
+        "required": [
+            "input",
+            "outdir"
+        ],
         "properties": {
             "input": {
                 "type": "string",
@@ -173,7 +176,14 @@
             "description": "Method used to save pipeline results to output directory.",
             "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files.
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -294,7 +304,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -335,7 +348,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -364,11 +380,20 @@ }, "shortread_hostremoval_reference": { "type": "string", - "default": null + "default": "None" }, "shortread_hostremoval_index": { "type": "string", - "default": null + "default": "None" + }, + "save_hostremoval_index": { + "type": "boolean" + }, + "save_hostremoval_mapped": { + "type": "boolean" + }, + "save_hostremoval_unmapped": { + "type": "boolean" } } -} +} \ No newline at end of file From b036ed5b7354df252bf819dfdd65771b8ebcfc98 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 17 Apr 2022 22:28:18 +0200 Subject: [PATCH 161/214] Prettier --- nextflow_schema.json | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 7890517..c0d3597 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -304,10 +294,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -348,10 +335,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -396,4 +380,4 @@ "type": "boolean" } } -} \ No newline at end of file +} From 9880baa0b813a1d7721c9270224a37bb064a31fc Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 18 Apr 2022 06:37:23 +0200 Subject: [PATCH 162/214] Get build directory working --- conf/modules.config | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 55119ff..ba3c5c9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -137,29 +137,31 @@ process { path: { "${params.outdir}/bowtie2/build" }, mode: params.publish_dir_mode, enabled: params.save_hostremoval_index, - pattern: 'bowtie2/*' + pattern: 'bowtie2' ] } withName: BOWTIE2_ALIGN { ext.prefix = { "${meta.id}_${meta.run_accession}" } - publishDir = [[ + publishDir = [ + [ path: { "${params.outdir}/bowtie2/align" }, mode: params.publish_dir_mode, pattern: '*.log' - ], - [ + ], + [ path: { "${params.outdir}/bowtie2/align" }, mode: params.publish_dir_mode, enabled: params.save_hostremoval_mapped, pattern: '*.bam' - ], - publishDir = [ + ], + [ path: { "${params.outdir}/bowtie2/align" }, mode: params.publish_dir_mode, enabled: params.save_hostremoval_unmapped, pattern: '*.fastq.gz' - ]] + ] + ] } withName: BBMAP_BBDUK { From 564474cafeb77ed1e39d127ed4a08667a4522878 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 18 Apr 2022 07:21:12 +0200 Subject: [PATCH 163/214] Adds filtering and warning for non-FASTA supporting tool. Standardised warning messages. 
--- subworkflows/local/input_check.nf | 2 ++ subworkflows/local/profiling.nf | 8 ++++++++ workflows/taxprofiler.nf | 11 ++++++----- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index db46f2b..eb21b9d 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -45,6 +45,7 @@ def create_fastq_channel(LinkedHashMap row) { meta.run_accession = row.run_accession meta.instrument_platform = row.instrument_platform meta.single_end = row.single_end.toBoolean() + meta.is_fasta = false // add path(s) of the fastq file(s) to the meta map def fastq_meta = [] @@ -75,6 +76,7 @@ def create_fasta_channel(LinkedHashMap row) { meta.run_accession = row.run_accession meta.instrument_platform = row.instrument_platform meta.single_end = true + meta.is_fasta = true def array = [] if (!file(row.fasta).exists()) { diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 18de739..f2f7eb7 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -75,6 +75,10 @@ workflow PROFILING { } ch_input_for_centrifuge = ch_input_for_profiling.centrifuge + .filter{ + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge does not accept FASTA files as input. Skipping Centrifuge for sample " + it[0].id + !it[0].is_fasta + } .multiMap { it -> reads: [ it[0] + it[2], it[1] ] @@ -82,6 +86,10 @@ workflow PROFILING { } ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 + .filter{ + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 does not accept FASTA files as input. Skipping MetaPhlAn3 for sample " + it[0].id + !it[0].is_fasta + } .multiMap { it -> reads: [it[0] + it[2], it[1]] diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 9fe8cc8..e7ff78b 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -19,11 +19,11 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } -if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." -if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "[nf-core/taxprofiler] error: cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" +if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler]: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." +if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" -if (params.perform_shortread_hostremoval && !params.shortread_hostremoval_reference) { exit 1, "[nf-core/taxprofiler] error: --shortread_hostremoval requested but no --shortread_hostremoval_reference FASTA supplied. Check input." 
} -if (!params.shortread_hostremoval_reference && params.shortread_hostremoval_reference_index) { exit 1, "[nf-core/taxprofiler] error: --shortread_hostremoval_index provided but no --shortread_hostremoval_reference FASTA supplied. Check input." } +if (params.perform_shortread_hostremoval && !params.shortread_hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --shortread_hostremoval_reference FASTA supplied. Check input." } +if (!params.shortread_hostremoval_reference && params.shortread_hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --shortread_hostremoval_reference FASTA supplied. Check input." } if (params.shortread_hostremoval_reference ) { ch_reference = file(params.shortread_hostremoval_reference) } if (params.shortread_hostremoval_index ) { ch_reference_index = file(params.shortread_hostremoval_index ) } else { ch_reference_index = [] } @@ -175,10 +175,11 @@ workflow TAXPROFILER { meta, reads -> [ meta, [ reads ].flatten() ] } - + .mix( INPUT_CHECK.out.fasta ) } else { ch_reads_runmerged = ch_shortreads_hostremoved .mix( ch_longreads_preprocessed ) + .mix( INPUT_CHECK.out.fasta ) } /* From 491c2d894967f76481b566758c40be369b448603 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 18 Apr 2022 07:26:15 +0200 Subject: [PATCH 164/214] Revert "Adds filtering and warning for non-FASTA supporting tool. Standardised warning messages." This reverts commit 564474cafeb77ed1e39d127ed4a08667a4522878. --- subworkflows/local/input_check.nf | 2 -- subworkflows/local/profiling.nf | 8 -------- workflows/taxprofiler.nf | 11 +++++------ 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index eb21b9d..db46f2b 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -45,7 +45,6 @@ def create_fastq_channel(LinkedHashMap row) { meta.run_accession = row.run_accession meta.instrument_platform = row.instrument_platform meta.single_end = row.single_end.toBoolean() - meta.is_fasta = false // add path(s) of the fastq file(s) to the meta map def fastq_meta = [] @@ -76,7 +75,6 @@ def create_fasta_channel(LinkedHashMap row) { meta.run_accession = row.run_accession meta.instrument_platform = row.instrument_platform meta.single_end = true - meta.is_fasta = true def array = [] if (!file(row.fasta).exists()) { diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index f2f7eb7..18de739 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -75,10 +75,6 @@ workflow PROFILING { } ch_input_for_centrifuge = ch_input_for_profiling.centrifuge - .filter{ - if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge does not accept FASTA files as input. Skipping Centrifuge for sample " + it[0].id - !it[0].is_fasta - } .multiMap { it -> reads: [ it[0] + it[2], it[1] ] @@ -86,10 +82,6 @@ workflow PROFILING { } ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 - .filter{ - if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 does not accept FASTA files as input. 
Skipping MetaPhlAn3 for sample " + it[0].id - !it[0].is_fasta - } .multiMap { it -> reads: [it[0] + it[2], it[1]] diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index e7ff78b..9fe8cc8 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -19,11 +19,11 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } -if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler]: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." -if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" +if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." +if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "[nf-core/taxprofiler] error: cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" -if (params.perform_shortread_hostremoval && !params.shortread_hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --shortread_hostremoval_reference FASTA supplied. Check input." } -if (!params.shortread_hostremoval_reference && params.shortread_hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --shortread_hostremoval_reference FASTA supplied. Check input." } +if (params.perform_shortread_hostremoval && !params.shortread_hostremoval_reference) { exit 1, "[nf-core/taxprofiler] error: --shortread_hostremoval requested but no --shortread_hostremoval_reference FASTA supplied. Check input." } +if (!params.shortread_hostremoval_reference && params.shortread_hostremoval_reference_index) { exit 1, "[nf-core/taxprofiler] error: --shortread_hostremoval_index provided but no --shortread_hostremoval_reference FASTA supplied. Check input." 
} if (params.shortread_hostremoval_reference ) { ch_reference = file(params.shortread_hostremoval_reference) } if (params.shortread_hostremoval_index ) { ch_reference_index = file(params.shortread_hostremoval_index ) } else { ch_reference_index = [] } @@ -175,11 +175,10 @@ workflow TAXPROFILER { meta, reads -> [ meta, [ reads ].flatten() ] } - .mix( INPUT_CHECK.out.fasta ) + } else { ch_reads_runmerged = ch_shortreads_hostremoved .mix( ch_longreads_preprocessed ) - .mix( INPUT_CHECK.out.fasta ) } /* From 6307a9cf4342a05fdc6cb2a127527a66d3993155 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 18 Apr 2022 07:28:09 +0200 Subject: [PATCH 165/214] Typos --- subworkflows/local/profiling.nf | 2 +- workflows/taxprofiler.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index f2f7eb7..156259b 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -76,7 +76,7 @@ workflow PROFILING { ch_input_for_centrifuge = ch_input_for_profiling.centrifuge .filter{ - if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge does not accept FASTA files as input. Skipping Centrifuge for sample " + it[0].id + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge currently does not accept FASTA files as input. Skipping Centrifuge for sample " + it[0].id !it[0].is_fasta } .multiMap { diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index e7ff78b..884ee46 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -19,7 +19,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } -if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler]: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." +if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" if (params.perform_shortread_hostremoval && !params.shortread_hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --shortread_hostremoval_reference FASTA supplied. Check input." } From 9f634805e402c99bd906c523fdbba3350f4086e4 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Mon, 18 Apr 2022 07:31:44 +0200 Subject: [PATCH 166/214] Update subworkflows/local/profiling.nf --- subworkflows/local/profiling.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 4e0a3a9..659430f 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -87,7 +87,7 @@ workflow PROFILING { ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 .filter{ - if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge currently does not accept FASTA files as input. 
Skipping Centrifuge for sample " + it[0].id + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 currently does not accept FASTA files as input. Skipping MetaPhlAn3 for sample " + it[0].id !it[0].is_fasta } .multiMap { From ecaff371aa8047b1fea25f8b42fa7df1fc58e93b Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Mon, 18 Apr 2022 07:35:19 +0200 Subject: [PATCH 167/214] Update input_check.nf --- subworkflows/local/input_check.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index db46f2b..eb21b9d 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -45,6 +45,7 @@ def create_fastq_channel(LinkedHashMap row) { meta.run_accession = row.run_accession meta.instrument_platform = row.instrument_platform meta.single_end = row.single_end.toBoolean() + meta.is_fasta = false // add path(s) of the fastq file(s) to the meta map def fastq_meta = [] @@ -75,6 +76,7 @@ def create_fasta_channel(LinkedHashMap row) { meta.run_accession = row.run_accession meta.instrument_platform = row.instrument_platform meta.single_end = true + meta.is_fasta = true def array = [] if (!file(row.fasta).exists()) { From 87777a428959bc89f4f6cfda956c43312d4b6d16 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Tue, 19 Apr 2022 12:54:57 +0200 Subject: [PATCH 168/214] Update subworkflows/local/profiling.nf Co-authored-by: Moritz E. Beber --- subworkflows/local/profiling.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index d7532b7..5f9c111 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -106,7 +106,7 @@ workflow PROFILING { // when no pair-merging def meta_new = meta.clone() meta_new['db_name'] = meta.id - meta_new['id'] = rma.name - ( '.' + rma.extension ) + meta_new['id'] = rma.baseName [ meta_new, rma ] } From 1b7ff2265d146577c840399fcb167d032f98c88d Mon Sep 17 00:00:00 2001 From: sofstam Date: Tue, 19 Apr 2022 13:04:58 +0200 Subject: [PATCH 169/214] Update usage.md, reformat test.config and modules.config --- conf/modules.config | 4 ++-- conf/test.config | 8 ++++---- docs/usage.md | 5 +++++ 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f85df30..d9e9125 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -226,7 +226,7 @@ process { pattern: '*.txt' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } withName: CUSTOM_DUMPSOFTWAREVERSIONS { @@ -252,6 +252,6 @@ process { pattern: '*.tsv' ] ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + ext.prefix = params.perform_runmerging ? 
{ "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } } diff --git a/conf/test.config b/conf/test.config index 884ff32..107beb5 100644 --- a/conf/test.config +++ b/conf/test.config @@ -24,14 +24,14 @@ params { // TODO nf-core: Give any required params for the test so that command line flags are not needed input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' - run_kraken2 = true - run_malt = true - run_metaphlan3 = true - run_centrifuge = true perform_shortread_clipmerge = true perform_longread_clip = false perform_shortread_complexityfilter = true perform_shortread_hostremoval = true shortread_hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' run_kaiju = true + run_kraken2 = true + run_malt = true + run_metaphlan3 = true + run_centrifuge = true } diff --git a/docs/usage.md b/docs/usage.md index 239a55d..2b6937b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -86,6 +86,11 @@ Column specifications are as follows: > 💡 You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during profiling. +## Profilers + +- `kaiju` + - The file `nodes.dmp` must be inside the database directory that is specified to `--databases` file + ## Running the pipeline The typical command for running the pipeline is as follows: From 4e1b1269be42b3547678ac1ec2905401b8e47ab0 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Tue, 19 Apr 2022 13:11:43 +0200 Subject: [PATCH 170/214] Apply prettier --- nextflow_schema.json | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 5fe371d..12c0bac 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -304,10 +294,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -348,10 +335,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", From b7d0557ee08ff0aac70fc84f4c36bb6c881ddcc9 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Wed, 20 Apr 2022 09:15:45 +0200 Subject: [PATCH 171/214] Update subworkflows/local/profiling.nf Co-authored-by: Moritz E. Beber --- subworkflows/local/profiling.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 659430f..1968741 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -76,7 +76,9 @@ workflow PROFILING { ch_input_for_centrifuge = ch_input_for_profiling.centrifuge .filter{ - if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge currently does not accept FASTA files as input. Skipping Centrifuge for sample " + it[0].id + if (it[0].is_fasta) { + WorkflowTaxprofiler.logMessage("Centrifuge currently does not accept FASTA files as input. Skipping Centrifuge for sample ${it[0].id}.", log) + } !it[0].is_fasta } .multiMap { From ac714dd30f758a0f419ad4155b296cdba5e8a80a Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 20 Apr 2022 09:22:10 +0200 Subject: [PATCH 172/214] Apply suggestions from code review Co-authored-by: Moritz E. Beber --- docs/usage.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 919569c..f4f11fb 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,7 +10,7 @@ ## Samplesheet inputs -nf-core/profiler can accept as input raw or preprocessed single- or paired-end short-read (e.g. Illumina) FASTQ files, long-read FASTQ files (e.g. Oxford Nanopore), or FASTA sequences (when accepted by a profiler). +nf-core/taxprofiler can accept as input raw or preprocessed single- or paired-end short-read (e.g. Illumina) FASTQ files, long-read FASTQ files (e.g. Oxford Nanopore), or FASTA sequences (available for a subset of profilers). You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below. Furthermother, nf-core/taxprofiler also requires a second comma-separated file of 3 columns with a header row as in the examples below. @@ -62,8 +62,8 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p ### Full database sheet nf-core/taxprofiler supports multiple databases being profiled in parallel for each tool. -Databases can be supplied either be in the form of a compressed `.tar.gz` archive of a folder containing all relevant database files or the path to an (uncompressed) directory. -The pipelines takes the locations these of databases as input, and specific parameters for each, via a 4 column comma-separated sheet. +Databases can be supplied either in the form of a compressed `.tar.gz` archive of a directory containing all relevant database files or the path to a directory on the filesystem. +The pipeline takes the locations and specific parameters of these databases as input via a four column comma-separated sheet. > ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user. See below for more information of the expected database files. 
@@ -163,7 +163,7 @@ There are currently two options for short-read preprocessing: `fastp` or `adapte For adapter clipping, you can either rely on tool default adapter sequences, or supply your own adapters (`--shortread_clipmerge_adapter1` and `--shortread_clipmerge_adapter2`) By default, paired-end merging is not activated and paired-end profiling is performed where supported otherwise pairs will be independently profiled. If paired-end merging is activated you can also specify whether to exclude unmerged reads in the reads sent for profiling (`--shortread_clipmerge_mergepairs` and `--shortread_clipmerge_excludeunmerged`). -You can also turn off clipping and only perform paired-end merging, if requested. This can be useful when processing data downloaded from the ENA, SRA, or DDBJ (--shortread_clipmerge_skipadaptertrim). +You can also turn off clipping and only perform paired-end merging, if requested. This can be useful when processing data downloaded from the ENA, SRA, or DDBJ (`--shortread_clipmerge_skipadaptertrim`). Both tools support length filtering of reads and can be tuned with `--shortread_clipmerge_minlength`. Performing length filtering can be useful to remove short (often low sequencing complexity) sequences that result in unspecific classification and therefore slow down runtime during profiling, with minimal gain. There is currently one option for long-read Oxford Nanopore processing: `porechop`. @@ -174,7 +174,7 @@ For both short-read and long-read preprocessing, you can optionally save the res Complexity filtering can be activated via the `--perform_shortread_complexityfilter` flag. -Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informatic taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources. +Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources. There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/). @@ -200,7 +200,7 @@ For samples that may have been sequenced over multiple runs, or for FASTQ files For more information how to set up your input samplesheet, see [Multiple runs of the same sample](#multiple-runs-of-the-same-sample). -Activating this functionality will concatenate togther the FASTQ files with the same sample name _after_ the optional preprocessing steps and prior profiling. Note that libraries with runs of different pairment types will **not** the different types merged together, and output files will indicate with a `_se` or `_pe` suffix to the sample name accordingly. 
+Activating this functionality will concatenate the FASTQ files with the same sample name _after_ the optional preprocessing steps and _before_ profiling. Note that libraries with runs of different pairing types will **not** be merged and this will be indicated on output files with a `_se` or `_pe` suffix to the sample name accordingly. You can optionally save the FASTQ output of the run merging with the `--save_runmerged_reads`. From 60d232d62ce42f77df45ceaf71a04751960cbf7d Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 20 Apr 2022 09:23:38 +0200 Subject: [PATCH 173/214] Revert "Update subworkflows/local/profiling.nf" This reverts commit b7d0557ee08ff0aac70fc84f4c36bb6c881ddcc9. --- subworkflows/local/profiling.nf | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 1968741..659430f 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -76,9 +76,7 @@ workflow PROFILING { ch_input_for_centrifuge = ch_input_for_profiling.centrifuge .filter{ - if (it[0].is_fasta) { - WorkflowTaxprofiler.logMessage("Centrifuge currently does not accept FASTA files as input. Skipping Centrifuge for sample ${it[0].id}.", log) - } + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge currently does not accept FASTA files as input. Skipping Centrifuge for sample " + it[0].id !it[0].is_fasta } .multiMap { From 9fd94b13892018963ed387e8f425e295af5e4d61 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 20 Apr 2022 09:25:54 +0200 Subject: [PATCH 174/214] Switch to string interpolation --- subworkflows/local/profiling.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 659430f..87bea1a 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -76,18 +76,18 @@ workflow PROFILING { ch_input_for_centrifuge = ch_input_for_profiling.centrifuge .filter{ - if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge currently does not accept FASTA files as input. Skipping Centrifuge for sample " + it[0].id - !it[0].is_fasta + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge currently does not accept FASTA files as input. Skipping Centrifuge for sample ${it[0].id}." + !it[0].is_fasta } .multiMap { it -> - reads: [ it[0] + it[2], it[1] ] - db: it[3] + reads: [ it[0] + it[2], it[1] ] + db: it[3] } ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 .filter{ - if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 currently does not accept FASTA files as input. Skipping MetaPhlAn3 for sample " + it[0].id + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 currently does not accept FASTA files as input. Skipping MetaPhlAn3 for sample ${it[0].id}." !it[0].is_fasta } .multiMap { From d031322c16aca1c2a6dcd4103c17a097db3f136f Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 20 Apr 2022 13:41:01 +0200 Subject: [PATCH 175/214] Update test.config --- conf/test.config | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/conf/test.config b/conf/test.config index 107beb5..0bd09ab 100644 --- a/conf/test.config +++ b/conf/test.config @@ -34,4 +34,10 @@ params { run_malt = true run_metaphlan3 = true run_centrifuge = true + } + +process { + withName: MALT_RUN { + maxForks = 1 + } } From d850333e11db1d1fb4807b15b94c9e8332a71cac Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Wed, 20 Apr 2022 13:42:26 +0200 Subject: [PATCH 176/214] Linting --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index 0bd09ab..ecf55bd 100644 --- a/conf/test.config +++ b/conf/test.config @@ -34,7 +34,7 @@ params { run_malt = true run_metaphlan3 = true run_centrifuge = true - } +} process { withName: MALT_RUN { From c00ca1c2b80a9032f66dd068c2f51cb42090a55d Mon Sep 17 00:00:00 2001 From: sofstam Date: Fri, 22 Apr 2022 15:24:10 +0200 Subject: [PATCH 177/214] Add centrifuge kreport in taxprofiler --- conf/modules.config | 10 +++++ modules.json | 11 +++-- .../modules/centrifuge/kreport/main.nf | 33 +++++++++++++++ .../modules/centrifuge/kreport/meta.yml | 41 +++++++++++++++++++ nextflow.config | 1 + nextflow_schema.json | 27 ++++++++++-- subworkflows/local/profiling.nf | 4 +- 7 files changed, 118 insertions(+), 9 deletions(-) create mode 100644 modules/nf-core/modules/centrifuge/kreport/main.nf create mode 100644 modules/nf-core/modules/centrifuge/kreport/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 5b9a815..19cfef7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -254,6 +254,16 @@ process { ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } + withName: CENTRIFUGE_KREPORT { + ext.args = { "${meta.db_params}" } + ext.prefix = { "${meta.id}" } + publishDir = [ + path: { "${params.outdir}/centrifuge/${meta.db_name}" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/modules.json b/modules.json index d6be2da..6fc4f51 100644 --- a/modules.json +++ b/modules.json @@ -21,6 +21,9 @@ "centrifuge/centrifuge": { "git_sha": "d2726fcf75063960f06b36d2229a4c0966614108" }, + "centrifuge/kreport": { + "git_sha": "be4ae28c3c95b3c4047a7d9fb4cb0ed749631cea" + }, "custom/dumpsoftwareversions": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, @@ -30,6 +33,9 @@ "fastqc": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, + "kaiju/kaiju": { + "git_sha": "8856f127c58f6af479128be8b8df4d42e442ddbe" + }, "kraken2/kraken2": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, @@ -53,10 +59,7 @@ }, "untar": { "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" - }, - "kaiju/kaiju": { - "git_sha": "8856f127c58f6af479128be8b8df4d42e442ddbe" } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/centrifuge/kreport/main.nf b/modules/nf-core/modules/centrifuge/kreport/main.nf new file mode 100644 index 0000000..124cbdb --- /dev/null +++ b/modules/nf-core/modules/centrifuge/kreport/main.nf @@ -0,0 +1,33 @@ +process CENTRIFUGE_KREPORT { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6': + 'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }" + + input: + tuple val(meta), path(results) + path db + + output: + tuple val(meta), path('*.txt') , emit: kreport + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + db_name=`find -L ${db} -name "*.1.cf" -not -name "._*" | sed 's/.1.cf//'` + centrifuge-kreport -x \$db_name ${results} > ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/centrifuge/kreport/meta.yml b/modules/nf-core/modules/centrifuge/kreport/meta.yml new file mode 100644 index 0000000..fbcae24 --- /dev/null +++ b/modules/nf-core/modules/centrifuge/kreport/meta.yml @@ -0,0 +1,41 @@ +name: "centrifuge_kreport" +description: Creates Kraken-style reports from centrifuge out files +keywords: + - metagenomics +tools: + - centrifuge: + description: Centrifuge is a classifier for metagenomic sequences. + homepage: https://ccb.jhu.edu/software/centrifuge/ + documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml + doi: 10.1101/gr.210641.116 + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - results: + type: file + description: File containing the centrifuge classification results + pattern: "*.{txt}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - kreport: + type: file + description: | + File containing kraken-style report from centrifuge + out files. + pattern: "*.{txt}" +authors: + - "@sofstam" + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 43aaae8..61b4905 100644 --- a/nextflow.config +++ b/nextflow.config @@ -98,6 +98,7 @@ params { // centrifuge run_centrifuge = false + centrifuge_kreport = false centrifuge_save_unaligned = false centrifuge_save_aligned = false centrifuge_sam_format = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 2181cce..f5822db 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -294,7 +304,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -335,7 +348,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -384,6 +400,9 @@ }, "malt_generatemegansummary": { "type": "boolean" + }, + "centrifuge_kreport": { + "type": "boolean" } } } diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 8d19bfe..1369f4c 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -6,6 +6,7 @@ include { MALT_RUN } from '../../modules/nf-core/modules/malt include { MEGAN_RMA2INFO } from '../../modules/nf-core/modules/megan/rma2info/main' include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/modules/kraken2/kraken2/main' include { CENTRIFUGE_CENTRIFUGE } from '../../modules/nf-core/modules/centrifuge/centrifuge/main' +include { CENTRIFUGE_KREPORT } from '../../modules/nf-core/modules/centrifuge/kreport/main' include { METAPHLAN3 } from '../../modules/nf-core/modules/metaphlan3/main' include { KAIJU_KAIJU } from '../../modules/nf-core/modules/kaiju/kaiju/main' @@ -142,8 +143,9 @@ workflow PROFILING { if ( params.run_centrifuge ) { CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format ) + CENTRIFUGE_KREPORT (CENTRIFUGE_CENTRIFUGE.out.results, ch_input_for_centrifuge.db) ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) - ch_raw_profiles = ch_raw_profiles.mix( CENTRIFUGE_CENTRIFUGE.out.report ) + ch_raw_profiles = ch_raw_profiles.mix( CENTRIFUGE_KREPORT.out.kreport ) } if ( params.run_metaphlan3 ) { From cda12749181128267b721e1931018e1c67bbf1a6 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 23 Apr 2022 06:47:59 +0200 Subject: [PATCH 178/214] Moves ordering of modules/subworkflow versions mixing to before custom_dumpsoftwareversions mix, otherwise not used in collectFile --- workflows/taxprofiler.nf | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 3ae1351..9981ca4 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -111,8 +111,8 @@ workflow TAXPROFILER { SUBWORKFLOW: PERFORM PREPROCESSING */ if ( params.perform_shortread_clipmerge ) { - ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads + ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) } else { ch_shortreads_preprocessed = INPUT_CHECK.out.fastq } @@ -120,7 +120,7 @@ workflow TAXPROFILER { if ( params.perform_longread_clip ) { ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads .map { it -> [ it[0], [it[1]] ] } - ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first()) + ch_versions = ch_versions.mix( 
LONGREAD_PREPROCESSING.out.versions ) } else { ch_longreads_preprocessed = INPUT_CHECK.out.nanopore } @@ -131,6 +131,7 @@ workflow TAXPROFILER { if ( params.perform_shortread_complexityfilter ) { ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads + ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } else { ch_shortreads_filtered = ch_shortreads_preprocessed } @@ -141,7 +142,7 @@ workflow TAXPROFILER { if ( params.perform_shortread_hostremoval ) { ch_shortreads_hostremoved = SHORTREAD_HOSTREMOVAL ( ch_shortreads_filtered, ch_reference, ch_reference_index ).reads - ch_versions = ch_versions.mix(SHORTREAD_HOSTREMOVAL.out.versions.first()) + ch_versions = ch_versions.mix(SHORTREAD_HOSTREMOVAL.out.versions) } else { ch_shortreads_hostremoved = ch_shortreads_filtered } @@ -177,6 +178,8 @@ workflow TAXPROFILER { } .mix( INPUT_CHECK.out.fasta ) + ch_versions = ch_versions.mix(CAT_FASTQ.out.versions) + } else { ch_reads_runmerged = ch_shortreads_hostremoved .mix( ch_longreads_preprocessed, INPUT_CHECK.out.fasta ) @@ -193,8 +196,10 @@ workflow TAXPROFILER { MODULE: MultiQC */ + ch_versions = ch_versions.mix(MULTIQC.out.versions) + CUSTOM_DUMPSOFTWAREVERSIONS ( - ch_versions.unique().collectFile(name: 'collated_versions.yml') + ch_versions.dump(tag: "software_versions_beforeuniqe").unique().dump(tag: "software_versions").collectFile(name: 'collated_versions.yml') ) @@ -210,26 +215,18 @@ workflow TAXPROFILER { if (params.perform_shortread_clipmerge) { ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) } if (params.perform_longread_clip) { ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions ) } if (params.perform_shortread_complexityfilter){ ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } if (params.perform_shortread_hostremoval) { ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_HOSTREMOVAL.out.mqc.collect{it[1]}.ifEmpty([])) - ch_versions = ch_versions.mix(SHORTREAD_HOSTREMOVAL.out.versions) - } - - if (params.perform_runmerging){ - ch_versions = ch_versions.mix(CAT_FASTQ.out.versions) } ch_multiqc_files = ch_multiqc_files.mix( PROFILING.out.mqc ) @@ -239,7 +236,6 @@ workflow TAXPROFILER { ch_multiqc_files.collect() ) multiqc_report = MULTIQC.out.report.toList() - ch_versions = ch_versions.mix(MULTIQC.out.versions) } /* From 06b38f3699edd49d1d4a25e02196e012d9221dba Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 23 Apr 2022 08:36:01 +0200 Subject: [PATCH 179/214] Fix crash, add missing versions --- subworkflows/local/db_check.nf | 2 +- subworkflows/local/profiling.nf | 2 +- workflows/taxprofiler.nf | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf index a718fa9..f3464d5 100644 --- a/subworkflows/local/db_check.nf +++ b/subworkflows/local/db_check.nf @@ -34,7 +34,7 @@ workflow DB_CHECK { emit: dbs = ch_final_dbs // channel: [ val(meta), [ db ] ] - versions = DATABASE_CHECK.out.versions // channel: [ versions.yml ] + versions = DATABASE_CHECK.out.versions.mix(UNTAR.out.versions.first()) // channel: [ versions.yml ] } def create_db_channels(LinkedHashMap row) { 
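Why the ordering matters: every `ch_versions = ch_versions.mix( ... )` assignment creates a new channel object, so only versions mixed in before `CUSTOM_DUMPSOFTWAREVERSIONS` is wired up can reach the collated file. A minimal sketch, with `SOME_STEP` and `LATE_STEP` standing in as hypothetical placeholders for the subworkflows:

    ch_versions = Channel.empty()
    ch_versions = ch_versions.mix( SOME_STEP.out.versions )    // included in the report
    CUSTOM_DUMPSOFTWAREVERSIONS (
        ch_versions.unique().collectFile(name: 'collated_versions.yml')
    )
    ch_versions = ch_versions.mix( LATE_STEP.out.versions )    // never reaches the file

Anything mixed in after the dump call, as in the last line, is silently left out of `collated_versions.yml`.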
diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 8d19bfe..a5afaaf 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -129,7 +129,7 @@ workflow PROFILING { MEGAN_RMA2INFO (ch_maltrun_for_megan, params.malt_generatemegansummary ) ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) + ch_versions = ch_versions.mix( MALT_RUN.out.versions.first(), MEGAN_RMA2INFO.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( MEGAN_RMA2INFO.out.txt ) } diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 9981ca4..a0046b2 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -196,10 +196,8 @@ workflow TAXPROFILER { MODULE: MultiQC */ - ch_versions = ch_versions.mix(MULTIQC.out.versions) - CUSTOM_DUMPSOFTWAREVERSIONS ( - ch_versions.dump(tag: "software_versions_beforeuniqe").unique().dump(tag: "software_versions").collectFile(name: 'collated_versions.yml') + ch_versions.unique().collectFile(name: 'collated_versions.yml') ) @@ -236,6 +234,7 @@ workflow TAXPROFILER { ch_multiqc_files.collect() ) multiqc_report = MULTIQC.out.report.toList() + ch_versions = ch_versions.mix(MULTIQC.out.versions) } /* From e7e24c2d95c43d28c024817b0ebf67b2a9febf75 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Mon, 25 Apr 2022 09:21:29 +0200 Subject: [PATCH 180/214] Prettier and suggestions from review --- conf/modules.config | 2 +- nextflow.config | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 19cfef7..5c5de54 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -256,7 +256,7 @@ process { withName: CENTRIFUGE_KREPORT { ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ path: { "${params.outdir}/centrifuge/${meta.db_name}" }, mode: params.publish_dir_mode, diff --git a/nextflow.config b/nextflow.config index 61b4905..43aaae8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -98,7 +98,6 @@ params { // centrifuge run_centrifuge = false - centrifuge_kreport = false centrifuge_save_unaligned = false centrifuge_save_aligned = false centrifuge_sam_format = false From ba2479b95bf0cb07423dfff9c1a75db09ac3abff Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Mon, 25 Apr 2022 09:26:49 +0200 Subject: [PATCH 181/214] Prettier --- modules.json | 2 +- nextflow_schema.json | 24 ++++-------------------- 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/modules.json b/modules.json index 6fc4f51..e9c1310 100644 --- a/modules.json +++ b/modules.json @@ -62,4 +62,4 @@ } } } -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index f5822db..fe66308 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. 
This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -304,10 +294,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -348,10 +335,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", From d8a5c793e23315557ae0772a1a82db895ac5d536 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Mon, 25 Apr 2022 09:29:43 +0200 Subject: [PATCH 182/214] Remove parameter from nextflow_schema --- nextflow_schema.json | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index fe66308..f53e036 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -294,7 +304,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -335,7 +348,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -384,9 +400,6 @@ }, "malt_generatemegansummary": { "type": "boolean" - }, - "centrifuge_kreport": { - "type": "boolean" } } -} +} \ No newline at end of file From e3a613de00416b470ca69e6d36fd6d46c5d1c491 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Mon, 25 Apr 2022 09:32:25 +0200 Subject: [PATCH 183/214] Prettier --- nextflow_schema.json | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index f53e036..2181cce 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -304,10 +294,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -348,10 +335,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -402,4 +386,4 @@ "type": "boolean" } } -} \ No newline at end of file +} From d082d67bcb6e55dfcd46996b97a947819e86b3c7 Mon Sep 17 00:00:00 2001 From: sofstam Date: Mon, 25 Apr 2022 17:36:29 +0200 Subject: [PATCH 184/214] Add kaiju2table to taxprofiler --- conf/modules.config | 10 ++++ modules.json | 5 +- .../nf-core/modules/kaiju/kaiju2table/main.nf | 40 +++++++++++++++ .../modules/kaiju/kaiju2table/meta.yml | 50 +++++++++++++++++++ nextflow.config | 1 + nextflow_schema.json | 30 +++++++++-- subworkflows/local/profiling.nf | 6 ++- 7 files changed, 135 insertions(+), 7 deletions(-) create mode 100644 modules/nf-core/modules/kaiju/kaiju2table/main.nf create mode 100644 modules/nf-core/modules/kaiju/kaiju2table/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 5c5de54..a72561d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -289,4 +289,14 @@ process { ext.args = { "${meta.db_params}" } ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } + + withName: KAIJU_KAIJU2TABLE { + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/kaiju/${meta.db_name}" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } } diff --git a/modules.json b/modules.json index e9c1310..9beb010 100644 --- a/modules.json +++ b/modules.json @@ -36,6 +36,9 @@ "kaiju/kaiju": { "git_sha": "8856f127c58f6af479128be8b8df4d42e442ddbe" }, + "kaiju/kaiju2table": { + "git_sha": "538dbac98ba9c8f799536cd5a617195501439457" + }, "kraken2/kraken2": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, @@ -62,4 +65,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/kaiju/kaiju2table/main.nf b/modules/nf-core/modules/kaiju/kaiju2table/main.nf new file mode 100644 index 0000000..00739d1 --- /dev/null +++ b/modules/nf-core/modules/kaiju/kaiju2table/main.nf @@ -0,0 +1,40 @@ +process KAIJU_KAIJU2TABLE { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::kaiju=1.8.2" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/kaiju:1.8.2--h5b5514e_1': + 'quay.io/biocontainers/kaiju:1.8.2--h2e03b76_0' }" + + input: + tuple val(meta), path(results) + path db + val taxon_rank + + output: + tuple val(meta), path('*.txt'), emit: summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + dbnodes=`find -L ${db} -name "*nodes.dmp"` + dbname=`find -L ${db} -name "*.fmi" -not -name "._*"` + kaiju2table $args \\ + -t \$dbnodes \\ + -n \$dbname \\ + -r ${taxon_rank} \\ + -o ${prefix}.txt \\ + ${results} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/kaiju/kaiju2table/meta.yml b/modules/nf-core/modules/kaiju/kaiju2table/meta.yml new file mode 100644 index 0000000..bc3e85d --- /dev/null +++ b/modules/nf-core/modules/kaiju/kaiju2table/meta.yml @@ -0,0 +1,50 @@ +name: "kaiju_kaiju2table" +description: write your description here +keywords: + - classify + - metagenomics +tools: + - kaiju: + description: Fast and sensitive taxonomic classification for metagenomics + homepage: https://kaiju.binf.ku.dk/ + documentation: https://github.com/bioinformatics-centre/kaiju/blob/master/README.md + tool_dev_url: https://github.com/bioinformatics-centre/kaiju + doi: "10.1038/ncomms11257" + licence: ["GNU GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - results: + type: file + description: File containing the kaiju classification results + pattern: "*.{txt}" + - taxon_rank: + type: string + description: | + Taxonomic rank to display in report + pattern: "phylum|class|order|family|genus|species" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - results: + type: file + description: | + Summary table for a given taxonomic rank + pattern: "*.{tsv}" + +authors: + - "@sofstam" + - "@talnor" + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 43aaae8..909da25 100644 --- a/nextflow.config +++ b/nextflow.config @@ -107,6 +107,7 @@ params { // kaiju run_kaiju = false + kaiju_taxon_name = 'species' } // Load base.config by default for all pipelines diff --git a/nextflow_schema.json b/nextflow_schema.json index 2181cce..9516b6a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -294,7 +304,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -335,7 +348,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -384,6 +400,10 @@ }, "malt_generatemegansummary": { "type": "boolean" + }, + "kaiju_taxon_name": { + "type": "string", + "default": "species" } } -} +} \ No newline at end of file diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index fcb2678..1f1d4da 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -9,6 +9,7 @@ include { CENTRIFUGE_CENTRIFUGE } from '../../modules/nf-core/modules/cent include { CENTRIFUGE_KREPORT } from '../../modules/nf-core/modules/centrifuge/kreport/main' include { METAPHLAN3 } from '../../modules/nf-core/modules/metaphlan3/main' include { KAIJU_KAIJU } from '../../modules/nf-core/modules/kaiju/kaiju/main' +include { KAIJU_KAIJU2TABLE } from '../../modules/nf-core/modules/kaiju/kaiju2table/main' workflow PROFILING { take: @@ -155,8 +156,11 @@ workflow PROFILING { } if ( params.run_kaiju ) { - KAIJU_KAIJU ( ch_input_for_kaiju.reads, ch_input_for_kaiju.db ) + KAIJU_KAIJU ( ch_input_for_kaiju.reads, ch_input_for_kaiju.db) + KAIJU_KAIJU2TABLE (KAIJU_KAIJU.out.results, ch_input_for_kaiju.db, params.kaiju_taxon_name) + ch_multiqc_files = ch_multiqc_files.mix( KAIJU_KAIJU2TABLE.out.summary.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( KAIJU_KAIJU.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( KAIJU_KAIJU2TABLE.out.summary ) } emit: From f4480feb9bb5f76a7e12f84682d512d2932022d2 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Mon, 25 Apr 2022 17:45:26 +0200 Subject: [PATCH 185/214] Prettier --- modules.json | 2 +- nextflow_schema.json | 26 +++++--------------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/modules.json b/modules.json index 9beb010..ffcde5d 100644 --- a/modules.json +++ b/modules.json @@ -65,4 +65,4 @@ } } } -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 9516b6a..93450d2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -304,10 +294,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -348,10 +335,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -406,4 +390,4 @@ "default": "species" } } -} \ No newline at end of file +} From 7afda265c5c99149739a62dfc0449aad20582184 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Tue, 26 Apr 2022 11:27:18 +0200 Subject: [PATCH 186/214] Enumerating taxon --- nextflow_schema.json | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 93450d2..2fda6e1 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -294,7 +304,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -335,7 +348,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -387,7 +403,15 @@ }, "kaiju_taxon_name": { "type": "string", - "default": "species" + "default": "species", + "enum": [ + "phylum", + "class", + "order", + "family", + "genus", + "species" + ] } } -} +} \ No newline at end of file From 194b89e66a32862d234ef39cc36aad46c97a2da7 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Tue, 26 Apr 2022 12:53:39 +0200 Subject: [PATCH 187/214] Prettier --- nextflow_schema.json | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 2fda6e1..83793e8 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -304,10 +294,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -348,10 +335,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -404,14 +388,7 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": [ - "phylum", - "class", - "order", - "family", - "genus", - "species" - ] + "enum": ["phylum", "class", "order", "family", "genus", "species"] } } -} \ No newline at end of file +} From 8126d16dee7e60f80d9cfb159db9199435bccd03 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Apr 2022 21:59:42 +0200 Subject: [PATCH 188/214] Add draft version of DIAMOND --- CITATIONS.md | 4 ++ conf/modules.config | 50 ++++++++++------- conf/test.config | 5 ++ docs/usage.md | 3 ++ modules.json | 5 +- .../nf-core/modules/diamond/blastx/main.nf | 53 +++++++++++++++++++ .../nf-core/modules/diamond/blastx/meta.yml | 52 ++++++++++++++++++ nextflow.config | 4 ++ nextflow_schema.json | 51 +++++++++++++++--- subworkflows/local/profiling.nf | 16 ++++++ 10 files changed, 216 insertions(+), 27 deletions(-) create mode 100644 modules/nf-core/modules/diamond/blastx/main.nf create mode 100644 modules/nf-core/modules/diamond/blastx/meta.yml diff --git a/CITATIONS.md b/CITATIONS.md index 02621d9..fd8c52a 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -52,6 +52,10 @@ > Kim, Daehwan, Li Song, Florian P. Breitwieser, and Steven L. Salzberg. 2016. “Centrifuge: Rapid and Sensitive Classification of Metagenomic Sequences.” Genome Research 26 (12): 1721-29. doi: 10.1101/gr.210641.116. +- [DIAMOND](https://doi.org/10.1038/nmeth.3176) + +> Buchfink, Benjamin, Chao Xie, and Daniel H. Huson. 2015. “Fast and Sensitive Protein Alignment Using DIAMOND.” Nature Methods 12 (1): 59-60. doi: 10.1038/nmeth.3176. + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/conf/modules.config b/conf/modules.config index a72561d..9b081b5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -264,6 +264,36 @@ process { ] } + withName: KAIJU_KAIJU { + publishDir = [ + path: { "${params.outdir}/kaiju/${meta.db_name}" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ] + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + } + + withName: KAIJU_KAIJU2TABLE { + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/kaiju/${meta.db_name}" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } + + withName: DIAMOND_BLASTX { + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? 
{ "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/diamond/${meta.db_name}" }, + mode: params.publish_dir_mode, + pattern: '*.{blast,xml,txt,daa,sam,tsv,paf}' + ] + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -279,24 +309,4 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - - withName: KAIJU_KAIJU { - publishDir = [ - path: { "${params.outdir}/kaiju/${meta.db_name}" }, - mode: params.publish_dir_mode, - pattern: '*.tsv' - ] - ext.args = { "${meta.db_params}" } - ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } - } - - withName: KAIJU_KAIJU2TABLE { - ext.args = { "${meta.db_params}" } - ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } - publishDir = [ - path: { "${params.outdir}/kaiju/${meta.db_name}" }, - mode: params.publish_dir_mode, - pattern: '*.{txt}' - ] - } } diff --git a/conf/test.config b/conf/test.config index ecf55bd..35d3539 100644 --- a/conf/test.config +++ b/conf/test.config @@ -34,6 +34,11 @@ params { run_malt = true run_metaphlan3 = true run_centrifuge = true + run_diamond = true + // TODO: setting to txt here as does not require taxonomy in database. + // Should consider re-building our test database but with the required + // taxonomy files, but this may make large files (prot2access: 9GB) + diamond_output_format = 'txt' } process { diff --git a/docs/usage.md b/docs/usage.md index 5d3268b..002f4f2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -128,6 +128,9 @@ Expected (uncompressed) database files for each tool are as follows: - `kaiju_db_*.fmi` - `nodes.dmp` - `names.dmp` +- **DIAMOND** output of `diamond makedb`. Note: requires building with taxonomy files + to generate taxonomic profile. See [DIAMOND documentation](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#makedb-options). A file named: + - `.dmnd` ## Running the pipeline diff --git a/modules.json b/modules.json index ffcde5d..7b659c1 100644 --- a/modules.json +++ b/modules.json @@ -27,6 +27,9 @@ "custom/dumpsoftwareversions": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, + "diamond/blastx": { + "git_sha": "42564565b934eeb2449e35ec97ed13ff2a67f1de" + }, "fastp": { "git_sha": "d0a1cbb703a130c19f6796c3fce24fbe7dfce789" }, @@ -65,4 +68,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/diamond/blastx/main.nf b/modules/nf-core/modules/diamond/blastx/main.nf new file mode 100644 index 0000000..6703c1e --- /dev/null +++ b/modules/nf-core/modules/diamond/blastx/main.nf @@ -0,0 +1,53 @@ +process DIAMOND_BLASTX { + tag "$meta.id" + label 'process_medium' + + // Dimaond is limited to v2.0.9 because there is not a + // singularity version higher than this at the current time. + conda (params.enable_conda ? "bioconda::diamond=2.0.9" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/diamond:2.0.9--hdcc8f71_0' :
+        'quay.io/biocontainers/diamond:2.0.9--hdcc8f71_0' }"
+
+    input:
+    tuple val(meta), path(fasta)
+    path db
+    val outext
+
+    output:
+    tuple val(meta), path('*.{blast,xml,txt,daa,sam,tsv,paf}'), emit: output
+    path "versions.yml"                                       , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    switch ( outext ) {
+        case "blast": outfmt = 0; break
+        case "xml": outfmt = 5; break
+        case "txt": outfmt = 6; break
+        case "daa": outfmt = 100; break
+        case "sam": outfmt = 101; break
+        case "tsv": outfmt = 102; break
+        case "paf": outfmt = 103; break
+    }
+    """
+    DB=`find -L ./ -name "*.dmnd" | sed 's/.dmnd//'`
+
+    diamond \\
+        blastx \\
+        --threads $task.cpus \\
+        --db \$DB \\
+        --query $fasta \\
+        --outfmt ${outfmt} \\
+        $args \\
+        --out ${prefix}.${outext}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/modules/diamond/blastx/meta.yml b/modules/nf-core/modules/diamond/blastx/meta.yml
new file mode 100644
index 0000000..5ee2d55
--- /dev/null
+++ b/modules/nf-core/modules/diamond/blastx/meta.yml
@@ -0,0 +1,52 @@
+name: diamond_blastx
+description: Queries a DIAMOND database using blastx mode
+keywords:
+  - fasta
+  - diamond
+  - blastx
+  - DNA sequence
+tools:
+  - diamond:
+      description: Accelerated BLAST compatible local sequence aligner
+      homepage: https://github.com/bbuchfink/diamond
+      documentation: https://github.com/bbuchfink/diamond/wiki
+      tool_dev_url: https://github.com/bbuchfink/diamond
+      doi: "10.1038/s41592-021-01101-x"
+      licence: ["GPL v3.0"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fasta:
+      type: file
+      description: Input fasta file containing query sequences
+      pattern: "*.{fa,fasta}"
+  - db:
+      type: directory
+      description: Directory containing the protein blast database
+      pattern: "*"
+  - outext:
+      type: string
+      description: |
+        Specify the type of output file to be generated. `blast` corresponds to
+        BLAST pairwise format. `xml` corresponds to BLAST xml format.
+        `txt` corresponds to BLAST tabular format. `tsv` corresponds to
+        taxonomic classification format.
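+        `daa` corresponds to DIAMOND's binary alignment archive, `sam` to SAM
+        format, and `paf` to the pairwise mapping format used by minimap2.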
+ pattern: "blast|xml|txt|daa|sam|tsv|paf" + +output: + - txt: + type: file + description: File containing blastx hits + pattern: "*.{blastx.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@spficklin" + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 909da25..963a4a5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -108,6 +108,10 @@ params { // kaiju run_kaiju = false kaiju_taxon_name = 'species' + + // diamond + run_diamond = false + diamond_output_format = 'tsv' } // Load base.config by default for all pipelines diff --git a/nextflow_schema.json b/nextflow_schema.json index 83793e8..fc516d6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -294,7 +304,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -335,7 +348,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -388,7 +404,30 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": ["phylum", "class", "order", "family", "genus", "species"] + "enum": [ + "phylum", + "class", + "order", + "family", + "genus", + "species" + ] + }, + "run_diamond": { + "type": "boolean" + }, + "diamond_output_format": { + "type": "string", + "default": "tsv", + "enum": [ + "blast", + "xml", + "txt", + "daa", + "sam", + "tsv", + "paf" + ] } } -} +} \ No newline at end of file diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 1f1d4da..9389e19 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -10,6 +10,8 @@ include { CENTRIFUGE_KREPORT } from '../../modules/nf-core/modules/cent include { METAPHLAN3 } from '../../modules/nf-core/modules/metaphlan3/main' include { KAIJU_KAIJU } from '../../modules/nf-core/modules/kaiju/kaiju/main' include { KAIJU_KAIJU2TABLE } from '../../modules/nf-core/modules/kaiju/kaiju2table/main' +include { DIAMOND_BLASTX } from '../../modules/nf-core/modules/diamond/blastx/main' + workflow PROFILING { take: @@ -41,6 +43,7 @@ workflow PROFILING { metaphlan3: it[2]['tool'] == 'metaphlan3' centrifuge: it[2]['tool'] == 'centrifuge' kaiju: it[2]['tool'] == 'kaiju' + diamond: it[2]['tool'] == 'diamond' unknown: true } @@ -109,6 +112,13 @@ workflow PROFILING { db: it[3] } + ch_input_for_diamond = 
ch_input_for_profiling.diamond
+        .multiMap {
+            it ->
+                reads: [it[0] + it[2], it[1]]
+                db: it[3]
+        }
+
     /*
         RUN PROFILING
     */
@@ -163,6 +173,12 @@ workflow PROFILING {
         ch_raw_profiles    = ch_raw_profiles.mix( KAIJU_KAIJU2TABLE.out.summary )
     }
 
+    if ( params.run_diamond ) {
+        DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, params.diamond_output_format )
+        ch_versions        = ch_versions.mix( DIAMOND_BLASTX.out.versions.first() )
+        ch_raw_profiles    = ch_raw_profiles.mix( DIAMOND_BLASTX.out.output )
+    }
+
     emit:
     profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom
     versions = ch_versions // channel: [ versions.yml ]

From a4a9b161d80914f7f1964b4a86c58229d4d884b3 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Fri, 29 Apr 2022 22:02:44 +0200
Subject: [PATCH 189/214] Linting

---
 conf/modules.config  |  4 ++--
 docs/usage.md        |  2 +-
 modules.json         |  2 +-
 nextflow_schema.json | 45 +++++++-------------------------------------
 4 files changed, 11 insertions(+), 42 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 9b081b5..d8fb382 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -274,7 +274,7 @@ process {
         ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
     }
 
-    withName: KAIJU_KAIJU2TABLE  {
+    withName: KAIJU_KAIJU2TABLE {
         ext.args = { "${meta.db_params}" }
         ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
         publishDir = [
@@ -284,7 +284,7 @@ process {
         ]
     }
 
-    withName: DIAMOND_BLASTX  {
+    withName: DIAMOND_BLASTX {
         ext.args = { "${meta.db_params}" }
         ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
         publishDir = [
diff --git a/docs/usage.md b/docs/usage.md
index 002f4f2..cee2bb6 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -129,7 +129,7 @@ Expected (uncompressed) database files for each tool are as follows:
   - `nodes.dmp`
   - `names.dmp`
 - **DIAMOND** output of `diamond makedb`. Note: requires building with taxonomy files
-  to generate a taxonomic profile. See [DIAMOND documentation](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#makedb-options). A file named: 
+  to generate a taxonomic profile. See [DIAMOND documentation](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#makedb-options). A file named:
   - `<database_name>.dmnd`
 
 ## Running the pipeline
diff --git a/modules.json b/modules.json
index 7b659c1..a65926c 100644
--- a/modules.json
+++ b/modules.json
@@ -68,4 +68,4 @@
         }
     }
 }
-} \ No newline at end of file
+}
diff --git a/nextflow_schema.json b/nextflow_schema.json
index fc516d6..f429d1b 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,10 +10,7 @@
     "type": "object",
    "fa_icon": "fas fa-terminal",
    "description": "Define where the pipeline should find input data and save output data.",
-    "required": [
-        "input",
-        "outdir"
-    ],
+    "required": ["input", "outdir"],
    "properties": {
        "input": {
            "type": "string",
@@ -176,14 +173,7 @@
            "description": "Method used to save pipeline results to output directory.",
            "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -304,10 +294,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -348,10 +335,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -404,14 +388,7 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": [ - "phylum", - "class", - "order", - "family", - "genus", - "species" - ] + "enum": ["phylum", "class", "order", "family", "genus", "species"] }, "run_diamond": { "type": "boolean" @@ -419,15 +396,7 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": [ - "blast", - "xml", - "txt", - "daa", - "sam", - "tsv", - "paf" - ] + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] } } -} \ No newline at end of file +} From 0630fce3b5ddb4db1b1932b2405e11ba9bd321e2 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 30 Apr 2022 08:11:40 +0200 Subject: [PATCH 190/214] Tweak based on official DIAMOND test-data --- conf/test.config | 4 ---- nextflow.config | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/conf/test.config b/conf/test.config index 35d3539..a2464b2 100644 --- a/conf/test.config +++ b/conf/test.config @@ -35,10 +35,6 @@ params { run_metaphlan3 = true run_centrifuge = true run_diamond = true - // TODO: setting to txt here as does not require taxonomy in database. 
-    // Should consider re-building our test database but with the required
-    // taxonomy files, but this may make large files (prot2access: 9GB)
-    diamond_output_format = 'txt'
 }
 
 process {
diff --git a/nextflow.config b/nextflow.config
index 963a4a5..5644786 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -111,7 +111,7 @@ params {
 
     // diamond
     run_diamond = false
-    diamond_output_format = 'tsv'
+    diamond_output_format = 'txt'
 }
 
 // Load base.config by default for all pipelines

From 409470642141460daace239396d7fdc95b0580b5 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Sun, 1 May 2022 07:24:58 +0200
Subject: [PATCH 191/214] Only create profiler input channels when profiler is
 activated

---
 subworkflows/local/profiling.nf | 142 +++++++++++++++++---------------
 1 file changed, 74 insertions(+), 68 deletions(-)

diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index 9389e19..d8e9c84 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -48,7 +48,7 @@ workflow PROFILING {
     }
 
     /*
-        PREPARE PROFILER INPUT CHANNELS
+        PREPARE PROFILER INPUT CHANNELS & RUN PROFILING
     */
 
     // Each tool has a slightly different input structure and generally separate
@@ -56,74 +56,26 @@ workflow PROFILING {
     // for each tool and make liberal use of multiMap to keep reads/databases
     // channel element order in sync with each other
 
-    // MALT: We groupTuple to have all samples in one channel for MALT as database
-    // loading takes a long time, so we only want to run it once per database
-    // TODO document somewhere we only accept illumina short reads for MALT?
-    ch_input_for_malt = ch_input_for_profiling.malt
-        .filter { it[0]['instrument_platform'] == 'ILLUMINA' }
-        .map {
-            it ->
-                def temp_meta = [ id: it[2]['db_name']] + it[2]
-                def db = it[3]
-                [ temp_meta, it[1], db ]
-        }
-        .groupTuple(by: [0,2])
-        .multiMap {
-            it ->
-                reads: [ it[0], it[1].flatten() ]
-                db: it[2]
-        }
-
-    // All subsequent tools can easily run on a per-sample basis
-
-    ch_input_for_kraken2 = ch_input_for_profiling.kraken2
-        .multiMap {
-            it ->
-                reads: [ it[0] + it[2], it[1] ]
-                db: it[3]
-        }
-
-    ch_input_for_centrifuge = ch_input_for_profiling.centrifuge
-        .filter{
-            if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge currently does not accept FASTA files as input. Skipping Centrifuge for sample ${it[0].id}."
-            !it[0].is_fasta
-        }
-        .multiMap {
-            it ->
-                reads: [ it[0] + it[2], it[1] ]
-                db: it[3]
-        }
-
-    ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3
-        .filter{
-            if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 currently does not accept FASTA files as input. Skipping MetaPhlAn3 for sample ${it[0].id}."
-            !it[0].is_fasta
-        }
-        .multiMap {
-            it ->
-                reads: [it[0] + it[2], it[1]]
-                db: it[3]
-        }
-
-    ch_input_for_kaiju = ch_input_for_profiling.kaiju
-        .multiMap {
-            it ->
-                reads: [it[0] + it[2], it[1]]
-                db: it[3]
-        }
-
-    ch_input_for_diamond = ch_input_for_profiling.diamond
-        .multiMap {
-            it ->
-                reads: [it[0] + it[2], it[1]]
-                db: it[3]
-        }
-
-    /*
-        RUN PROFILING
-    */
-
     if ( params.run_malt ) {
+
+
+        // MALT: We groupTuple to have all samples in one channel for MALT as database
+        // loading takes a long time, so we only want to run it once per database
+        // TODO document somewhere we only accept illumina short reads for MALT?
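+        // e.g. two Illumina samples run against one shared database collapse to a
+        // single element, with all their reads flattened into one list and the
+        // database emitted once (sample/database names below are illustrative):
+        //   reads: [ [id:'malt_db', ...], [ s1_R1, s1_R2, s2_R1, s2_R2 ] ]
+        //   db:    malt_db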
+ ch_input_for_malt = ch_input_for_profiling.malt + .filter { it[0]['instrument_platform'] == 'ILLUMINA' } + .map { + it -> + def temp_meta = [ id: it[2]['db_name']] + it[2] + def db = it[3] + [ temp_meta, it[1], db ] + } + .groupTuple(by: [0,2]) + .multiMap { + it -> + reads: [ it[0], it[1].flatten() ] + db: it[2] + MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) ch_maltrun_for_megan = MALT_RUN.out.rma6 @@ -143,40 +95,94 @@ workflow PROFILING { ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( MALT_RUN.out.versions.first(), MEGAN_RMA2INFO.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( MEGAN_RMA2INFO.out.txt ) + } if ( params.run_kraken2 ) { + + ch_input_for_kraken2 = ch_input_for_profiling.kraken2 + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.txt ) + } if ( params.run_centrifuge ) { + + ch_input_for_centrifuge = ch_input_for_profiling.centrifuge + .filter{ + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge currently does not accept FASTA files as input. Skipping Centrifuge for sample ${it[0].id}." + !it[0].is_fasta + } + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format ) CENTRIFUGE_KREPORT (CENTRIFUGE_CENTRIFUGE.out.results, ch_input_for_centrifuge.db) ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( CENTRIFUGE_KREPORT.out.kreport ) + } if ( params.run_metaphlan3 ) { + + ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 + .filter{ + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 currently does not accept FASTA files as input. Skipping MetaPhlAn3 for sample ${it[0].id}." 
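+                // NB: the closure's final expression below is the boolean that
+                // filter() evaluates; the log.warn above is only a side effect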
+ !it[0].is_fasta + } + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN3.out.biom ) + } if ( params.run_kaiju ) { + + ch_input_for_kaiju = ch_input_for_profiling.kaiju + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + KAIJU_KAIJU ( ch_input_for_kaiju.reads, ch_input_for_kaiju.db) KAIJU_KAIJU2TABLE (KAIJU_KAIJU.out.results, ch_input_for_kaiju.db, params.kaiju_taxon_name) ch_multiqc_files = ch_multiqc_files.mix( KAIJU_KAIJU2TABLE.out.summary.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( KAIJU_KAIJU.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( KAIJU_KAIJU2TABLE.out.summary ) + } if ( params.run_diamond ) { + + ch_input_for_diamond = ch_input_for_profiling.diamond + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, params.diamond_output_format ) ch_versions = ch_versions.mix( DIAMOND_BLASTX.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( DIAMOND_BLASTX.out.output ) + } emit: From d5049a34e49c2a093039b83b183e48a81c6e9d60 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 1 May 2022 07:28:29 +0200 Subject: [PATCH 192/214] Add missing close bracket --- subworkflows/local/profiling.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index d8e9c84..7fb3ce9 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -75,6 +75,7 @@ workflow PROFILING { it -> reads: [ it[0], it[1].flatten() ] db: it[2] + } MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) From a821f748ee89ade6e229edab28fa7f3399969596 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 09:28:46 +0200 Subject: [PATCH 193/214] Add modules --- conf/modules.config | 41 ++++++++++++ modules.json | 14 +++- modules/nf-core/modules/metaphlan3/main.nf | 2 +- .../nf-core/modules/minimap2/align/main.nf | 48 ++++++++++++++ .../nf-core/modules/minimap2/align/meta.yml | 65 +++++++++++++++++++ .../nf-core/modules/minimap2/index/main.nf | 33 ++++++++++ .../nf-core/modules/minimap2/index/meta.yml | 30 +++++++++ .../nf-core/modules/samtools/bam2fq/main.nf | 56 ++++++++++++++++ .../nf-core/modules/samtools/bam2fq/meta.yml | 55 ++++++++++++++++ modules/nf-core/modules/samtools/view/main.nf | 44 +++++++++++++ .../nf-core/modules/samtools/view/meta.yml | 57 ++++++++++++++++ nf-core/modules/samtools/bam2fq/main.nf | 56 ++++++++++++++++ nf-core/modules/samtools/bam2fq/meta.yml | 55 ++++++++++++++++ 13 files changed, 554 insertions(+), 2 deletions(-) create mode 100644 modules/nf-core/modules/minimap2/align/main.nf create mode 100644 modules/nf-core/modules/minimap2/align/meta.yml create mode 100644 modules/nf-core/modules/minimap2/index/main.nf create mode 100644 modules/nf-core/modules/minimap2/index/meta.yml create mode 100644 modules/nf-core/modules/samtools/bam2fq/main.nf create mode 100644 modules/nf-core/modules/samtools/bam2fq/meta.yml create mode 100644 modules/nf-core/modules/samtools/view/main.nf create mode 100644 modules/nf-core/modules/samtools/view/meta.yml create mode 100644 nf-core/modules/samtools/bam2fq/main.nf create mode 100644 nf-core/modules/samtools/bam2fq/meta.yml diff --git a/conf/modules.config 
b/conf/modules.config index d8fb382..b707954 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -164,6 +164,47 @@ process { ] } + withName: MINIMAP2_INDEX { + ext.args = '-x map-ont' + publishDir = [ + path: { "${params.outdir}/minimap2/index" }, + mode: params.publish_dir_mode, + enabled: params.save_minimap2_hostremoval_index, + pattern: 'minimap2' + ] + } + + withName: MINIMAP2_ALIGN { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/minimap2/align" }, + mode: params.publish_dir_mode, + enabled: params.save_minimap2_hostremoval_mapped, + pattern: '*.bam' + ] + } + + withName: SAMTOOLS_VIEW { + ext.args = '-f 4' + ext.prefix = { "${meta.id}.mapped.sorted" } + publishDir = [ + path: { "${params.outdir}/samtools/view" }, + mode: params.publish_dir_mode, + enabled: params.save_samtools_unmapped_bam, + pattern: '*.bam' + ] + } + + withName: SAMTOOLS_BAM2FQ { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/samtools/bam2fq" }, + mode: params.publish_dir_mode, + enabled: params.save_minimap2_unmapped_fq, + pattern: '*.fq.gz' + ] + } + withName: BBMAP_BBDUK { ext.args = [ "entropy=${params.shortread_complexityfilter_entropy}", diff --git a/modules.json b/modules.json index a65926c..071234d 100644 --- a/modules.json +++ b/modules.json @@ -52,6 +52,12 @@ "git_sha": "2d38566eca4cc15142b2ffa7c11837569b39aece" }, "metaphlan3": { + "git_sha": "ed4dd1a928ebf4308efb720de878045f7773f8e2" + }, + "minimap2/align": { + "git_sha": "1a5a9e7b4009dcf34e6867dd1a5a1d9a718b027b" + }, + "minimap2/index": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, "multiqc": { @@ -63,9 +69,15 @@ "prinseqplusplus": { "git_sha": "f1c5384c31e985591716afdd732cf8c2ae29d05b" }, + "samtools/bam2fq": { + "git_sha": "897c33d5da084b61109500ee44c01da2d3e4e773" + }, + "samtools/view": { + "git_sha": "12afb6b0faf3cabf769c9a2a7dd477e3f066eac0" + }, "untar": { "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/metaphlan3/main.nf b/modules/nf-core/modules/metaphlan3/main.nf index 3fc6b27..bff0eb9 100644 --- a/modules/nf-core/modules/metaphlan3/main.nf +++ b/modules/nf-core/modules/metaphlan3/main.nf @@ -23,7 +23,7 @@ process METAPHLAN3 { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def input_type = ("$input".endsWith(".fastq.gz")) ? "--input_type fastq" : ("$input".contains(".fasta")) ? "--input_type fasta" : ("$input".endsWith(".bowtie2out.txt")) ? "--input_type bowtie2out" : "--input_type sam" + def input_type = ("$input".endsWith(".fastq.gz") || "$input".endsWith(".fq.gz")) ? "--input_type fastq" : ("$input".contains(".fasta")) ? "--input_type fasta" : ("$input".endsWith(".bowtie2out.txt")) ? "--input_type bowtie2out" : "--input_type sam" def input_data = ("$input_type".contains("fastq")) && !meta.single_end ? "${input[0]},${input[1]}" : "$input" def bowtie2_out = "$input_type" == "--input_type bowtie2out" || "$input_type" == "--input_type sam" ? '' : "--bowtie2out ${prefix}.bowtie2out.txt" diff --git a/modules/nf-core/modules/minimap2/align/main.nf b/modules/nf-core/modules/minimap2/align/main.nf new file mode 100644 index 0000000..08ac6ee --- /dev/null +++ b/modules/nf-core/modules/minimap2/align/main.nf @@ -0,0 +1,48 @@ +process MINIMAP2_ALIGN { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? 
'bioconda::minimap2=2.21 bioconda::samtools=1.12' : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' :
+        'quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }"
+
+    input:
+    tuple val(meta), path(reads)
+    path reference
+    val bam_format
+    val cigar_paf_format
+    val cigar_bam
+
+    output:
+    tuple val(meta), path("*.paf"), optional: true, emit: paf
+    tuple val(meta), path("*.bam"), optional: true, emit: bam
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def input_reads = meta.single_end ? "$reads" : "${reads[0]} ${reads[1]}"
+    def bam_output = bam_format ? "-a | samtools sort | samtools view -@ ${task.cpus} -b -h -o ${prefix}.bam" : "-o ${prefix}.paf"
+    def cigar_paf = cigar_paf_format && !bam_format ? "-c" : ''
+    def set_cigar_bam = cigar_bam && bam_format ? "-L" : ''
+    """
+    minimap2 \\
+        $args \\
+        -t $task.cpus \\
+        $reference \\
+        $input_reads \\
+        $cigar_paf \\
+        $set_cigar_bam \\
+        $bam_output
+
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        minimap2: \$(minimap2 --version 2>&1)
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/modules/minimap2/align/meta.yml b/modules/nf-core/modules/minimap2/align/meta.yml
new file mode 100644
index 0000000..991b39a
--- /dev/null
+++ b/modules/nf-core/modules/minimap2/align/meta.yml
@@ -0,0 +1,65 @@
+name: minimap2_align
+description: A versatile pairwise aligner for genomic and spliced nucleotide sequences
+keywords:
+  - align
+  - fasta
+  - fastq
+  - genome
+  - paf
+  - reference
+tools:
+  - minimap2:
+      description: |
+        A versatile pairwise aligner for genomic and spliced nucleotide sequences.
+      homepage: https://github.com/lh3/minimap2
+      documentation: https://github.com/lh3/minimap2#uguide
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FASTA or FASTQ files of size 1 and 2 for single-end
+        and paired-end data, respectively.
+  - reference:
+      type: file
+      description: |
+        Reference database in FASTA format.
+  - bam_format:
+      type: boolean
+      description: Specify that output should be in BAM format
+  - cigar_paf_format:
+      type: boolean
+      description: Specify that output CIGAR should be in PAF format
+  - cigar_bam:
+      type: boolean
+      description: |
+        Write CIGAR with >65535 ops at the CG tag. This is recommended when
+        outputting BAM, as the BAM format cannot store records with more than
+        65535 CIGAR operations (https://github.com/lh3/minimap2#working-with-65535-cigar-operations)
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
[ id:'test', single_end:false ] + - paf: + type: file + description: Alignment in PAF format + pattern: "*.paf" + - bam: + type: file + description: Alignment in BAM format + pattern: "*.bam" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" diff --git a/modules/nf-core/modules/minimap2/index/main.nf b/modules/nf-core/modules/minimap2/index/main.nf new file mode 100644 index 0000000..3dfeb86 --- /dev/null +++ b/modules/nf-core/modules/minimap2/index/main.nf @@ -0,0 +1,33 @@ +process MINIMAP2_INDEX { + label 'process_medium' + + conda (params.enable_conda ? 'bioconda::minimap2=2.21' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/minimap2:2.21--h5bf99c6_0' : + 'quay.io/biocontainers/minimap2:2.21--h5bf99c6_0' }" + + input: + path fasta + + output: + path "*.mmi" , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + minimap2 \\ + -t $task.cpus \\ + -d ${fasta.baseName}.mmi \\ + $args \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/minimap2/index/meta.yml b/modules/nf-core/modules/minimap2/index/meta.yml new file mode 100644 index 0000000..3bf9f04 --- /dev/null +++ b/modules/nf-core/modules/minimap2/index/meta.yml @@ -0,0 +1,30 @@ +name: minimap2_index +description: Provides fasta index required by minimap2 alignment. +keywords: + - index + - fasta + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - fasta: + type: file + description: | + Reference database in FASTA format. +output: + - mmi: + type: file + description: Minimap2 fasta index. + pattern: "*.mmi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@yuukiiwa" + - "@drpatelh" diff --git a/modules/nf-core/modules/samtools/bam2fq/main.nf b/modules/nf-core/modules/samtools/bam2fq/main.nf new file mode 100644 index 0000000..554af48 --- /dev/null +++ b/modules/nf-core/modules/samtools/bam2fq/main.nf @@ -0,0 +1,56 @@ +process SAMTOOLS_BAM2FQ { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : + 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + + input: + tuple val(meta), path(inputbam) + val split + + output: + tuple val(meta), path("*.fq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (split){ + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + -1 ${prefix}_1.fq.gz \\ + -2 ${prefix}_2.fq.gz \\ + -0 ${prefix}_other.fq.gz \\ + -s ${prefix}_singleton.fq.gz \\ + $inputbam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } else { + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + $inputbam | gzip > ${prefix}_interleaved.fq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/modules/samtools/bam2fq/meta.yml b/modules/nf-core/modules/samtools/bam2fq/meta.yml new file mode 100644 index 0000000..319a60c --- /dev/null +++ b/modules/nf-core/modules/samtools/bam2fq/meta.yml @@ -0,0 +1,55 @@ +name: samtools_bam2fq +description: | + The module uses bam2fq method from samtools to + convert a SAM, BAM or CRAM file to FASTQ format +keywords: + - bam2fq + - samtools + - fastq +tools: + - samtools: + description: Tools for dealing with SAM, BAM and CRAM files + homepage: None + documentation: http://www.htslib.org/doc/1.1/samtools.html + tool_dev_url: None + doi: "" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - inputbam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - split: + type: boolean + description: | + TRUE/FALSE value to indicate if reads should be separated into + /1, /2 and if present other, or singleton. + Note: choosing TRUE will generate 4 different files. + Choosing FALSE will produce a single file, which will be interleaved in case + the input contains paired reads. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: | + FASTQ files, which will be either a group of 4 files (read_1, read_2, other and singleton) + or a single interleaved .fq.gz file if the user chooses not to split the reads. + pattern: "*.fq.gz" + +authors: + - "@lescai" diff --git a/modules/nf-core/modules/samtools/view/main.nf b/modules/nf-core/modules/samtools/view/main.nf new file mode 100644 index 0000000..11cfb74 --- /dev/null +++ b/modules/nf-core/modules/samtools/view/main.nf @@ -0,0 +1,44 @@ +process SAMTOOLS_VIEW { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' :
+        'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }"
+
+    input:
+    tuple val(meta), path(input), path(index)
+    path fasta
+
+    output:
+    tuple val(meta), path("*.bam") , emit: bam , optional: true
+    tuple val(meta), path("*.cram"), emit: cram, optional: true
+    path "versions.yml"            , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def args2 = task.ext.args2 ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def reference = fasta ? "--reference ${fasta} -C" : ""
+    def file_type = input.getExtension()
+    if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    """
+    samtools \\
+        view \\
+        --threads ${task.cpus-1} \\
+        ${reference} \\
+        $args \\
+        $input \\
+        $args2 \\
+        > ${prefix}.${file_type}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/modules/samtools/view/meta.yml b/modules/nf-core/modules/samtools/view/meta.yml
new file mode 100644
index 0000000..a8b43ec
--- /dev/null
+++ b/modules/nf-core/modules/samtools/view/meta.yml
@@ -0,0 +1,57 @@
+name: samtools_view
+description: filter/convert SAM/BAM/CRAM file
+keywords:
+  - view
+  - bam
+  - sam
+  - cram
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+  - index:
+      type: optional file
+      description: BAM.BAI/CRAM.CRAI file
+      pattern: "*.{.bai,.crai}"
+  - fasta:
+      type: optional file
+      description: Reference file the CRAM was created with
+      pattern: "*.{fasta,fa}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: filtered/converted BAM/SAM file
+      pattern: "*.{bam,sam}"
+  - cram:
+      type: file
+      description: filtered/converted CRAM file
+      pattern: "*.cram"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@drpatelh"
+  - "@joseespinosa"
+  - "@FriederikeHanssen"
diff --git a/nf-core/modules/samtools/bam2fq/main.nf b/nf-core/modules/samtools/bam2fq/main.nf
new file mode 100644
index 0000000..5d6aa79
--- /dev/null
+++ b/nf-core/modules/samtools/bam2fq/main.nf
@@ -0,0 +1,56 @@
+process SAMTOOLS_BAM2FQ {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : + 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + + input: + tuple val(meta), path(inputbam) + val split + + output: + tuple val(meta), path("*.fq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (split){ + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + -1 ${prefix}_1.fq.gz \\ + -2 ${prefix}_2.fq.gz \\ + -0 ${prefix}_other.fq.gz \\ + -s ${prefix}_singleton.fq.gz \\ + $inputbam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } else { + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + $inputbam >${prefix}_interleaved.fq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } +} diff --git a/nf-core/modules/samtools/bam2fq/meta.yml b/nf-core/modules/samtools/bam2fq/meta.yml new file mode 100644 index 0000000..319a60c --- /dev/null +++ b/nf-core/modules/samtools/bam2fq/meta.yml @@ -0,0 +1,55 @@ +name: samtools_bam2fq +description: | + The module uses bam2fq method from samtools to + convert a SAM, BAM or CRAM file to FASTQ format +keywords: + - bam2fq + - samtools + - fastq +tools: + - samtools: + description: Tools for dealing with SAM, BAM and CRAM files + homepage: None + documentation: http://www.htslib.org/doc/1.1/samtools.html + tool_dev_url: None + doi: "" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - inputbam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - split: + type: boolean + description: | + TRUE/FALSE value to indicate if reads should be separated into + /1, /2 and if present other, or singleton. + Note: choosing TRUE will generate 4 different files. + Choosing FALSE will produce a single file, which will be interleaved in case + the input contains paired reads. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: | + FASTQ files, which will be either a group of 4 files (read_1, read_2, other and singleton) + or a single interleaved .fq.gz file if the user chooses not to split the reads. 
+ pattern: "*.fq.gz" + +authors: + - "@lescai" From 5e6be52fab87d6b6a7f14ce254f63b02ad682a63 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 09:34:27 +0200 Subject: [PATCH 194/214] Implement hostremoval --- nextflow.config | 18 ++++++--- subworkflows/local/longread_hostremoval.nf | 47 ++++++++++++++++++++++ workflows/taxprofiler.nf | 17 ++++++-- 3 files changed, 72 insertions(+), 10 deletions(-) create mode 100644 subworkflows/local/longread_hostremoval.nf diff --git a/nextflow.config b/nextflow.config index 5644786..04273f1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -81,12 +81,18 @@ params { save_runmerged_reads = false // Host Removal - perform_shortread_hostremoval = false - shortread_hostremoval_reference = null - shortread_hostremoval_index = null - save_hostremoval_index = false - save_hostremoval_mapped = false - save_hostremoval_unmapped = false + perform_shortread_hostremoval = false + shortread_hostremoval_reference = null + shortread_hostremoval_index = null + longread_hostremoval_index = null + save_hostremoval_index = false + save_hostremoval_mapped = false + save_hostremoval_unmapped = false + save_minimap2_hostremoval_index = false + save_minimap2_hostremoval_mapped = false + save_minimap2_hostremoval_unmapped = false + save_samtools_unmapped_bam = false + save_minimap2_unmapped_fq = false // MALT run_malt = false diff --git a/subworkflows/local/longread_hostremoval.nf b/subworkflows/local/longread_hostremoval.nf new file mode 100644 index 0000000..7db020b --- /dev/null +++ b/subworkflows/local/longread_hostremoval.nf @@ -0,0 +1,47 @@ +// +// Remove host reads via alignment and export off-target reads +// + +include { MINIMAP2_INDEX } from '../../modules/nf-core/modules/minimap2/index/main' +include { MINIMAP2_ALIGN } from '../../modules/nf-core/modules/minimap2/align/main' +include { SAMTOOLS_VIEW } from '../../modules/nf-core/modules/samtools/view/main' +include { SAMTOOLS_BAM2FQ } from '../../modules/nf-core/modules/samtools/bam2fq/main' + +workflow LONGREAD_HOSTREMOVAL { + take: + reads // [ [ meta ], [ reads ] ] + reference // /path/to/fasta + index // /path/to/index + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( !params.longread_hostremoval_index ) { + ch_minimap2_index = MINIMAP2_INDEX ( reference ).index + ch_versions = ch_versions.mix( MINIMAP2_INDEX.out.versions ) + } else { + ch_minimap2_index = index + } + + MINIMAP2_ALIGN ( reads, ch_minimap2_index, true, false, false ) + ch_versions = ch_versions.mix( MINIMAP2_ALIGN.out.versions.first() ) + ch_minimap2_mapped = MINIMAP2_ALIGN.out.bam + .map { + meta, reads -> + [ meta, reads, [] ] + } + + + SAMTOOLS_VIEW ( ch_minimap2_mapped , [] ) + ch_versions = ch_versions.mix( SAMTOOLS_VIEW.out.versions.first() ) + + SAMTOOLS_BAM2FQ ( SAMTOOLS_VIEW.out.bam, false ) + ch_versions = ch_versions.mix( SAMTOOLS_BAM2FQ.out.versions.first() ) + + + emit: + reads = SAMTOOLS_BAM2FQ.out.reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] +} + diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index a0046b2..9eb53a3 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -26,7 +26,8 @@ if (params.perform_shortread_hostremoval && !params.shortread_hostremoval_refere if (!params.shortread_hostremoval_reference && params.shortread_hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no 
--shortread_hostremoval_reference FASTA supplied. Check input." } if (params.shortread_hostremoval_reference ) { ch_reference = file(params.shortread_hostremoval_reference) } -if (params.shortread_hostremoval_index ) { ch_reference_index = file(params.shortread_hostremoval_index ) } else { ch_reference_index = [] } +if (params.shortread_hostremoval_index ) { ch_shortread_reference_index = file(params.shortread_hostremoval_index ) } else { ch_shortread_reference_index = [] } +if (params.longread_hostremoval_index ) { ch_longread_reference_index = file(params.longread_hostremoval_index ) } else { ch_longread_reference_index = [] } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -52,6 +53,7 @@ include { DB_CHECK } from '../subworkflows/local/db_check' include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' include { SHORTREAD_HOSTREMOVAL } from '../subworkflows/local/shortread_hostremoval' +include { LONGREAD_HOSTREMOVAL } from '../subworkflows/local/longread_hostremoval' include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering' include { PROFILING } from '../subworkflows/local/profiling' @@ -141,16 +143,23 @@ workflow TAXPROFILER { */ if ( params.perform_shortread_hostremoval ) { - ch_shortreads_hostremoved = SHORTREAD_HOSTREMOVAL ( ch_shortreads_filtered, ch_reference, ch_reference_index ).reads + ch_shortreads_hostremoved = SHORTREAD_HOSTREMOVAL ( ch_shortreads_filtered, ch_reference, ch_shortread_reference_index ).reads ch_versions = ch_versions.mix(SHORTREAD_HOSTREMOVAL.out.versions) } else { ch_shortreads_hostremoved = ch_shortreads_filtered } + if ( params.perform_longread_hostremoval ) { + ch_longreads_hostremoved = LONGREAD_HOSTREMOVAL ( ch_longreads_preprocessed, ch_reference, ch_longread_reference_index ).reads + ch_versions = ch_versions.mix(LONGREAD_HOSTREMOVAL.out.versions) + } else { + ch_longreads_hostremoved = ch_longreads_preprocessed + } + if ( params.perform_runmerging ) { ch_reads_for_cat_branch = ch_shortreads_hostremoved - .mix( ch_longreads_preprocessed ) + .mix( ch_longreads_hostremoved ) .map { meta, reads -> def meta_new = meta.clone() @@ -182,7 +191,7 @@ workflow TAXPROFILER { } else { ch_reads_runmerged = ch_shortreads_hostremoved - .mix( ch_longreads_preprocessed, INPUT_CHECK.out.fasta ) + .mix( ch_longreads_hostremoved, INPUT_CHECK.out.fasta ) } /* From afdebaf09a41cfc6fd3f3d3455f3f69b1e1dc608 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 10:15:11 +0200 Subject: [PATCH 195/214] Remove own modification to a module --- modules/nf-core/modules/samtools/bam2fq/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/modules/samtools/bam2fq/main.nf b/modules/nf-core/modules/samtools/bam2fq/main.nf index 554af48..0dd1da9 100644 --- a/modules/nf-core/modules/samtools/bam2fq/main.nf +++ b/modules/nf-core/modules/samtools/bam2fq/main.nf @@ -45,7 +45,7 @@ process SAMTOOLS_BAM2FQ { bam2fq \\ $args \\ -@ $task.cpus \\ - $inputbam | gzip > ${prefix}_interleaved.fq.gz + $inputbam > ${prefix}_interleaved.fq.gz cat <<-END_VERSIONS > versions.yml "${task.process}": From fa1c13263573e5cda53fb2468e8c339ec0ad0ecf Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 11:03:25 +0200 Subject: [PATCH 196/214] Update nextflow_schema --- 
 nextflow_schema.json | 67 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 61 insertions(+), 6 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index f429d1b..d3878c1 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,7 +10,10 @@
     "type": "object",
     "fa_icon": "fas fa-terminal",
     "description": "Define where the pipeline should find input data and save output data.",
-    "required": ["input", "outdir"],
+    "required": [
+        "input",
+        "outdir"
+    ],
     "properties": {
         "input": {
             "type": "string",
@@ -173,7 +176,14 @@
             "description": "Method used to save pipeline results to output directory.",
             "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
             "fa_icon": "fas fa-copy",
-            "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+            "enum": [
+                "symlink",
+                "rellink",
+                "link",
+                "copy",
+                "copyNoFollow",
+                "move"
+            ],
             "hidden": true
         },
         "email_on_fail": {
@@ -294,7 +304,10 @@
         "shortread_clipmerge_tool": {
             "type": "string",
             "default": "fastp",
-            "enum": ["fastp", "adapterremoval"]
+            "enum": [
+                "fastp",
+                "adapterremoval"
+            ]
         },
         "shortread_clipmerge_skipadaptertrim": {
             "type": "boolean"
@@ -335,7 +348,10 @@
         "shortread_complexityfilter_prinseqplusplus_mode": {
             "type": "string",
             "default": "entropy",
-            "enum": ["entropy", "dust"]
+            "enum": [
+                "entropy",
+                "dust"
+            ]
         },
         "shortread_complexityfilter_prinseqplusplus_dustscore": {
             "type": "number",
@@ -388,7 +404,14 @@
         "kaiju_taxon_name": {
             "type": "string",
             "default": "species",
-            "enum": ["phylum", "class", "order", "family", "genus", "species"]
+            "enum": [
+                "phylum",
+                "class",
+                "order",
+                "family",
+                "genus",
+                "species"
+            ]
         },
         "run_diamond": {
             "type": "boolean"
@@ -396,7 +419,39 @@
         "diamond_output_format": {
             "type": "string",
             "default": "tsv",
-            "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"]
+            "enum": [
+                "blast",
+                "xml",
+                "txt",
+                "daa",
+                "sam",
+                "tsv",
+                "paf"
+            ]
+        },
+        "longread_hostremoval_index": {
+            "type": "string",
+            "default": null
+        },
+        "save_minimap2_hostremoval_index": {
+            "type": "string",
+            "default": "false",
+            "description": "Flag for publishing minimap2 host removal index"
+        },
+        "save_minimap2_hostremoval_mapped": {
+            "type": "string",
+            "default": "false",
+            "description": "Flag for publishing bam file with all long reads mapped to a reference"
+        },
+        "save_samtools_unmapped_bam": {
+            "type": "string",
+            "default": "false",
+            "description": "Flag for publishing bam for reads that did not map to the host reference"
+        },
+        "save_minimap2_unmapped_fq": {
+            "type": "string",
+            "default": "false",
+            "description": "Flag for publishing fastq files for reads that did not map to the host reference"
+        }
     }
 }

From 4b2a3789cd7ba15ac41f32f6179bcc1665cb8e47 Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Tue, 3 May 2022 11:03:48 +0200
Subject: [PATCH 197/214] Update default values for host removal parameters

---
 nextflow.config | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 04273f1..95daba9 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -81,18 +81,18 @@ params {
 
     // Host Removal
-    perform_shortread_hostremoval          = false
-    shortread_hostremoval_reference        = null
-    
shortread_hostremoval_index = null - longread_hostremoval_index = null - save_hostremoval_index = false - save_hostremoval_mapped = false - save_hostremoval_unmapped = false - save_minimap2_hostremoval_index = false - save_minimap2_hostremoval_mapped = false - save_minimap2_hostremoval_unmapped = false - save_samtools_unmapped_bam = false - save_minimap2_unmapped_fq = false + perform_shortread_hostremoval = false + shortread_hostremoval_reference = null + shortread_hostremoval_index = null + save_hostremoval_index = false + save_hostremoval_mapped = false + save_hostremoval_unmapped = false + longread_hostremoval_index = null + save_minimap2_hostremoval_index = false + save_minimap2_hostremoval_mapped = false + save_samtools_unmapped_bam = false + save_minimap2_unmapped_fq = false + // MALT run_malt = false From 17039ebc6b143870331c4d2f8976a0fe63de0736 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 11:07:23 +0200 Subject: [PATCH 198/214] Fix type to boolean for flags --- nextflow_schema.json | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index d3878c1..257f5cb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -431,27 +431,23 @@ }, "longread_hostremoval_index": { "type": "string", - "default": null + "default": "None" }, "save_minimap2_hostremoval_index": { - "type": "string", - "default": "false", + "type": "boolean", "description": "Flag for publishing minimap2 host removal index" }, "save_minimap2_hostremoval_mapped": { - "type": "string", - "default": "false", + "type": "boolean", "description": "Flag for publishing bam file with all long reads mapped to a reference" }, "save_samtools_unmapped_bam": { - "type": "string", - "default": "false", + "type": "boolean", "description": "Flag for publishing bam for reads that did not map to the host reference" }, "save_minimap2_unmapped_fq": { - "type": "string", - "default": "false", + "type": "boolean", "description": "Flag for publishing fastq files for reads that did not map to the host reference" } } -} +} \ No newline at end of file From 58d4dec70be5ac5eb2356aac351593c27f77502a Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 11:09:54 +0200 Subject: [PATCH 199/214] Update samtools view --- modules.json | 2 +- modules/nf-core/modules/samtools/view/main.nf | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 071234d..b568d9c 100644 --- a/modules.json +++ b/modules.json @@ -73,7 +73,7 @@ "git_sha": "897c33d5da084b61109500ee44c01da2d3e4e773" }, "samtools/view": { - "git_sha": "12afb6b0faf3cabf769c9a2a7dd477e3f066eac0" + "git_sha": "6b64f9cb6c3dd3577931cc3cd032d6fb730000ce" }, "untar": { "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" } diff --git a/modules/nf-core/modules/samtools/view/main.nf b/modules/nf-core/modules/samtools/view/main.nf index 11cfb74..55194e8 100644 --- a/modules/nf-core/modules/samtools/view/main.nf +++ b/modules/nf-core/modules/samtools/view/main.nf @@ -41,4 +41,16 @@ process SAMTOOLS_VIEW { samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.cram + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + 
END_VERSIONS + """ } From c833d4fd9ae037e297eda9dea69045715fc4a064 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 11:42:05 +0200 Subject: [PATCH 200/214] Fix issues found with prettier --- modules.json | 2 +- nextflow_schema.json | 45 +++++++------------------------------------- 2 files changed, 8 insertions(+), 39 deletions(-) diff --git a/modules.json b/modules.json index b568d9c..ce1fbc5 100644 --- a/modules.json +++ b/modules.json @@ -80,4 +80,4 @@ } } } -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 257f5cb..1902e27 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -304,10 +294,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -348,10 +335,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -404,14 +388,7 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": [ - "phylum", - "class", - "order", - "family", - "genus", - "species" - ] + "enum": ["phylum", "class", "order", "family", "genus", "species"] }, "run_diamond": { "type": "boolean" @@ -419,15 +396,7 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": [ - "blast", - "xml", - "txt", - "daa", - "sam", - "tsv", - "paf" - ] + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] }, "longread_hostremoval_index": { "type": "string", @@ -450,4 +419,4 @@ "description": "Flag for publishing fastq files for reads that did not map to the host reference" } } -} \ No newline at end of file +} From f4abbe280ab6497a220fde116f606d588b49bf73 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 12:56:49 +0200 Subject: [PATCH 201/214] Fix back to nf-core modules version --- .../nf-core/modules/samtools/bam2fq/main.nf | 2 +- nf-core/modules/samtools/bam2fq/main.nf | 56 ------------------- nf-core/modules/samtools/bam2fq/meta.yml | 55 ------------------ 3 files changed, 1 insertion(+), 112 deletions(-) delete mode 100644 nf-core/modules/samtools/bam2fq/main.nf delete mode 100644 nf-core/modules/samtools/bam2fq/meta.yml diff --git a/modules/nf-core/modules/samtools/bam2fq/main.nf b/modules/nf-core/modules/samtools/bam2fq/main.nf index 0dd1da9..5d6aa79 100644 --- 
a/modules/nf-core/modules/samtools/bam2fq/main.nf +++ b/modules/nf-core/modules/samtools/bam2fq/main.nf @@ -45,7 +45,7 @@ process SAMTOOLS_BAM2FQ { bam2fq \\ $args \\ -@ $task.cpus \\ - $inputbam > ${prefix}_interleaved.fq.gz + $inputbam >${prefix}_interleaved.fq.gz cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nf-core/modules/samtools/bam2fq/main.nf b/nf-core/modules/samtools/bam2fq/main.nf deleted file mode 100644 index 5d6aa79..0000000 --- a/nf-core/modules/samtools/bam2fq/main.nf +++ /dev/null @@ -1,56 +0,0 @@ -process SAMTOOLS_BAM2FQ { - tag "$meta.id" - label 'process_low' - - conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : - 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" - - input: - tuple val(meta), path(inputbam) - val split - - output: - tuple val(meta), path("*.fq.gz"), emit: reads - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - if (split){ - """ - samtools \\ - bam2fq \\ - $args \\ - -@ $task.cpus \\ - -1 ${prefix}_1.fq.gz \\ - -2 ${prefix}_2.fq.gz \\ - -0 ${prefix}_other.fq.gz \\ - -s ${prefix}_singleton.fq.gz \\ - $inputbam - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ - } else { - """ - samtools \\ - bam2fq \\ - $args \\ - -@ $task.cpus \\ - $inputbam >${prefix}_interleaved.fq.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ - } -} diff --git a/nf-core/modules/samtools/bam2fq/meta.yml b/nf-core/modules/samtools/bam2fq/meta.yml deleted file mode 100644 index 319a60c..0000000 --- a/nf-core/modules/samtools/bam2fq/meta.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: samtools_bam2fq -description: | - The module uses bam2fq method from samtools to - convert a SAM, BAM or CRAM file to FASTQ format -keywords: - - bam2fq - - samtools - - fastq -tools: - - samtools: - description: Tools for dealing with SAM, BAM and CRAM files - homepage: None - documentation: http://www.htslib.org/doc/1.1/samtools.html - tool_dev_url: None - doi: "" - licence: ["MIT"] - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - inputbam: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - split: - type: boolean - description: | - TRUE/FALSE value to indicate if reads should be separated into - /1, /2 and if present other, or singleton. - Note: choosing TRUE will generate 4 different files. - Choosing FALSE will produce a single file, which will be interleaved in case - the input contains paired reads. - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - reads: - type: file - description: | - FASTQ files, which will be either a group of 4 files (read_1, read_2, other and singleton) - or a single interleaved .fq.gz file if the user chooses not to split the reads. 
- pattern: "*.fq.gz" - -authors: - - "@lescai" From 59c7f5a5b1b6a3e144a9b1c151e6f2a4c315342f Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Thu, 5 May 2022 08:38:41 +0200 Subject: [PATCH 202/214] Update samtools/bam2fq --- modules.json | 4 ++-- modules/nf-core/modules/samtools/bam2fq/main.nf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules.json b/modules.json index ce1fbc5..043e4ad 100644 --- a/modules.json +++ b/modules.json @@ -70,7 +70,7 @@ "git_sha": "f1c5384c31e985591716afdd732cf8c2ae29d05b" }, "samtools/bam2fq": { - "git_sha": "897c33d5da084b61109500ee44c01da2d3e4e773" + "git_sha": "5510ea39fe638594bc26ac34cadf4a84bf27d159" }, "samtools/view": { "git_sha": "6b64f9cb6c3dd3577931cc3cd032d6fb730000ce" @@ -80,4 +80,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/samtools/bam2fq/main.nf b/modules/nf-core/modules/samtools/bam2fq/main.nf index 5d6aa79..9301d1d 100644 --- a/modules/nf-core/modules/samtools/bam2fq/main.nf +++ b/modules/nf-core/modules/samtools/bam2fq/main.nf @@ -45,7 +45,7 @@ process SAMTOOLS_BAM2FQ { bam2fq \\ $args \\ -@ $task.cpus \\ - $inputbam >${prefix}_interleaved.fq.gz + $inputbam | gzip --no-name > ${prefix}_interleaved.fq.gz cat <<-END_VERSIONS > versions.yml "${task.process}": From 5b7b21415633c3df64cb2d88734d5863b4162a9c Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Thu, 5 May 2022 08:40:53 +0200 Subject: [PATCH 203/214] Fix formatting with prettier --- modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 043e4ad..a55c88b 100644 --- a/modules.json +++ b/modules.json @@ -80,4 +80,4 @@ } } } -} \ No newline at end of file +} From d94534e8acee01575b589e88549ed64ab4fc4411 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Thu, 5 May 2022 09:07:33 +0200 Subject: [PATCH 204/214] Simplify params which control host removal publish --- conf/modules.config | 8 ++++---- nextflow.config | 6 +----- nextflow_schema.json | 16 ---------------- 3 files changed, 5 insertions(+), 25 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index b707954..cd0fb04 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -169,7 +169,7 @@ process { publishDir = [ path: { "${params.outdir}/minimap2/index" }, mode: params.publish_dir_mode, - enabled: params.save_minimap2_hostremoval_index, + enabled: params.save_hostremoval_index, pattern: 'minimap2' ] } @@ -179,7 +179,7 @@ process { publishDir = [ path: { "${params.outdir}/minimap2/align" }, mode: params.publish_dir_mode, - enabled: params.save_minimap2_hostremoval_mapped, + enabled: params.save_hostremoval_mapped, pattern: '*.bam' ] } @@ -190,7 +190,7 @@ process { publishDir = [ path: { "${params.outdir}/samtools/view" }, mode: params.publish_dir_mode, - enabled: params.save_samtools_unmapped_bam, + enabled: params.save_hostremoval_unmapped, pattern: '*.bam' ] } @@ -200,7 +200,7 @@ process { publishDir = [ path: { "${params.outdir}/samtools/bam2fq" }, mode: params.publish_dir_mode, - enabled: params.save_minimap2_unmapped_fq, + enabled: params.save_hostremoval_unmapped, pattern: '*.fq.gz' ] } diff --git a/nextflow.config b/nextflow.config index 95daba9..8c99af2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -84,14 +84,10 @@ params { perform_shortread_hostremoval = false shortread_hostremoval_reference = null shortread_hostremoval_index = null + longread_hostremoval_index = null 
save_hostremoval_index = false save_hostremoval_mapped = false save_hostremoval_unmapped = false - longread_hostremoval_index = null - save_minimap2_hostremoval_index = false - save_minimap2_hostremoval_mapped = false - save_samtools_unmapped_bam = false - save_minimap2_unmapped_fq = false // MALT run_malt = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 1902e27..9e4cc6c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -401,22 +401,6 @@ "longread_hostremoval_index": { "type": "string", "default": "None" - }, - "save_minimap2_hostremoval_index": { - "type": "boolean", - "description": "Flag for publishing minimap2 host removal index" - }, - "save_minimap2_hostremoval_mapped": { - "type": "boolean", - "description": "Flag for publishing bam file with all long reads mapped to a reference" - }, - "save_samtools_unmapped_bam": { - "type": "boolean", - "description": "Flag for publishing bam for reads that did not map to the host reference" - }, - "save_minimap2_unmapped_fq": { - "type": "boolean", - "description": "Flag for publishing fastq files for reads that did not map to the host reference" } } } From 557d31dfd2fecbac2a415ddab780e6a3abc87168 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Thu, 5 May 2022 13:19:10 +0200 Subject: [PATCH 205/214] Add parameter for turning on longread host removal --- conf/test.config | 2 +- docs/usage.md | 2 +- nextflow.config | 2 +- nextflow_schema.json | 4 +++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/conf/test.config b/conf/test.config index a2464b2..6d04f60 100644 --- a/conf/test.config +++ b/conf/test.config @@ -28,7 +28,7 @@ params { perform_longread_clip = false perform_shortread_complexityfilter = true perform_shortread_hostremoval = true - shortread_hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + perform_longread_hostremoval = true run_kaiju = true run_kraken2 = true run_malt = true diff --git a/docs/usage.md b/docs/usage.md index cee2bb6..537b94a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -191,7 +191,7 @@ You can optionally save the FASTQ output of the run merging with the `--save_com #### Host Removal -Removal of possible-host reads from FASTQ files prior profiling can be activated with `--perform_shortread_hostremoval` +Removal of possible-host reads from FASTQ files prior to profiling can be activated with `--perform_shortread_hostremoval` or `--perform_longread_hostremoval`. Similarly to complexity filtering, host-removal can be useful for runtime optimisation and reduction in misclassified reads. It is not always necessary to report classification of reads from a host when you already know the host of the sample, therefore you can gain a run-time and computational advantage by removing these prior to typically resource-heavy profiling with more efficient methods. Furthermore, particularly with human samples, you can reduce the number of false positives during profiling that occur due to host-sequence contamination in reference genomes on public databases.
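Taken together with the minimap2 and samtools publishing directories configured earlier in this series, the long-read branch enabled by `--perform_longread_hostremoval` amounts to a four-step chain: index the host reference with minimap2 (unless `--longread_hostremoval_index` is supplied), align the long reads, keep only the alignments of unmapped reads with samtools view, and convert those back to FASTQ with samtools bam2fq. The following is a minimal sketch of how such a `LONGREAD_HOSTREMOVAL` subworkflow might be wired; the include paths follow the repository layout used in these patches, but the module argument lists are simplified for illustration and are not the verbatim nf-core module signatures:

```groovy
// subworkflows/local/longread_hostremoval.nf -- illustrative sketch only;
// the nf-core module interfaces are abbreviated here and should be checked
// against the installed module versions before use.
include { MINIMAP2_INDEX  } from '../../modules/nf-core/modules/minimap2/index/main'
include { MINIMAP2_ALIGN  } from '../../modules/nf-core/modules/minimap2/align/main'
include { SAMTOOLS_VIEW   } from '../../modules/nf-core/modules/samtools/view/main'
include { SAMTOOLS_BAM2FQ } from '../../modules/nf-core/modules/samtools/bam2fq/main'

workflow LONGREAD_HOSTREMOVAL {
    take:
    reads     // channel: [ val(meta), [ reads ] ]
    reference // path: host reference FASTA
    index     // path: pre-built minimap2 index, or [] to build one on the fly

    main:
    ch_versions = Channel.empty()

    // Build the index only when no --longread_hostremoval_index was supplied
    if ( !index ) {
        ch_minimap2_index = MINIMAP2_INDEX ( reference ).index
        ch_versions       = ch_versions.mix( MINIMAP2_INDEX.out.versions )
    } else {
        ch_minimap2_index = Channel.value( index )
    }

    MINIMAP2_ALIGN  ( reads, ch_minimap2_index )     // host-aligned BAM per sample
    SAMTOOLS_VIEW   ( MINIMAP2_ALIGN.out.bam )       // '-f 4' via ext.args retains only unmapped (non-host) reads
    SAMTOOLS_BAM2FQ ( SAMTOOLS_VIEW.out.bam, false ) // interleaved FASTQ for downstream profiling

    ch_versions = ch_versions.mix( SAMTOOLS_BAM2FQ.out.versions.first() )

    emit:
    reads    = SAMTOOLS_BAM2FQ.out.reads // channel: [ val(meta), [ reads ] ]
    versions = ch_versions               // channel: [ versions.yml ]
}
```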
diff --git a/nextflow.config b/nextflow.config index 8c99af2..4ac0c44 100644 --- a/nextflow.config +++ b/nextflow.config @@ -82,7 +82,7 @@ params { // Host Removal perform_shortread_hostremoval = false - shortread_hostremoval_reference = null + perform_longread_hostremoval = false shortread_hostremoval_index = null longread_hostremoval_index = null save_hostremoval_index = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 9e4cc6c..d2eee95 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -362,7 +362,9 @@ "perform_shortread_hostremoval": { "type": "boolean" }, - "shortread_hostremoval_reference": { + "perform_longread_hostremoval": { + "type": "boolean" + }, "type": "string", "default": "None" }, From 6618e8cac65fe43d40d4952863698801f9b4507b Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Thu, 5 May 2022 13:20:34 +0200 Subject: [PATCH 206/214] Change param to a more generic name --- conf/test.config | 1 + docs/usage.md | 2 +- nextflow.config | 1 + nextflow_schema.json | 1 + workflows/taxprofiler.nf | 8 ++++---- 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/conf/test.config b/conf/test.config index 6d04f60..a5244f9 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,6 +29,7 @@ params { perform_shortread_complexityfilter = true perform_shortread_hostremoval = true perform_longread_hostremoval = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' run_kaiju = true run_kraken2 = true run_malt = true diff --git a/docs/usage.md b/docs/usage.md index 537b94a..1143da3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -197,7 +197,7 @@ Similarly to complexity filtering, host-removal can be useful for runtime optimi nf-core/taxprofiler currently offers host-removal via alignment against a reference genome with Bowtie2, and the use of the unaligned reads for downstream profiling. -You can supply your reference genome in FASTA format with `--shortread_hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index`, however nf-core/taxprofiler will generate this for you if necessary. Pre-supplying the directory of index files can greatly speed up the process, and these can be re-used. +You can supply your reference genome in FASTA format with `--hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index`, however nf-core/taxprofiler will generate this for you if necessary. Pre-supplying the directory of index files can greatly speed up the process, and these can be re-used. > 💡 If you have multiple taxa or sequences you wish to remove (e.g., the host genome and then also PhiX - common quality-control reagent during sequencing) you can simply concatenate the FASTAs of each taxa or sequences into a single reference file. 
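With the rename to a single shared `hostremoval_reference` parameter, one user-side configuration can now drive both host-removal branches at once. A hypothetical custom configuration, passed to the pipeline with `-c custom_hostremoval.config`, might look like the following (every path is a placeholder, not a real location):

```groovy
// custom_hostremoval.config -- hypothetical user configuration; all paths below are placeholders
params {
    perform_shortread_hostremoval = true
    perform_longread_hostremoval  = true

    // One FASTA shared by both branches; concatenate host genome, PhiX, etc. beforehand
    hostremoval_reference         = '/path/to/host_plus_contaminants.fasta'

    // Optional pre-built indices; supplying them skips on-the-fly indexing on re-runs
    shortread_hostremoval_index   = '/path/to/bowtie2_index_directory'
    longread_hostremoval_index    = '/path/to/host_reference.mmi'

    // Publish the non-host reads for later inspection
    save_hostremoval_unmapped     = true
}
```

Supplying the indices is purely an optimisation: when they are absent, the pipeline builds them from `hostremoval_reference` at the cost of extra runtime on each run.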
diff --git a/nextflow.config b/nextflow.config index 4ac0c44..ca9e280 100644 --- a/nextflow.config +++ b/nextflow.config @@ -83,6 +83,7 @@ params { // Host Removal perform_shortread_hostremoval = false perform_longread_hostremoval = false + hostremoval_reference = null shortread_hostremoval_index = null longread_hostremoval_index = null save_hostremoval_index = false diff --git a/nextflow_schema.json b/nextflow_schema.json index d2eee95..ab2108e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -365,6 +365,7 @@ "perform_longread_hostremoval": { "type": "boolean" }, + "hostremoval_reference": { "type": "string", "default": "None" }, diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 9eb53a3..81ea5d9 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -11,7 +11,7 @@ WorkflowTaxprofiler.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.databases, params.shortread_hostremoval_reference, +def checkPathParamList = [ params.input, params.databases, params.hostremoval_reference, params.shortread_hostremoval_index, params.multiqc_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } @@ -22,10 +22,10 @@ if (params.databases) { ch_databases = file(params.databases) } else { exit 1, ' if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" -if (params.perform_shortread_hostremoval && !params.shortread_hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --shortread_hostremoval_reference FASTA supplied. Check input." } -if (!params.shortread_hostremoval_reference && params.shortread_hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --shortread_hostremoval_reference FASTA supplied. Check input." } +if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --perform_shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." } +if (!params.hostremoval_reference && params.shortread_hostremoval_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input."
} -if (params.shortread_hostremoval_reference ) { ch_reference = file(params.shortread_hostremoval_reference) } +if (params.hostremoval_reference ) { ch_reference = file(params.hostremoval_reference) } if (params.shortread_hostremoval_index ) { ch_shortread_reference_index = file(params.shortread_hostremoval_index ) } else { ch_shortread_reference_index = [] } if (params.longread_hostremoval_index ) { ch_longread_reference_index = file(params.longread_hostremoval_index ) } else { ch_longread_reference_index = [] } From e3e55f57c26ba462ceb6795be55b312e615ca550 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Thu, 5 May 2022 14:02:24 +0200 Subject: [PATCH 207/214] Fix alignment Just made this change to see if the CI tests would fail again or if the failure was just a one-off thing. --- workflows/taxprofiler.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 81ea5d9..1f2e9d1 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -47,7 +47,7 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { INPUT_CHECK } from '../subworkflows/local/input_check' include { DB_CHECK } from '../subworkflows/local/db_check' include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' From b0faeedc60c654fe445909ceed569ee010c84837 Mon Sep 17 00:00:00 2001 From: Lauri Mesilaakso Date: Fri, 6 May 2022 08:20:31 +0200 Subject: [PATCH 208/214] Update workflows/taxprofiler.nf Co-authored-by: James A. Fellows Yates --- workflows/taxprofiler.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 1f2e9d1..7a6cd09 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -25,7 +25,7 @@ if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_me if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --perform_shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." } if (!params.hostremoval_reference && params.shortread_hostremoval_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." } -if (params.hostremoval_reference ) { ch_reference = file(params.hostremoval_reference) } +if (params.hostremoval_reference ) { ch_reference = file(params.hostremoval_reference) } if (params.shortread_hostremoval_index ) { ch_shortread_reference_index = file(params.shortread_hostremoval_index ) } else { ch_shortread_reference_index = [] } if (params.longread_hostremoval_index ) { ch_longread_reference_index = file(params.longread_hostremoval_index ) } else { ch_longread_reference_index = [] } From 02517733aa2227d1bc6f4decdb607918de047b2d Mon Sep 17 00:00:00 2001 From: Lauri Mesilaakso Date: Fri, 6 May 2022 10:40:59 +0200 Subject: [PATCH 209/214] Update docs/usage.md Co-authored-by: James A.
Fellows Yates --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 1143da3..4aa1d09 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -197,7 +197,7 @@ Similarly to complexity filtering, host-removal can be useful for runtime optimi nf-core/taxprofiler currently offers host-removal via alignment against a reference genome with Bowtie2, and the use of the unaligned reads for downstream profiling. -You can supply your reference genome in FASTA format with `--hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index`, however nf-core/taxprofiler will generate this for you if necessary. Pre-supplying the directory of index files can greatly speed up the process, and these can be re-used. +You can supply your reference genome in FASTA format with `--hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index`, or a pre-built minimap2 index with `--longread_hostremoval_index`, however nf-core/taxprofiler will generate these for you if necessary. Pre-supplying the directory of index files can greatly speed up the process, and these can be re-used. > 💡 If you have multiple taxa or sequences you wish to remove (e.g., the host genome and then also PhiX - common quality-control reagent during sequencing) you can simply concatenate the FASTAs of each taxa or sequences into a single reference file. From 47a5ae0cff4060c12451beba47502dae5dbb9d17 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 7 May 2022 06:09:05 +0200 Subject: [PATCH 210/214] Add FASTP complexity option --- conf/modules.config | 6 ++- conf/test.config | 1 + conf/test_nopreprocessing.config | 46 ++++++++++++++++ conf/test_noprofiling.config | 46 ++++++++++++++++ docs/usage.md | 4 +- nextflow.config | 3 ++ nextflow_schema.json | 52 ++++++++++++++++--- .../local/shortread_complexityfiltering.nf | 1 + workflows/taxprofiler.nf | 8 ++- 9 files changed, 153 insertions(+), 14 deletions(-) create mode 100644 conf/test_nopreprocessing.config create mode 100644 conf/test_noprofiling.config diff --git a/conf/modules.config b/conf/modules.config index cd0fb04..c834f4e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -54,7 +54,8 @@ process { params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "", params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", // filtering options - "--length_required ${params.shortread_clipmerge_minlength}" + "--length_required ${params.shortread_clipmerge_minlength}", + params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? 
"--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ diff --git a/conf/test.config b/conf/test.config index a5244f9..c687a86 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,6 +29,7 @@ params { perform_shortread_complexityfilter = true perform_shortread_hostremoval = true perform_longread_hostremoval = true + perform_runmerging = true hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' run_kaiju = true run_kraken2 = true diff --git a/conf/test_nopreprocessing.config b/conf/test_nopreprocessing.config new file mode 100644 index 0000000..e8d4ed9 --- /dev/null +++ b/conf/test_nopreprocessing.config @@ -0,0 +1,46 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset skipping all preprocessing to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + perform_shortread_clipmerge = false + perform_longread_clip = false + perform_shortread_complexityfilter = false + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_malt = true + run_metaphlan3 = true + run_centrifuge = true + run_diamond = true +} + +process { + withName: MALT_RUN { + maxForks = 1 + } +} diff --git a/conf/test_noprofiling.config b/conf/test_noprofiling.config new file mode 100644 index 0000000..f908651 --- /dev/null +++ b/conf/test_noprofiling.config @@ -0,0 +1,46 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/taxprofiler -profile test,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset without performing any profiling to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + perform_shortread_clipmerge = true + perform_longread_clip = true + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_malt = false + run_metaphlan3 = false + run_centrifuge = false + run_diamond = false +} + +process { + withName: MALT_RUN { + maxForks = 1 + } +} diff --git a/docs/usage.md b/docs/usage.md index 4aa1d09..47ac952 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -183,11 +183,11 @@ Complexity filtering can be activated via the `--perform_shortread_complexityfil Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources. -There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/). +There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter). The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset. -You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`. +You can optionally save the FASTQ output of the complexity filtering with the `--save_complexityfiltered_reads`. If running with `fastp`, complexity filtering happens inclusively within the earlier shortread preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e.
`--save_complexityfiltered_reads` will be ignored) - your complexity filtered reads will also be in the `fastp/` folder in the same file(s) as the preprocessed reads. #### Host Removal diff --git a/nextflow.config b/nextflow.config index ca9e280..411e7a6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -74,6 +74,7 @@ params { shortread_complexityfilter_bbduk_mask = false shortread_complexityfilter_prinseqplusplus_mode = 'entropy' shortread_complexityfilter_prinseqplusplus_dustscore = 0.5 + shortread_complexityfilter_fastp_threshold = 30 save_complexityfiltered_reads = false // run merging @@ -185,6 +186,8 @@ profiles { } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } + test_noprofiling { includeConfig 'conf/test_noprofiling.config' } + test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' } } // Load igenomes.config if required diff --git a/nextflow_schema.json b/nextflow_schema.json index ab2108e..a0a830c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -294,7 +304,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" }, @@ -319,7 +332,12 @@ }, "shortread_complexityfilter_tool": { "type": "string", - "default": "bbduk" + "default": "bbduk", + "enum": [ + "bbduk", + "prinseqplusplus", + "fastp" + ] }, "shortread_complexityfilter_bbduk_windowsize": { "type": "integer", @@ -335,7 +353,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -391,7 +412,14 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": ["phylum", "class", "order", "family", "genus", "species"] + "enum": [ + "phylum", + "class", + "order", + "family", + "genus", + "species" + ] }, "run_diamond": { "type": "boolean" }, "diamond_output_format": { "type": "string", "default": "tsv", - "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] + "enum": [ + "blast", + "xml", + "txt", + "daa", + "sam", + "tsv", + "paf" + ] }, "longread_hostremoval_index": { "type": "string", "default": "None" } } -} +} \ No newline at end of file diff --git a/subworkflows/local/shortread_complexityfiltering.nf b/subworkflows/local/shortread_complexityfiltering.nf index 12686d7..a34440d 100644 --- a/subworkflows/local/shortread_complexityfiltering.nf +++ 
b/subworkflows/local/shortread_complexityfiltering.nf @@ -13,6 +13,7 @@ workflow SHORTREAD_COMPLEXITYFILTERING { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() + // fastp complexity filtering is activated via modules.config in shortread_preprocessing if ( params.shortread_complexityfilter_tool == 'bbduk' ) { ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() ) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 7a6cd09..b8b953b 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -19,9 +19,12 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } + if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" +if (params.shortread_complexityfilter_tool == 'fastp' && ( params.perform_shortread_clipmerge == false || params.shortread_clipmerge_tool != 'fastp' )) exit 1, "ERROR: [nf-core/taxprofiler] cannot use fastp complexity filtering if preprocessing not turned on and/or tool is not fastp. Please specify --perform_shortread_clipmerge and/or --shortread_clipmerge_tool 'fastp'" + if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --perform_shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." } if (!params.hostremoval_reference && params.shortread_hostremoval_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input."
} @@ -131,7 +134,8 @@ workflow TAXPROFILER { SUBWORKFLOW: COMPLEXITY FILTERING */ - if ( params.perform_shortread_complexityfilter ) { + // fastp complexity filtering is activated via modules.config in shortread_preprocessing + if ( params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp' ) { ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } else { @@ -228,7 +232,7 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) } - if (params.perform_shortread_complexityfilter){ + if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp'){ ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) } From 0f651873dd711c2223a8f844a091deb1316f6979 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 7 May 2022 06:12:24 +0200 Subject: [PATCH 211/214] Linting --- conf/modules.config | 4 ++-- nextflow_schema.json | 55 ++++++++++---------------------------------- 2 files changed, 14 insertions(+), 45 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index c834f4e..0abff92 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -55,7 +55,7 @@ process { params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", // filtering options "--length_required ${params.shortread_clipmerge_minlength}", - params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' + params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ diff --git a/nextflow_schema.json b/nextflow_schema.json index a0a830c..74fab27 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files.
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -304,10 +294,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -333,11 +320,7 @@ "shortread_complexityfilter_tool": { "type": "string", "default": "bbduk", - "enum": [ - "bbduk", - "prinseqplusplus", - "fastp" - ] + "enum": ["bbduk", "prinseqplusplus", "fastp"] }, "shortread_complexityfilter_bbduk_windowsize": { "type": "integer", @@ -353,10 +336,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -412,14 +392,7 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": [ - "phylum", - "class", - "order", - "family", - "genus", - "species" - ] + "enum": ["phylum", "class", "order", "family", "genus", "species"] }, "run_diamond": { "type": "boolean" @@ -427,19 +400,15 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": [ - "blast", - "xml", - "txt", - "daa", - "sam", - "tsv", - "paf" - ] + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] }, "longread_hostremoval_index": { "type": "string", "default": "None" + }, + "shortread_complexityfilter_fastp_threshold": { + "type": "integer", + "default": 30 } } -} \ No newline at end of file +} From 3fa2181f498f5e1c994efb06be3a63f459dcf9cc Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 7 May 2022 06:15:15 +0200 Subject: [PATCH 212/214] Fix prinseq tool selection --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a1ece72..7bb2076 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,7 +38,7 @@ jobs: - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged" - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs" - "--shortread_complexityfilter_tool bbduk" - - "--shortread_complexityfilter_tool prinseq" + - "--shortread_complexityfilter_tool prinseqplusplus" - "--perform_runmerging" - "--perform_runmerging --shortread_clipmerge_mergepairs" - "--shortread_complexityfilter false --perform_shortread_hostremoval" From d67543503b927f6af5bce9574ad8b428c53c0c46 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Sat, 7 May 2022 13:15:21 +0200 Subject: [PATCH 213/214] Apply suggestions from code review Co-authored-by: Moritz E. Beber --- conf/modules.config | 2 +- docs/usage.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 0abff92..5d8398e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -55,7 +55,7 @@ process { params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", // filtering options "--length_required ${params.shortread_clipmerge_minlength}", - params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? 
"--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' + (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp') ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ diff --git a/docs/usage.md b/docs/usage.md index 47ac952..54ffce0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -185,7 +185,7 @@ Complexity filtering is primarily a run-time optimisation step. It is not necess There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter). -The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset. +The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of the tools (see links above) to decide on optimal methods and parameters for your dataset. You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`. If running with `fastp`, complexity filtering happens inclusively within the earlier shortread preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity filtered reads will also be in the `fastp/` folder in the same file(s) as the preprocessed read. From aca7bc439dac189b8f4b33e7014859f7528d80bf Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 12 May 2022 08:08:14 +0100 Subject: [PATCH 214/214] Update nextflow.config --- nextflow.config | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nextflow.config b/nextflow.config index 411e7a6..4bcb1b1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -130,11 +130,11 @@ try { // Load nf-core/taxprofiler custom profiles from different institutions. // Warning: Uncomment only if a pipeline-specific instititutional config already exists on nf-core/configs! -// try { -// includeConfig "${params.custom_config_base}/pipeline/taxprofiler.config" -// } catch (Exception e) { -// System.err.println("WARNING: Could not load nf-core/config/taxprofiler profiles: ${params.custom_config_base}/pipeline/taxprofiler.config") -// } +try { + includeConfig "${params.custom_config_base}/pipeline/taxprofiler.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/config/taxprofiler profiles: ${params.custom_config_base}/pipeline/taxprofiler.config") +} profiles {