From 2c183ed2edc61ab058580cb63441917d516eb564 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Thu, 3 Mar 2022 17:42:02 +0100
Subject: [PATCH] Add Kraken2 and MALT/run as Proof of Concept (currently MQC issue)

---
Reviewer note (ignored by `git am`): the KRAKEN2_KRAKEN2 publishDir pattern
needs a leading '*'. A bare '.{fastq.gz,txt}' glob only matches files named
exactly '.fastq.gz' or '.txt', so no Kraken2 output would be published.

 conf/modules.config                           | 20 ++++++
 modules.json                                  |  6 ++
 .../nf-core/modules/kraken2/kraken2/main.nf   | 49 +++++++++++++
 .../nf-core/modules/kraken2/kraken2/meta.yml  | 60 ++++++++++++++++
 modules/nf-core/modules/malt/run/main.nf      | 49 +++++++++++++
 modules/nf-core/modules/malt/run/meta.yml     | 58 ++++++++++++++++
 nextflow.config                               | 11 ++-
 workflows/taxprofiler.nf                      | 69 +++++++++++++++++--
 8 files changed, 315 insertions(+), 7 deletions(-)
 create mode 100644 modules/nf-core/modules/kraken2/kraken2/main.nf
 create mode 100644 modules/nf-core/modules/kraken2/kraken2/meta.yml
 create mode 100644 modules/nf-core/modules/malt/run/main.nf
 create mode 100644 modules/nf-core/modules/malt/run/meta.yml

diff --git a/conf/modules.config b/conf/modules.config
index 2d533e9..dbc926c 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -68,6 +68,26 @@ process {
         ]
     }
 
+    withName: MALT_RUN {
+        publishDir = [
+            path: { "${params.outdir}/malt/${meta.db_name}" },
+            mode: 'copy',
+            pattern: '*.{rma6,tab,text,sam,log}'
+        ]
+        ext.args = { "${meta.db_params}" }
+        ext.when = params.run_malt
+    }
+
+    withName: KRAKEN2_KRAKEN2 {
+        publishDir = [
+            path: { "${params.outdir}/kraken2/${meta.db_name}" },
+            mode: 'copy',
+            pattern: '*.{fastq.gz,txt}'
+        ]
+        ext.args = { "${meta.db_params}" }
+        ext.when = params.run_kraken2
+        ext.prefix = { "${meta.id}-${meta.db_name}" }
+    }
 
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
         publishDir = [
diff --git a/modules.json b/modules.json
index 6cf2b3e..844b33a 100644
--- a/modules.json
+++ b/modules.json
@@ -15,6 +15,12 @@
             "fastqc": {
                 "git_sha": "9d0cad583b9a71a6509b754fdf589cbfbed08961"
             },
+            "kraken2/kraken2": {
+                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
+            },
+            "malt/run": {
+                "git_sha": "76cdd46f3f8a77fb5023fb5a39c4ab99925b8b56"
+            },
"multiqc": { "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" } diff --git a/modules/nf-core/modules/kraken2/kraken2/main.nf b/modules/nf-core/modules/kraken2/kraken2/main.nf new file mode 100644 index 0000000..3ec5df5 --- /dev/null +++ b/modules/nf-core/modules/kraken2/kraken2/main.nf @@ -0,0 +1,49 @@ +process KRAKEN2_KRAKEN2 { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? 'bioconda::kraken2=2.1.2 conda-forge::pigz=2.6' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' : + 'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }" + + input: + tuple val(meta), path(reads) + path db + + output: + tuple val(meta), path('*classified*') , emit: classified + tuple val(meta), path('*unclassified*'), emit: unclassified + tuple val(meta), path('*report.txt') , emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "" : "--paired" + def classified = meta.single_end ? "${prefix}.classified.fastq" : "${prefix}.classified#.fastq" + def unclassified = meta.single_end ? 
"${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq" + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --unclassified-out $unclassified \\ + --classified-out $classified \\ + --report ${prefix}.kraken2.report.txt \\ + --gzip-compressed \\ + $paired \\ + $args \\ + $reads + + pigz -p $task.cpus *.fastq + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/kraken2/kraken2/meta.yml b/modules/nf-core/modules/kraken2/kraken2/meta.yml new file mode 100644 index 0000000..9d6a385 --- /dev/null +++ b/modules/nf-core/modules/kraken2/kraken2/meta.yml @@ -0,0 +1,60 @@ +name: kraken2_kraken2 +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - kraken2: + description: | + Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads + homepage: https://ccb.jhu.edu/software/kraken2/ + documentation: https://github.com/DerrickWood/kraken2/wiki/Manual + doi: 10.1186/s13059-019-1891-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Kraken2 database +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified: + type: file + description: | + Reads classified to belong to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - unclassified: + type: file + description: | + Reads not classified to belong to any of the taxa + on the Kraken2 database. 
+ pattern: "*{fastq.gz}" + - txt: + type: file + description: | + Kraken2 report containing stats about classified + and not classifed reads. + pattern: "*.{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/malt/run/main.nf b/modules/nf-core/modules/malt/run/main.nf new file mode 100644 index 0000000..61c02ec --- /dev/null +++ b/modules/nf-core/modules/malt/run/main.nf @@ -0,0 +1,49 @@ +process MALT_RUN { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::malt=0.53" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/malt:0.53--hdfd78af_0' : + 'quay.io/biocontainers/malt:0.53--hdfd78af_0' }" + + input: + tuple val(meta), path(fastqs) + val mode + path index + + output: + tuple val(meta), path("*.rma6") , emit: rma6 + tuple val(meta), path("*.{tab,text,sam}"), optional:true, emit: alignments + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def avail_mem = 6 + if (!task.memory) { + log.info '[MALT_RUN] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.' + } else { + avail_mem = task.memory.giga + } + + """ + malt-run \\ + -J-Xmx${avail_mem}g \\ + -t $task.cpus \\ + -v \\ + -o . 
\\ + $args \\ + --inFile ${fastqs.join(' ')} \\ + -m $mode \\ + --index $index/ |&tee malt-run.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + malt: \$(malt-run --help 2>&1 | grep -o 'version.* ' | cut -f 1 -d ',' | cut -f2 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/malt/run/meta.yml b/modules/nf-core/modules/malt/run/meta.yml new file mode 100644 index 0000000..ae4277a --- /dev/null +++ b/modules/nf-core/modules/malt/run/meta.yml @@ -0,0 +1,58 @@ +name: malt_run +description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics. +keywords: + - malt + - alignment + - metagenomics + - ancient DNA + - aDNA + - palaeogenomics + - archaeogenomics + - microbiome +tools: + - malt: + description: A tool for mapping metagenomic data + homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/ + documentation: https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf + tool_dev_url: None + doi: "10.1038/s41559-017-0446-6" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fastqs: + type: file + description: Input FASTQ files + pattern: "*.{fastq.gz,fq.gz}" + - mode: + type: string + description: Program mode + pattern: "Unknown|BlastN|BlastP|BlastX|Classifier" + - index: + type: directory + description: Index/database directory from malt-build + pattern: "*/" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - rma6: + type: file + description: MEGAN6 RMA6 file + pattern: "*.rma6" + - sam: + type: file + description: Alignment files in Tab, Text or MEGAN-compatible SAM format + pattern: "*.{tab,txt,sam}" + - log: + type: file + description: Log of verbose MALT stdout + pattern: "malt-run.log" + +authors: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 60b5a42..7991bdf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,8 +54,15 @@ params { databases = null // FASTQ preprocessing - fastp_clip_merge = false - fastp_exclude_unmerged = true + fastp_clip_merge = false + fastp_exclude_unmerged = true + + // MALT + run_malt = false + malt_mode = 'BlastN' + + // kraken2 + run_kraken2 = false } // Load base.config by default for all pipelines diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 4a356a5..bd25563 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -56,6 +56,9 @@ include { MULTIQC } from '../modules/nf-core/modules/multiqc include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' +include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' +include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' + /* ======================================================================================== @@ -95,13 +98,15 @@ workflow TAXPROFILER { ) // - // MODULE: Run Clip/Merge/Complexity + // PERFORM PREPROCESSING // if ( 
params.fastp_clip_merge ) { FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq ) } - // MODULE: Cat merge runs of same sample + // + // PERFORM RUN MERGING + // ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads .dump(tag: "prep_for_combine_grouping") .map { @@ -118,15 +123,61 @@ workflow TAXPROFILER { CAT_FASTQ ( ch_processed_for_combine.combine ) - // Ready for profiling! ch_reads_for_profiling = ch_processed_for_combine.skip .dump(tag: "skip_combine") .mix( CAT_FASTQ.out.reads ) .dump(tag: "files_for_profiling") - // Combine reads with possible databases + // + // COMBINE READS WITH POSSIBLE DATABASES + // + + // output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] + ch_input_for_profiling = ch_reads_for_profiling + .combine(DB_CHECK.out.dbs) + .dump(tag: "reads_plus_db") + .branch { + malt: it[2]['tool'] == 'malt' + kraken2: it[2]['tool'] == 'kraken2' + unknown: true + } + + // + // PREP PROFILER INPUT CHANNELS ON PER TOOL BASIS + // + + // We groupTuple to have all samples in one channel for MALT as database + // loading takes a long time, so we only want to run it once per database + ch_input_for_malt = ch_input_for_profiling.malt + .map { + it -> + def temp_meta = [ id: it[2]['db_name']] + it[2] + def db = it[3] + [ temp_meta, it[1], db ] + } + .groupTuple(by: [0,2]) + .dump(tag: "input for malt") + .multiMap { + it -> + reads: [ it[0], it[1].flatten() ] + db: it[2] + } + + // We can run Kraken2 one-by-one sample-wise + ch_input_for_kraken2 = ch_input_for_profiling.kraken2 + .dump(tag: "input for kraken") + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + // + // RUN PROFILING + // + MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) - 
ch_reads_for_profiling.combine(DB_CHECK.out.dbs).dump(tag: "reads_plus_db") // // MODULE: MultiQC @@ -143,6 +194,14 @@ workflow TAXPROFILER { if (params.fastp_clip_merge) { ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc) } + if (params.run_kraken2) { + ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])) + ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first()) + } + if (params.run_malt) { + ch_multiqc_files = ch_multiqc_files.mix(MALT_RUN.out.log.collect{it[1]}.ifEmpty([])) + ch_versions = ch_versions.mix(MALT_RUN.out.versions.first()) + } MULTIQC (