Add Kraken2 and MALT/run as Proof of Concept (currently MQC issue)

2024-12-22 15:18:16 +00:00 · 2022-03-03 17:42:02 +01:00 · 2022-03-03 17:42:02 +01:00 · 2c183ed2ed
commit 2c183ed2ed
parent 278f5605ca
8 changed files with 315 additions and 7 deletions
--- a/conf/modules.config
+++ b/conf/modules.config
@ -68,6 +68,26 @@ process {
        ]
    }

+    withName: MALT_RUN { // settings applied to the MALT_RUN process
+        publishDir = [
+            path: { "${params.outdir}/malt/${meta.db_name}" }, // one results sub-directory per database name
+            mode: 'copy',
+            pattern: '*.{rma6,tab,text,sam,log}' // only files with these extensions are published
+        ]
+        ext.args = { "${meta.db_params}" } // closure: resolved per task from that task's meta.db_params
+        ext.when = params.run_malt // read by the module's `when:` guard; a false value disables the process
+    }
+
+    withName: KRAKEN2_KRAKEN2 { // settings applied to the KRAKEN2_KRAKEN2 process
+        publishDir = [
+            path: { "${params.outdir}/kraken2/${meta.db_name}" }, // one results sub-directory per database name
+            mode: 'copy',
+            pattern: '*.{fastq.gz,txt}' // leading '*' is required: the glob is matched against whole file names
+        ]
+        ext.args = { "${meta.db_params}" } // closure: resolved per task from that task's meta.db_params
+        ext.when = params.run_kraken2 // read by the module's `when:` guard; a false value disables the process
+        ext.prefix = { "${meta.id}-${meta.db_name}" } // output file prefix: sample id plus database name
+    }

    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
        publishDir = [
--- a/modules.json
+++ b/modules.json
@ -15,6 +15,12 @@
            "fastqc": {
                "git_sha": "9d0cad583b9a71a6509b754fdf589cbfbed08961"
            },
+            "kraken2/kraken2": {
+                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
+            },
+            "malt/run": {
+                "git_sha": "76cdd46f3f8a77fb5023fb5a39c4ab99925b8b56"
+            },
            "multiqc": {
                "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41"
            }
--- a/modules/nf-core/modules/kraken2/kraken2/main.nf
+++ b/modules/nf-core/modules/kraken2/kraken2/main.nf
@ -0,0 +1,49 @@
+process KRAKEN2_KRAKEN2 { // Classify reads with Kraken2 against one database, then gzip the FASTQ outputs
+    tag "$meta.id"
+    label 'process_high'
+
+    conda (params.enable_conda ? 'bioconda::kraken2=2.1.2 conda-forge::pigz=2.6' : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' :
+        'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }"
+
+    input:
+    tuple val(meta), path(reads) // meta: sample map (meta.id and meta.single_end are read); reads: FASTQ file(s)
+    path  db                     // Kraken2 database, passed to --db
+
+    output:
+    tuple val(meta), path('*.classified*')  , emit: classified   // the leading '.' keeps '*.unclassified*' files out of this channel
+    tuple val(meta), path('*.unclassified*'), emit: unclassified // anchored the same way, on the '.' written after the prefix
+    tuple val(meta), path('*report.txt')    , emit: txt          // the <prefix>.kraken2.report.txt written by --report
+    path "versions.yml"                     , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when // runs unless ext.when is set to a false value
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}" // falls back to the sample id when ext.prefix is not configured
+    def paired       = meta.single_end ? "" : "--paired"
+    def classified   = meta.single_end ? "${prefix}.classified.fastq"   : "${prefix}.classified#.fastq"   // '#' is Kraken2's mate-number placeholder
+    def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq"
+    """
+    kraken2 \\
+        --db $db \\
+        --threads $task.cpus \\
+        --unclassified-out $unclassified \\
+        --classified-out $classified \\
+        --report ${prefix}.kraken2.report.txt \\
+        --gzip-compressed \\
+        $paired \\
+        $args \\
+        $reads
+
+    pigz -p $task.cpus *.fastq
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//')
+        pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
+    END_VERSIONS
+    """
+}
--- a/modules/nf-core/modules/kraken2/kraken2/meta.yml
+++ b/modules/nf-core/modules/kraken2/kraken2/meta.yml
@ -0,0 +1,60 @@
+name: kraken2_kraken2
+description: Classifies metagenomic sequence data
+keywords:
+  - classify
+  - metagenomics
+  - fastq
+  - db
+tools:
+  - kraken2:
+      description: |
+        Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads
+      homepage: https://ccb.jhu.edu/software/kraken2/
+      documentation: https://github.com/DerrickWood/kraken2/wiki/Manual
+      doi: 10.1186/s13059-019-1891-0
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+        respectively.
+  - db:
+      type: directory
+      description: Kraken2 database
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - classified:
+      type: file
+      description: |
+        Reads classified to belong to any of the taxa
+        on the Kraken2 database.
+      pattern: "*{fastq.gz}"
+  - unclassified:
+      type: file
+      description: |
+        Reads not classified to belong to any of the taxa
+        on the Kraken2 database.
+      pattern: "*{fastq.gz}"
+  - txt:
+      type: file
+      description: |
+        Kraken2 report containing stats about classified
+        and not classified reads.
+      pattern: "*.{report.txt}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@joseespinosa"
+  - "@drpatelh"
--- a/modules/nf-core/modules/malt/run/main.nf
+++ b/modules/nf-core/modules/malt/run/main.nf
@ -0,0 +1,49 @@
+process MALT_RUN { // Run malt-run on the supplied FASTQ files against one index; console output is saved to malt-run.log
+    tag "$meta.id"
+    label 'process_high'
+
+    conda (params.enable_conda ? "bioconda::malt=0.53" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/malt:0.53--hdfd78af_0' :
+        'quay.io/biocontainers/malt:0.53--hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(fastqs) // meta: map whose id is used for the task tag; fastqs: files joined into --inFile
+    val mode                      // value passed to malt-run -m
+    path index                    // directory passed to malt-run --index
+
+    output:
+    tuple val(meta), path("*.rma6")                          , emit: rma6
+    tuple val(meta), path("*.{tab,text,sam}"),  optional:true, emit: alignments // optional: the task does not fail if none exist
+    tuple val(meta), path("*.log")                           , emit: log
+    path "versions.yml"                                      , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when // runs unless ext.when is set to a false value
+
+    script:
+    def args = task.ext.args ?: ''
+    def avail_mem = 6 // default JVM heap size in GB, used when task.memory is not set
+    if (!task.memory) {
+        log.info '[MALT_RUN] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = task.memory.giga // configured task memory, handed to the JVM via -J-Xmx below
+    }
+
+    """
+    malt-run \\
+        -J-Xmx${avail_mem}g \\
+        -t $task.cpus \\
+        -v \\
+        -o . \\
+        $args \\
+        --inFile ${fastqs.join(' ')} \\
+        -m $mode \\
+        --index $index/ |&tee malt-run.log
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        malt: \$(malt-run --help  2>&1 | grep -o 'version.* ' | cut -f 1 -d ',' | cut -f2 -d ' ')
+    END_VERSIONS
+    """
+}
--- a/modules/nf-core/modules/malt/run/meta.yml
+++ b/modules/nf-core/modules/malt/run/meta.yml
@ -0,0 +1,58 @@
+name: malt_run
+description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics.
+keywords:
+  - malt
+  - alignment
+  - metagenomics
+  - ancient DNA
+  - aDNA
+  - palaeogenomics
+  - archaeogenomics
+  - microbiome
+tools:
+  - malt:
+      description: A tool for mapping metagenomic data
+      homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/
+      documentation: https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf
+      tool_dev_url: None
+      doi: "10.1038/s41559-017-0446-6"
+      licence: ["GPL v3"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fastqs:
+      type: file
+      description: Input FASTQ files
+      pattern: "*.{fastq.gz,fq.gz}"
+  - mode:
+      type: string
+      description: Program mode
+      pattern: "Unknown|BlastN|BlastP|BlastX|Classifier"
+  - index:
+      type: directory
+      description: Index/database directory from malt-build
+      pattern: "*/"
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - rma6:
+      type: file
+      description: MEGAN6 RMA6 file
+      pattern: "*.rma6"
+  - alignments:
+      type: file
+      description: Alignment files in Tab, Text or MEGAN-compatible SAM format
+      pattern: "*.{tab,text,sam}"
+  - log:
+      type: file
+      description: Log of verbose MALT stdout
+      pattern: "malt-run.log"
+
+authors:
+  - "@jfy133"
--- a/nextflow.config
+++ b/nextflow.config
@ -54,8 +54,15 @@ params {
    databases = null

    // FASTQ preprocessing
-    fastp_clip_merge       = false
-    fastp_exclude_unmerged = true
+    fastp_clip_merge           = false
+    fastp_exclude_unmerged     = true
+
+    // MALT
+    run_malt                   = false
+    malt_mode                  = 'BlastN'
+
+    // kraken2
+    run_kraken2                = false
 }

 // Load base.config by default for all pipelines
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -56,6 +56,9 @@ include { MULTIQC                     } from '../modules/nf-core/modules/multiqc
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main'

 include { CAT_FASTQ                   } from '../modules/nf-core/modules/cat/fastq/main'
+include { MALT_RUN                    } from '../modules/nf-core/modules/malt/run/main'
+include { KRAKEN2_KRAKEN2             } from '../modules/nf-core/modules/kraken2/kraken2/main'
+

 /*
 ========================================================================================
@ -95,13 +98,15 @@ workflow TAXPROFILER {
    )

    //
-    // MODULE: Run Clip/Merge/Complexity
+    // PERFORM PREPROCESSING
    //
    if ( params.fastp_clip_merge ) {
        FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq )
    }

-    // MODULE: Cat merge runs of same sample
+    //
+    // PERFORM RUN MERGING
+    //
    ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads
        .dump(tag: "prep_for_combine_grouping")
        .map {
@ -118,15 +123,61 @@ workflow TAXPROFILER {

    CAT_FASTQ ( ch_processed_for_combine.combine )

-    // Ready for profiling!
    ch_reads_for_profiling = ch_processed_for_combine.skip
                                .dump(tag: "skip_combine")
                                .mix( CAT_FASTQ.out.reads )
                                .dump(tag: "files_for_profiling")

-    // Combine reads with possible databases
+    //
+    // COMBINE READS WITH POSSIBLE DATABASES
+    //
+
+    // output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
+    ch_input_for_profiling = ch_reads_for_profiling
+            .combine(DB_CHECK.out.dbs)
+            .dump(tag: "reads_plus_db")
+            .branch {
+                malt:    it[2]['tool'] == 'malt'
+                kraken2: it[2]['tool'] == 'kraken2'
+                unknown: true
+            }
+
+    //
+    // PREP PROFILER INPUT CHANNELS ON PER TOOL BASIS
+    //
+
+    // We groupTuple to have all samples in one channel for MALT as database
+    // loading takes a long time, so we only want to run it once per database
+    ch_input_for_malt =  ch_input_for_profiling.malt
+                            .map {
+                                it ->
+                                    def temp_meta =  [ id: it[2]['db_name']]  + it[2]
+                                    def db = it[3]
+                                    [ temp_meta, it[1], db ]
+                            }
+                            .groupTuple(by: [0,2])
+                            .dump(tag: "input for malt")
+                            .multiMap {
+                                it ->
+                                    reads: [ it[0], it[1].flatten() ]
+                                    db: it[2]
+                            }
+
+    // We can run Kraken2 one-by-one sample-wise
+    ch_input_for_kraken2 =  ch_input_for_profiling.kraken2
+                            .dump(tag: "input for kraken")
+                            .multiMap {
+                                it ->
+                                    reads: [ it[0] + it[2], it[1] ]
+                                    db: it[3]
+                            }
+
+    //
+    // RUN PROFILING
+    //
+    MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
+    KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db  )

-    ch_reads_for_profiling.combine(DB_CHECK.out.dbs).dump(tag: "reads_plus_db")

    //
    // MODULE: MultiQC
@ -143,6 +194,14 @@ workflow TAXPROFILER {
    if (params.fastp_clip_merge) {
        ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc)
    }
+    if (params.run_kraken2) {
+        ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]))
+        ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first())
+    }
+    if (params.run_malt) {
+        ch_multiqc_files = ch_multiqc_files.mix(MALT_RUN.out.log.collect{it[1]}.ifEmpty([]))
+        ch_versions = ch_versions.mix(MALT_RUN.out.versions.first())
+    }


    MULTIQC (