Add Kraken2 and MALT/run as Proof of Concept (currently MQC issue)

2024-11-22 01:16:03 +00:00 · 2022-03-03 17:42:02 +01:00 · 2022-03-03 17:42:02 +01:00 · 2c183ed2ed
commit 2c183ed2ed
parent 278f5605ca
8 changed files with 315 additions and 7 deletions
--- a/conf/modules.config
+++ b/conf/modules.config
@ -68,6 +68,26 @@ process {
        ]
    }
    // MALT_RUN: per-database publishing, tool arguments and on/off switch.
    withName: MALT_RUN {
        // Copy files matching the pattern into <outdir>/malt/<db_name>/
        publishDir = [
            path: { "${params.outdir}/malt/${meta.db_name}" },
            mode: 'copy',
            pattern: '*.{rma6,tab,text,sam,log}'
        ]
        // Database-specific parameters carried in the meta map become the tool arguments
        ext.args = { "${meta.db_params}" }
        // The process is only meant to run when params.run_malt is truthy
        ext.when = params.run_malt
    }
    // KRAKEN2_KRAKEN2: per-database publishing, tool arguments, on/off switch and output prefix.
    withName: KRAKEN2_KRAKEN2 {
        // Copy files matching the pattern into <outdir>/kraken2/<db_name>/
        publishDir = [
            path: { "${params.outdir}/kraken2/${meta.db_name}" },
            mode: 'copy',
            // The glob needs the leading '*': a bare '.{fastq.gz,txt}' only matches files
            // literally named '.fastq.gz' or '.txt', so no result would ever be published.
            pattern: '*.{fastq.gz,txt}'
        ]
        // Database-specific parameters carried in the meta map become the tool arguments
        ext.args = { "${meta.db_params}" }
        // The process is only meant to run when params.run_kraken2 is truthy
        ext.when = params.run_kraken2
        // Output prefix combines the sample id and the database name
        ext.prefix = { "${meta.id}-${meta.db_name}" }
    }
    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
        publishDir = [
--- a/modules.json
+++ b/modules.json
@ -15,6 +15,12 @@
            "fastqc": {
                "git_sha": "9d0cad583b9a71a6509b754fdf589cbfbed08961"
            },
            "kraken2/kraken2": {
                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
            },
            "malt/run": {
                "git_sha": "76cdd46f3f8a77fb5023fb5a39c4ab99925b8b56"
            },
            "multiqc": {
                "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41"
            }
--- a/modules/nf-core/modules/kraken2/kraken2/main.nf
+++ b/modules/nf-core/modules/kraken2/kraken2/main.nf
@ -0,0 +1,49 @@
 // Classify reads against a Kraken2 database and emit the classified reads,
 // the unclassified reads and the Kraken2 report.
 //
 // Inputs:
 //   meta  - map with sample information; `id` and `single_end` are read here
 //   reads - FastQ file(s); `--gzip-compressed` is always passed to kraken2
 //   db    - Kraken2 database handed to `--db`
 // Outputs (emit names): classified, unclassified, txt, versions
 process KRAKEN2_KRAKEN2 {
    tag "$meta.id"
    label 'process_high'
    conda (params.enable_conda ? 'bioconda::kraken2=2.1.2 conda-forge::pigz=2.6' : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' :
        'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }"
    input:
    tuple val(meta), path(reads)
    path  db
    output:
    // The globs are anchored on '.<name>.' / '.<name>_': a bare '*classified*' also
    // matches '<prefix>.unclassified...' files, which would put the unclassified
    // reads into the `classified` channel as well.
    tuple val(meta), path('*.classified{.,_}*')  , emit: classified
    tuple val(meta), path('*.unclassified{.,_}*'), emit: unclassified
    tuple val(meta), path('*report.txt')         , emit: txt
    path "versions.yml"                          , emit: versions
    when:
    task.ext.when == null || task.ext.when
    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Paired-end runs need --paired and a '#' placeholder in the read output names
    def paired       = meta.single_end ? "" : "--paired"
    def classified   = meta.single_end ? "${prefix}.classified.fastq"   : "${prefix}.classified#.fastq"
    def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq"
    """
    kraken2 \\
        --db $db \\
        --threads $task.cpus \\
        --unclassified-out $unclassified \\
        --classified-out $classified \\
        --report ${prefix}.kraken2.report.txt \\
        --gzip-compressed \\
        $paired \\
        $args \\
        $reads
    pigz -p $task.cpus *.fastq
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//')
        pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
    END_VERSIONS
    """
 }
--- a/modules/nf-core/modules/kraken2/kraken2/meta.yml
+++ b/modules/nf-core/modules/kraken2/kraken2/meta.yml
@ -0,0 +1,60 @@
 name: kraken2_kraken2
 description: Classifies metagenomic sequence data
 keywords:
  - classify
  - metagenomics
  - fastq
  - db
 tools:
  - kraken2:
      description: |
        Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads
      homepage: https://ccb.jhu.edu/software/kraken2/
      documentation: https://github.com/DerrickWood/kraken2/wiki/Manual
      doi: 10.1186/s13059-019-1891-0
      licence: ["MIT"]
 input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: |
        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
        respectively.
  - db:
      type: directory
      description: Kraken2 database
 output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - classified:
      type: file
      description: |
        Reads classified to belong to any of the taxa
        on the Kraken2 database.
      pattern: "*{fastq.gz}"
  - unclassified:
      type: file
      description: |
        Reads not classified to belong to any of the taxa
        on the Kraken2 database.
      pattern: "*{fastq.gz}"
  - txt:
      type: file
      description: |
        Kraken2 report containing stats about classified
        and not classified reads.
      pattern: "*.{report.txt}"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
 authors:
  - "@joseespinosa"
  - "@drpatelh"
--- a/modules/nf-core/modules/malt/run/main.nf
+++ b/modules/nf-core/modules/malt/run/main.nf
@ -0,0 +1,49 @@
 // Align FASTQ files against a MALT index with `malt-run` and emit the RMA6
 // files, the optional alignment files, the run log and the tool version.
 //
 // Inputs:
 //   meta   - map with run information; `id` is used for the tag and the log prefix
 //   fastqs - input FASTQ files, all passed to a single `--inFile`
 //   mode   - value passed to `-m`
 //   index  - MALT index directory passed to `--index`
 // Outputs (emit names): rma6, alignments (optional), log, versions
 process MALT_RUN {
    tag "$meta.id"
    label 'process_high'
    conda (params.enable_conda ? "bioconda::malt=0.53" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/malt:0.53--hdfd78af_0' :
        'quay.io/biocontainers/malt:0.53--hdfd78af_0' }"
    input:
    tuple val(meta), path(fastqs)
    val mode
    path index
    output:
    tuple val(meta), path("*.rma6")                          , emit: rma6
    tuple val(meta), path("*.{tab,text,sam}"),  optional:true, emit: alignments
    tuple val(meta), path("*.log")                           , emit: log
    path "versions.yml"                                      , emit: versions
    when:
    task.ext.when == null || task.ext.when
    script:
    def args = task.ext.args ?: ''
    // Give every run its own log name: a fixed 'malt-run.log' produces a file name
    // collision as soon as the logs of several runs are staged into one task.
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Java heap size in GB; falls back to 6 when the task has no memory setting
    def avail_mem = 6
    if (!task.memory) {
        log.info '[MALT_RUN] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.'
    } else {
        avail_mem = task.memory.giga
    }
    """
    malt-run \\
        -J-Xmx${avail_mem}g \\
        -t $task.cpus \\
        -v \\
        -o . \\
        $args \\
        --inFile ${fastqs.join(' ')} \\
        -m $mode \\
        --index $index/ |&tee ${prefix}-malt-run.log
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        malt: \$(malt-run --help  2>&1 | grep -o 'version.* ' | cut -f 1 -d ',' | cut -f2 -d ' ')
    END_VERSIONS
    """
 }
--- a/modules/nf-core/modules/malt/run/meta.yml
+++ b/modules/nf-core/modules/malt/run/meta.yml
@ -0,0 +1,58 @@
 name: malt_run
 description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics.
 keywords:
  - malt
  - alignment
  - metagenomics
  - ancient DNA
  - aDNA
  - palaeogenomics
  - archaeogenomics
  - microbiome
 tools:
  - malt:
      description: A tool for mapping metagenomic data
      homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/
      documentation: https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf
      tool_dev_url: None
      doi: "10.1038/s41559-017-0446-6"
      licence: ["GPL v3"]
 input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - fastqs:
      type: file
      description: Input FASTQ files
      pattern: "*.{fastq.gz,fq.gz}"
  - mode:
      type: string
      description: Program mode
      pattern: "Unknown|BlastN|BlastP|BlastX|Classifier"
  - index:
      type: directory
      description: Index/database directory from malt-build
      pattern: "*/"
 output:
  # Entries follow the emit names and globs declared in the module's output block.
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - rma6:
      type: file
      description: MEGAN6 RMA6 file
      pattern: "*.rma6"
  - alignments:
      type: file
      description: Alignment files in Tab, Text or MEGAN-compatible SAM format
      pattern: "*.{tab,text,sam}"
  - log:
      type: file
      description: Log of verbose MALT stdout and stderr
      pattern: "*.log"
 authors:
  - "@jfy133"
--- a/nextflow.config
+++ b/nextflow.config
@ -56,6 +56,13 @@ params {
    // FASTQ preprocessing
    fastp_clip_merge           = false
    fastp_exclude_unmerged     = true
    // MALT
    run_malt                   = false
    malt_mode                  = 'BlastN'
    // kraken2
    run_kraken2                = false
 }
 // Load base.config by default for all pipelines
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -56,6 +56,9 @@ include { MULTIQC                     } from '../modules/nf-core/modules/multiqc
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main'
 include { CAT_FASTQ                   } from '../modules/nf-core/modules/cat/fastq/main'
 include { MALT_RUN                    } from '../modules/nf-core/modules/malt/run/main'
 include { KRAKEN2_KRAKEN2             } from '../modules/nf-core/modules/kraken2/kraken2/main'
 /*
 ========================================================================================
@ -95,13 +98,15 @@ workflow TAXPROFILER {
    )
    //
-    // MODULE: Run Clip/Merge/Complexity
+    // PERFORM PREPROCESSING
    //
    if ( params.fastp_clip_merge ) {
        FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq )
    }
-    // MODULE: Cat merge runs of same sample
+    //
    // PERFORM RUN MERGING
    //
    ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads
        .dump(tag: "prep_for_combine_grouping")
        .map {
@ -118,15 +123,61 @@ workflow TAXPROFILER {
    CAT_FASTQ ( ch_processed_for_combine.combine )
    // Ready for profiling!
    ch_reads_for_profiling = ch_processed_for_combine.skip
                                .dump(tag: "skip_combine")
                                .mix( CAT_FASTQ.out.reads )
                                .dump(tag: "files_for_profiling")
-    // Combine reads with possible databases
+    //
    // COMBINE READS WITH POSSIBLE DATABASES
    //
    // output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
    ch_input_for_profiling = ch_reads_for_profiling
            .combine(DB_CHECK.out.dbs)
            .dump(tag: "reads_plus_db")
            .branch {
                malt:    it[2]['tool'] == 'malt'
                kraken2: it[2]['tool'] == 'kraken2'
                unknown: true
            }
    //
    // PREP PROFILER INPUT CHANNELS ON PER TOOL BASIS
    //
    // We groupTuple to have all samples in one channel for MALT as database
    // loading takes a long time, so we only want to run it once per database
    // Each incoming item is [ sample meta, reads, db meta, db path ]
    // (see the example in the reads_plus_db comment above).
    ch_input_for_malt =  ch_input_for_profiling.malt
                            .map {
                                it ->
                                    // Replace the sample meta with the db meta, using db_name as the id,
                                    // so that all samples for one database share the same grouping key
                                    def temp_meta =  [ id: it[2]['db_name']]  + it[2]
                                    def db = it[3]
                                    [ temp_meta, it[1], db ]
                            }
                            .groupTuple(by: [0,2])
                            .dump(tag: "input for malt")
                            .multiMap {
                                it ->
                                    // Flatten the grouped read entries into one list of files per database
                                    reads: [ it[0], it[1].flatten() ]
                                    db: it[2]
                            }
    // We can run Kraken2 one-by-one sample-wise
    // Each incoming item is [ sample meta, reads, db meta, db path ]
    ch_input_for_kraken2 =  ch_input_for_profiling.kraken2
                            .dump(tag: "input for kraken")
                            .multiMap {
                                it ->
                                    // Merge the db meta into the sample meta so that keys such as
                                    // db_name and db_params travel with the reads
                                    reads: [ it[0] + it[2], it[1] ]
                                    db: it[3]
                            }
    //
    // RUN PROFILING
    //
    MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
    KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db  )
    ch_reads_for_profiling.combine(DB_CHECK.out.dbs).dump(tag: "reads_plus_db")
    //
    // MODULE: MultiQC
@ -143,6 +194,14 @@ workflow TAXPROFILER {
    if (params.fastp_clip_merge) {
        ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc)
    }
    if (params.run_kraken2) {
        ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]))
        ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first())
    }
    if (params.run_malt) {
        ch_multiqc_files = ch_multiqc_files.mix(MALT_RUN.out.log.collect{it[1]}.ifEmpty([]))
        ch_versions = ch_versions.mix(MALT_RUN.out.versions.first())
    }
    MULTIQC (