From 87a1d8051979040f7dd1b6f976b2bd0224a7bf24 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Sat, 7 May 2022 05:22:35 +0200
Subject: [PATCH] Add working output for Kraken2/Centrifuge/DIAMOND

---
 conf/modules.config                           |  4 +-
 conf/test.config                              |  4 ++
 modules.json                                  |  6 +-
 .../nf-core/modules/diamond/blastx/main.nf    | 33 ++++++---
 .../nf-core/modules/diamond/blastx/meta.yml   | 30 +++++++-
 .../nf-core/modules/kraken2/kraken2/main.nf   | 23 ++++--
 .../nf-core/modules/kraken2/kraken2/meta.yml  | 25 +++++--
 nextflow.config                               | 14 ++--
 nextflow_schema.json                          | 71 ++++++++++++++-----
 subworkflows/local/profiling.nf               | 34 ++++++---
 workflows/taxprofiler.nf                      |  2 +
 11 files changed, 181 insertions(+), 65 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index cd0fb04..31d0fca 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -271,7 +271,7 @@ process {
         publishDir = [
             path: { "${params.outdir}/kraken2/${meta.db_name}" },
             mode: params.publish_dir_mode,
-            pattern: '*.{txt}'
+            pattern: '*.{txt,report,fastq.gz}'
         ]
     }
 
@@ -289,7 +289,7 @@ process {
         publishDir = [
             path: { "${params.outdir}/centrifuge/${meta.db_name}" },
             mode: params.publish_dir_mode,
-            pattern: '*.txt'
+            pattern: '*.{txt,sam,gz}'
         ]
         ext.args = { "${meta.db_params}" }
         ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
diff --git a/conf/test.config b/conf/test.config
index a5244f9..573db42 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -36,6 +36,10 @@ params {
     run_metaphlan3                        = true
     run_centrifuge                        = true
     run_diamond                           = true
+    malt_save_reads                       = true
+    kraken2_save_reads                    = true
+    centrifuge_save_reads                 = true
+    diamond_save_reads                    = true
 }
 
 process {
diff --git a/modules.json b/modules.json
index a55c88b..5cad32e 100644
--- a/modules.json
+++ b/modules.json
@@ -28,7 +28,7 @@
                 "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
             },
             "diamond/blastx": {
-                "git_sha": "42564565b934eeb2449e35ec97ed13ff2a67f1de"
+                "git_sha": "bd3bfe0817246082525ab93707976676b1fe208b"
             },
             "fastp": {
                 "git_sha": "d0a1cbb703a130c19f6796c3fce24fbe7dfce789"
@@ -43,7 +43,7 @@
                 "git_sha": "538dbac98ba9c8f799536cd5a617195501439457"
             },
             "kraken2/kraken2": {
-                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
+                "git_sha": "abe025677cdd805cc93032341ab19885473c1a07"
             },
             "malt/run": {
                 "git_sha": "72b96f4e504eef673f2b5c13560a9d90b669129b"
@@ -80,4 +80,4 @@
             }
         }
     }
-}
+}
\ No newline at end of file
diff --git a/modules/nf-core/modules/diamond/blastx/main.nf b/modules/nf-core/modules/diamond/blastx/main.nf
index 6703c1e..d327227 100644
--- a/modules/nf-core/modules/diamond/blastx/main.nf
+++ b/modules/nf-core/modules/diamond/blastx/main.nf
@@ -2,21 +2,26 @@ process DIAMOND_BLASTX {
     tag "$meta.id"
     label 'process_medium'
 
-    // Dimaond is limited to v2.0.9 because there is not a
-    // singularity version higher than this at the current time.
-    conda (params.enable_conda ? "bioconda::diamond=2.0.9" : null)
+    conda (params.enable_conda ? "bioconda::diamond=2.0.15" : null)
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/diamond:2.0.9--hdcc8f71_0' :
-        'quay.io/biocontainers/diamond:2.0.9--hdcc8f71_0' }"
+        'https://depot.galaxyproject.org/singularity/diamond:2.0.15--hb97b32f_0' :
+        'quay.io/biocontainers/diamond:2.0.15--hb97b32f_0' }"
 
     input:
     tuple val(meta), path(fasta)
     path db
-    val outext
+    val out_ext
+    val blast_columns
 
     output:
-    tuple val(meta), path('*.{blast,xml,txt,daa,sam,tsv,paf}'), emit: output
-    path "versions.yml"           , emit: versions
+    tuple val(meta), path('*.blast'), optional: true, emit: blast
+    tuple val(meta), path('*.xml')  , optional: true, emit: xml
+    tuple val(meta), path('*.txt')  , optional: true, emit: txt
+    tuple val(meta), path('*.daa')  , optional: true, emit: daa
+    tuple val(meta), path('*.sam')  , optional: true, emit: sam
+    tuple val(meta), path('*.tsv')  , optional: true, emit: tsv
+    tuple val(meta), path('*.paf')  , optional: true, emit: paf
+    path "versions.yml"                               , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -24,7 +29,8 @@ process DIAMOND_BLASTX {
     script:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
-    switch ( outext ) {
+    def columns = blast_columns ? "${blast_columns}" : ''
+    switch ( out_ext ) {
         case "blast": outfmt = 0; break
         case "xml": outfmt = 5; break
         case "txt": outfmt = 6; break
@@ -32,6 +38,11 @@ process DIAMOND_BLASTX {
         case "sam": outfmt = 101; break
         case "tsv": outfmt = 102; break
         case "paf": outfmt = 103; break
+        default: // warn first, so the message reports the unrecognised value rather than the 'txt' fallback
+            log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)");
+            outfmt = '6';
+            out_ext = 'txt';
+            break
     }
     """
     DB=`find -L ./ -name "*.dmnd" | sed 's/.dmnd//'`
@@ -41,9 +52,9 @@ process DIAMOND_BLASTX {
         --threads $task.cpus \\
         --db \$DB \\
         --query $fasta \\
-        --outfmt ${outfmt} \\
+        --outfmt ${outfmt} ${columns} \\
         $args \\
-        --out ${prefix}.${outext}
+        --out ${prefix}.${out_ext}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/nf-core/modules/diamond/blastx/meta.yml b/modules/nf-core/modules/diamond/blastx/meta.yml
index 5ee2d55..2dcd7bc 100644
--- a/modules/nf-core/modules/diamond/blastx/meta.yml
+++ b/modules/nf-core/modules/diamond/blastx/meta.yml
@@ -28,7 +28,7 @@ input:
       type: directory
       description: Directory containing the nucelotide blast database
       pattern: "*"
-  - outext:
+  - out_ext:
       type: string
       description: |
         Specify the type of output file to be generated. `blast` corresponds to
@@ -38,10 +38,34 @@ input:
       pattern: "blast|xml|txt|daa|sam|tsv|paf"
 
 output:
+  - blast:
+      type: file
+      description: File containing blastx hits
+      pattern: "*.{blast}"
+  - xml:
+      type: file
+      description: File containing blastx hits in XML format
+      pattern: "*.{xml}"
   - txt:
       type: file
-      description: File containing blastx hits
-      pattern: "*.{blastx.txt}"
+      description: File containing hits in tabular BLAST format.
+      pattern: "*.{txt}"
+  - daa:
+      type: file
+      description: File containing hits in DAA format
+      pattern: "*.{daa}"
+  - sam:
+      type: file
+      description: File containing aligned reads in SAM format
+      pattern: "*.{sam}"
+  - tsv:
+      type: file
+      description: Tab separated file containing taxonomic classification of hits
+      pattern: "*.{tsv}"
+  - paf:
+      type: file
+      description: File containing aligned reads in pairwise mapping format (PAF)
+      pattern: "*.{paf}"
   - versions:
       type: file
       description: File containing software versions
diff --git a/modules/nf-core/modules/kraken2/kraken2/main.nf b/modules/nf-core/modules/kraken2/kraken2/main.nf
index 3ec5df5..d400023 100644
--- a/modules/nf-core/modules/kraken2/kraken2/main.nf
+++ b/modules/nf-core/modules/kraken2/kraken2/main.nf
@@ -10,12 +10,15 @@ process KRAKEN2_KRAKEN2 {
     input:
     tuple val(meta), path(reads)
     path  db
+    val save_output_fastqs
+    val save_reads_assignment
 
     output:
-    tuple val(meta), path('*classified*')  , emit: classified
-    tuple val(meta), path('*unclassified*'), emit: unclassified
-    tuple val(meta), path('*report.txt')   , emit: txt
-    path "versions.yml"                    , emit: versions
+    tuple val(meta), path('*.classified{.,_}*')  , optional:true, emit: classified_reads_fastq
+    tuple val(meta), path('*.unclassified{.,_}*'), optional:true, emit: unclassified_reads_fastq
+    tuple val(meta), path('*classifiedreads.txt'), optional:true, emit: classified_reads_assignment
+    tuple val(meta), path('*report.txt')                        , emit: report
+    path "versions.yml"                                         , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -26,19 +29,25 @@ process KRAKEN2_KRAKEN2 {
     def paired       = meta.single_end ? "" : "--paired"
     def classified   = meta.single_end ? "${prefix}.classified.fastq"   : "${prefix}.classified#.fastq"
     def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq"
+    def classified_command = save_output_fastqs ? "--classified-out ${classified}" : ""
+    def unclassified_command = save_output_fastqs ? "--unclassified-out ${unclassified}" : ""
+    def readclassification_command = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : ""
+    def compress_reads_command = save_output_fastqs ? "pigz -p $task.cpus *.fastq" : ""
+
     """
     kraken2 \\
         --db $db \\
         --threads $task.cpus \\
-        --unclassified-out $unclassified \\
-        --classified-out $classified \\
         --report ${prefix}.kraken2.report.txt \\
         --gzip-compressed \\
+        $unclassified_command \\
+        $classified_command \\
+        $readclassification_command \\
         $paired \\
         $args \\
         $reads
 
-    pigz -p $task.cpus *.fastq
+    $compress_reads_command
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/nf-core/modules/kraken2/kraken2/meta.yml b/modules/nf-core/modules/kraken2/kraken2/meta.yml
index 9d6a385..7129fe3 100644
--- a/modules/nf-core/modules/kraken2/kraken2/meta.yml
+++ b/modules/nf-core/modules/kraken2/kraken2/meta.yml
@@ -27,25 +27,40 @@ input:
   - db:
       type: directory
       description: Kraken2 database
+  - save_output_fastqs:
+      type: boolean
+      description: |
+        If true, optional commands are added to save classified and unclassified reads
+        as fastq files
+  - save_reads_assignment:
+      type: boolean
+      description: |
+        If true, an optional command is added to save a file reporting the taxonomic
+        classification of each input read
 output:
   - meta:
       type: map
       description: |
         Groovy Map containing sample information
         e.g. [ id:'test', single_end:false ]
-  - classified:
+  - classified_reads_fastq:
       type: file
       description: |
-        Reads classified to belong to any of the taxa
+        Reads classified as belonging to any of the taxa
         on the Kraken2 database.
       pattern: "*{fastq.gz}"
-  - unclassified:
+  - unclassified_reads_fastq:
       type: file
       description: |
-        Reads not classified to belong to any of the taxa
+        Reads not classified to any of the taxa
         on the Kraken2 database.
       pattern: "*{fastq.gz}"
-  - txt:
+  - classified_reads_assignment:
+      type: file
+      description: |
+        Kraken2 output file indicating the taxonomic assignment of
+        each input read
+  - report:
       type: file
       description: |
         Kraken2 report containing stats about classified
diff --git a/nextflow.config b/nextflow.config
index ca9e280..3f76d53 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -94,16 +94,17 @@ params {
     // MALT
     run_malt                   = false
     malt_mode                  = 'BlastN'
-    malt_generatemegansummary = false
+    malt_generate_megansummary = false
+    malt_save_reads            = false
 
     // kraken2
-    run_kraken2                = false
+    run_kraken2                     = false
+    kraken2_save_reads              = false
+    kraken2_save_readclassification = false
 
     // centrifuge
     run_centrifuge             = false
-    centrifuge_save_unaligned  = false
-    centrifuge_save_aligned    = false
-    centrifuge_sam_format      = false
+    centrifuge_save_reads      = false
 
     // metaphlan3
     run_metaphlan3             = false
@@ -114,7 +115,8 @@ params {
 
     // diamond
     run_diamond                = false
-    diamond_output_format      = 'txt'
+    diamond_output_format      = 'tsv'  // TSV is only format with taxonomic information apparently
+    diamond_save_reads         = false // this will override the DIAMOND output format, so no taxonomic profile is generated!
 }
 
 // Load base.config by default for all pipelines
diff --git a/nextflow_schema.json b/nextflow_schema.json
index ab2108e..bb4b759 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,7 +10,10 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir"],
+            "required": [
+                "input",
+                "outdir"
+            ],
             "properties": {
                 "input": {
                     "type": "string",
@@ -173,7 +176,14 @@
                     "description": "Method used to save pipeline results to output directory.",
                     "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                     "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
+                        "symlink",
+                        "rellink",
+                        "link",
+                        "copy",
+                        "copyNoFollow",
+                        "move"
+                    ],
                     "hidden": true
                 },
                 "email_on_fail": {
@@ -278,15 +288,6 @@
         "run_centrifuge": {
             "type": "boolean"
         },
-        "centrifuge_save_unaligned": {
-            "type": "boolean"
-        },
-        "centrifuge_save_aligned": {
-            "type": "boolean"
-        },
-        "centrifuge_sam_format": {
-            "type": "boolean"
-        },
         "run_metaphlan3": {
             "type": "boolean",
             "description": "Enable MetaPhlAn for taxonomic profiling"
@@ -294,7 +295,10 @@
         "shortread_clipmerge_tool": {
             "type": "string",
             "default": "fastp",
-            "enum": ["fastp", "adapterremoval"]
+            "enum": [
+                "fastp",
+                "adapterremoval"
+            ]
         },
         "shortread_clipmerge_skipadaptertrim": {
             "type": "boolean"
@@ -335,7 +339,10 @@
         "shortread_complexityfilter_prinseqplusplus_mode": {
             "type": "string",
             "default": "entropy",
-            "enum": ["entropy", "dust"]
+            "enum": [
+                "entropy",
+                "dust"
+            ]
         },
         "shortread_complexityfilter_prinseqplusplus_dustscore": {
             "type": "number",
@@ -385,13 +392,20 @@
         "run_kaiju": {
             "type": "boolean"
         },
-        "malt_generatemegansummary": {
+        "malt_generate_megansummary": {
             "type": "boolean"
         },
         "kaiju_taxon_name": {
             "type": "string",
             "default": "species",
-            "enum": ["phylum", "class", "order", "family", "genus", "species"]
+            "enum": [
+                "phylum",
+                "class",
+                "order",
+                "family",
+                "genus",
+                "species"
+            ]
         },
         "run_diamond": {
             "type": "boolean"
@@ -399,11 +413,34 @@
         "diamond_output_format": {
             "type": "string",
             "default": "tsv",
-            "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"]
+            "enum": [
+                "blast",
+                "xml",
+                "txt",
+                "daa",
+                "sam",
+                "tsv",
+                "paf"
+            ]
         },
         "longread_hostremoval_index": {
             "type": "string",
             "default": "None"
+        },
+        "malt_save_reads": {
+            "type": "boolean"
+        },
+        "kraken2_save_reads": {
+            "type": "boolean"
+        },
+        "kraken2_save_readclassification": {
+            "type": "boolean"
+        },
+        "centrifuge_save_reads": {
+            "type": "boolean"
+        },
+        "diamond_save_reads": {
+            "type": "boolean"
         }
     }
-}
+}
\ No newline at end of file
diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index 7fb3ce9..18ea7fa 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -65,10 +65,18 @@ workflow PROFILING {
         ch_input_for_malt =  ch_input_for_profiling.malt
                                 .filter { it[0]['instrument_platform'] == 'ILLUMINA' }
                                 .map {
-                                    it ->
-                                        def temp_meta =  [ id: it[2]['db_name']]  + it[2]
-                                        def db = it[3]
-                                        [ temp_meta, it[1], db ]
+                                    meta, reads, db_meta, db ->
+                                        def sam_format = params.malt_save_reads ? ' --alignments' : ""
+                                        // TODO No MALT SAM?
+                                        // TODO check all aligned reads published
+                                        // TODO try turning on/off aligned reads
+                                        // The id must come from the database meta (as it[2]['db_name'] did before); the sample meta is not expected to carry 'db_name', which presumably produced the `MALT_RUN (null)` task tag
+                                        def temp_meta = [ id: db_meta['db_name'] ]
+                                        def new_db_meta = db_meta.clone()
+                                        new_db_meta['db_params'] = db_meta['db_params'] + sam_format
+                                        def new_meta = temp_meta + new_db_meta
+
+                                        [ new_meta, reads, db ]
                                 }
                                 .groupTuple(by: [0,2])
                                 .multiMap {
@@ -92,7 +100,7 @@ workflow PROFILING {
                                         [ meta_new, rma ]
                                 }
 
-        MEGAN_RMA2INFO (ch_maltrun_for_megan, params.malt_generatemegansummary )
+        MEGAN_RMA2INFO (ch_maltrun_for_megan, params.malt_generate_megansummary )
         ch_multiqc_files   = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([])  )
         ch_versions        = ch_versions.mix( MALT_RUN.out.versions.first(), MEGAN_RMA2INFO.out.versions.first() )
         ch_raw_profiles    = ch_raw_profiles.mix( MEGAN_RMA2INFO.out.txt )
@@ -108,10 +116,10 @@ workflow PROFILING {
                                         db: it[3]
                                 }
 
-        KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db  )
-        ch_multiqc_files   = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])  )
+        KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db, params.kraken2_save_reads, params.kraken2_save_readclassification )
+        ch_multiqc_files   = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report.collect{it[1]}.ifEmpty([])  )
         ch_versions        = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() )
-        ch_raw_profiles    = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.txt )
+        ch_raw_profiles    = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.report )
 
     }
 
@@ -128,7 +136,7 @@ workflow PROFILING {
                                         db: it[3]
                                 }
 
-        CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format  )
+        CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_reads, params.centrifuge_save_reads, params.centrifuge_save_reads  )
         CENTRIFUGE_KREPORT (CENTRIFUGE_CENTRIFUGE.out.results, ch_input_for_centrifuge.db)
         ch_versions        = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() )
         ch_raw_profiles    = ch_raw_profiles.mix( CENTRIFUGE_KREPORT.out.kreport )
@@ -180,9 +188,13 @@ workflow PROFILING {
                                         db: it[3]
                                 }
 
-        DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, params.diamond_output_format )
+        // DIAMOND writes a single output file per run, so requesting saved reads
+        // replaces the configured output format with SAM (no taxonomic profile is produced)
+        ch_diamond_reads_format = params.diamond_save_reads ? 'sam' : params.diamond_output_format
+
+        DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, ch_diamond_reads_format, [] )
         ch_versions        = ch_versions.mix( DIAMOND_BLASTX.out.versions.first() )
-        ch_raw_profiles    = ch_raw_profiles.mix( DIAMOND_BLASTX.out.output )
+        ch_raw_profiles    = ch_raw_profiles.mix( DIAMOND_BLASTX.out.tsv )
 
     }
 
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 7a6cd09..c319296 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -29,6 +29,8 @@ if (params.hostremoval_reference           ) { ch_reference = file(params.hostre
 if (params.shortread_hostremoval_index     ) { ch_shortread_reference_index = file(params.shortread_hostremoval_index    ) } else { ch_shortread_reference_index = [] }
 if (params.longread_hostremoval_index      ) { ch_longread_reference_index  = file(params.longread_hostremoval_index     ) } else { ch_longread_reference_index  = [] }
 
+if (params.diamond_save_reads              ) log.warn "[nf-core/taxprofiler] DIAMOND only allows output of a single format. Only aligned reads in SAM format will be produced, no taxonomic profiles will be available."
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     CONFIG FILES