From 3ff54e620e9b9212a3bad5c687769b8c37e5b89d Mon Sep 17 00:00:00 2001
From: sofstam <sofia.stamouli@scilifelab.se>
Date: Thu, 24 Mar 2022 12:51:45 +0100
Subject: [PATCH] Add centrifuge classification

---
 conf/modules.config                         | 10 +++
 conf/test.config                            |  1 +
 modules.json                                |  5 +-
 modules/nf-core/modules/centrifuge/main.nf  | 63 ++++++++++++++++++
 modules/nf-core/modules/centrifuge/meta.yml | 73 +++++++++++++++++++++
 nextflow.config                             |  8 ++-
 subworkflows/local/db_check.nf              |  2 +-
 subworkflows/local/input_check.nf           |  3 +-
 workflows/taxprofiler.nf                    | 22 +++++--
 9 files changed, 179 insertions(+), 8 deletions(-)
 create mode 100644 modules/nf-core/modules/centrifuge/main.nf
 create mode 100644 modules/nf-core/modules/centrifuge/meta.yml

diff --git a/conf/modules.config b/conf/modules.config
index 29a5135..20e6bba 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -121,4 +121,14 @@ process {
         ]
     }
 
+    withName: CENTRIFUGE { // publishing and per-database settings for the CENTRIFUGE process
+        publishDir = [
+            path: { "${params.outdir}/centrifuge/${meta.db_name}" }, // one results folder per database name
+            mode: 'copy',
+            pattern: '*.{txt,sam,gz}' // '.gz' (not '.fastq.gz') so paired-end *.fastq.1.gz / *.fastq.2.gz outputs are published too
+        ]
+        ext.args = { "${meta.db_params}" } // extra centrifuge flags; presumably filled from the database sheet -- confirm
+        ext.prefix = { "${meta.id}-${meta.db_name}" } // output file prefix: <sample id>-<database name>
+    }
+
 }
diff --git a/conf/test.config b/conf/test.config
index 42d8de6..6fca9c0 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -29,5 +29,6 @@ params {
     run_kraken2         = true
     run_malt            = true
     shortread_clipmerge = true
+    run_centrifuge      = true // enable the CENTRIFUGE step for the test profile
 
 }
diff --git a/modules.json b/modules.json
index 673a69b..b9dfc87 100644
--- a/modules.json
+++ b/modules.json
@@ -29,6 +29,9 @@
             "porechop": {
                 "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046"
+            },
+            "centrifuge": {
+                "git_sha": "ea41a8a6f761b9993d857570e872abaae3fea555"
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/modules/nf-core/modules/centrifuge/main.nf b/modules/nf-core/modules/centrifuge/main.nf
new file mode 100644
index 0000000..7eb566d
--- /dev/null
+++ b/modules/nf-core/modules/centrifuge/main.nf
@@ -0,0 +1,63 @@
+process CENTRIFUGE { // classify reads with centrifuge, then derive a kraken-style report via centrifuge-kreport
+    tag "$meta.id"
+    label 'process_high'
+
+    conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' :
+        'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }"
+
+    input:
+    tuple val(meta), path(reads) // meta.single_end selects '-U' versus '-1/-2' in the script
+    path db                      // database archive; unpacked in the script with 'tar -xf'
+    val save_unaligned           // truthy: keep unmapped reads (--un-gz / --un-conc-gz)
+    val save_aligned             // truthy: keep mapped reads (--al-gz / --al-conc-gz)
+    val sam_format               // truthy: add --out-fmt 'sam' to the centrifuge call
+
+    output:
+    tuple val(meta), path('*.report.txt')                , emit: report // leading dot keeps '<prefix>.kreport.txt' out of this channel
+    tuple val(meta), path('*results.txt')                , emit: results
+    tuple val(meta), path('*.kreport.txt')               , emit: kreport
+    tuple val(meta), path('*.sam')                       , optional: true, emit: sam
+    tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz')   , optional: true, emit: fastq_mapped
+    tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped
+    path "versions.yml"                                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def paired = meta.single_end ? "-U ${reads}" :  "-1 ${reads[0]} -2 ${reads[1]}"
+    def db_name = db.toString().replace(".tar.gz","") // NOTE(review): assumes the index basename inside the archive equals the archive name -- confirm
+    def unaligned = ''
+    def aligned = ''
+    if (meta.single_end) {
+        unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : ''
+        aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : ''
+    } else {
+        unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : ''
+        aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : ''
+    }
+    def sam_output = sam_format ? "--out-fmt 'sam'" : '' // NOTE(review): no file in this script is named '*.sam'; confirm the 'sam' emit is ever populated
+    """
+    tar -xf $db
+    centrifuge \\
+        -x $db_name \\
+        -p $task.cpus \\
+        $paired \\
+        --report-file ${prefix}.report.txt \\
+        -S ${prefix}.results.txt \\
+        $unaligned \\
+        $aligned \\
+        $sam_output \\
+        $args
+    centrifuge-kreport -x $db_name ${prefix}.results.txt > ${prefix}.kreport.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        centrifuge: \$( centrifuge --version  | sed -n 1p | sed 's/^.*centrifuge-class version //')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/modules/centrifuge/meta.yml b/modules/nf-core/modules/centrifuge/meta.yml
new file mode 100644
index 0000000..3adf0e2
--- /dev/null
+++ b/modules/nf-core/modules/centrifuge/meta.yml
@@ -0,0 +1,80 @@
+name: centrifuge
+description: Classifies metagenomic sequence data
+keywords:
+  - classify
+  - metagenomics
+  - fastq
+  - db
+tools:
+  - centrifuge:
+      description: Centrifuge is a classifier for metagenomic sequences.
+      homepage: https://ccb.jhu.edu/software/centrifuge/
+      documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml
+      doi: 10.1101/gr.210641.116
+      licence: ["GPL v3"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+        respectively.
+  - db:
+      type: file
+      description: Centrifuge database in .tar.gz format
+      pattern: "*.tar.gz"
+  - save_unaligned:
+      type: value
+      description: If true unmapped fastq files are saved
+  - save_aligned:
+      type: value
+      description: If true mapped fastq files are saved
+  - sam_format:
+      type: value
+      description: If true centrifuge is run with --out-fmt 'sam'
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - report:
+      type: file
+      description: |
+        File containing a classification summary
+      pattern: "*.report.txt"
+  - results:
+      type: file
+      description: |
+        File containing classification results
+      pattern: "*.results.txt"
+  - kreport:
+      type: file
+      description: |
+        File containing kraken-style report from centrifuge
+        out files.
+      pattern: "*.kreport.txt"
+  - sam:
+      type: file
+      description: Optional SAM file, emitted only if the task produces one
+      pattern: "*.sam"
+  - fastq_unmapped:
+      type: file
+      description: Unmapped fastq files
+      pattern: "*.unmapped.fastq{,.1,.2}.gz"
+  - fastq_mapped:
+      type: file
+      description: Mapped fastq files
+      pattern: "*.mapped.fastq{,.1,.2}.gz"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@sofstam"
+  - "@jfy133"
+  - "@sateeshperi"
diff --git a/nextflow.config b/nextflow.config
index 5f7aec6..5bd8f39 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -56,7 +56,7 @@ params {
 
     // FASTQ preprocessing
     shortread_clipmerge           = false
-    shortread_excludeunmerged        = true
+    shortread_excludeunmerged     = true
     longread_clip                 = false
 
     // MALT
@@ -65,6 +65,12 @@ params {
 
     // kraken2
     run_kraken2                = false
+
+    // centrifuge: switch for the profiler plus the three value inputs handed to the CENTRIFUGE process
+    run_centrifuge             = false // gates the CENTRIFUGE call in the workflow
+    save_unaligned             = false // passed as save_unaligned (--un-gz / --un-conc-gz when true)
+    save_aligned               = false // passed as save_aligned (--al-gz / --al-conc-gz when true)
+    sam_format                 = false // passed as sam_format (--out-fmt 'sam' when true)
 }
 
 // Load base.config by default for all pipelines
diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 890e373..28268c3 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -21,7 +21,7 @@ workflow DB_CHECK {
 
     ch_dbs_for_untar = parsed_samplesheet
         .branch {
-            untar: it[1].toString().endsWith(".tar.gz")
+            untar: it[1].toString().endsWith(".tar.gz") && it[0]['tool']!="centrifuge" // centrifuge archives fall through to 'skip': the CENTRIFUGE process runs 'tar -xf' on them itself
             skip: true
         }
 
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 4501386..b64e31e 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -67,8 +67,9 @@ def create_fastq_channel(LinkedHashMap row) {
             if (!file(row.fastq_2).exists()) {
                 exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
             }
-            fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
+            fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] // paired-end: both mates travel together with the sample meta
         }
+
     }
     return fastq_meta
 }
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 6fc5450..ea3ef18 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -58,7 +58,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/
 include { CAT_FASTQ                   } from '../modules/nf-core/modules/cat/fastq/main'
 include { MALT_RUN                    } from '../modules/nf-core/modules/malt/run/main'
 include { KRAKEN2_KRAKEN2             } from '../modules/nf-core/modules/kraken2/kraken2/main'
-
+include { CENTRIFUGE                  } from '../modules/nf-core/modules/centrifuge/main'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -149,9 +149,10 @@ workflow TAXPROFILER {
             .combine(DB_CHECK.out.dbs)
             .dump(tag: "reads_plus_db")
             .branch {
-                malt:    it[2]['tool'] == 'malt'
-                kraken2: it[2]['tool'] == 'kraken2'
-                unknown: true
+                malt:       it[2]['tool'] == 'malt'
+                kraken2:    it[2]['tool'] == 'kraken2'
+                centrifuge: it[2]['tool'] == 'centrifuge' // read back below as ch_input_for_profiling.centrifuge
+                unknown:    true
             }
 
     //
@@ -184,6 +185,15 @@ workflow TAXPROFILER {
                                     db: it[3]
                             }
 
+    // Centrifuge is run once per (sample, database) combination, so each tuple is split into a reads input and a db input
+    ch_input_for_centrifuge =  ch_input_for_profiling.centrifuge
+                               .dump(tag: "input for centrifuge")
+                               .multiMap {
+                                    it ->
+                                        reads: [ it[0] + it[2], it[1] ] // merge it[0] (presumably the sample meta -- confirm) with the database meta it[2]; it[1] carries the reads
+                                        db: it[3] // database path, emitted as a separate channel
+                                }
+
     //
     // RUN PROFILING
     //
@@ -195,6 +205,10 @@ workflow TAXPROFILER {
         KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db  )
     }
 
+    if ( params.run_centrifuge ) { // opt-in: run_centrifuge defaults to false in nextflow.config
+        CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.save_unaligned, params.save_aligned, params.sam_format  )
+    }
+
     //
     // MODULE: MultiQC
     //