From 8126d16dee7e60f80d9cfb159db9199435bccd03 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Fri, 29 Apr 2022 21:59:42 +0200
Subject: [PATCH 1/3] Add draft version of DIAMOND

---
 CITATIONS.md                                  |  4 ++
 conf/modules.config                           | 50 ++++++++++-------
 conf/test.config                              |  5 ++
 docs/usage.md                                 |  3 ++
 modules.json                                  |  5 +-
 .../nf-core/modules/diamond/blastx/main.nf    | 53 +++++++++++++++++++
 .../nf-core/modules/diamond/blastx/meta.yml   | 52 ++++++++++++++++++
 nextflow.config                               |  4 ++
 nextflow_schema.json                          | 51 +++++++++++++++---
 subworkflows/local/profiling.nf               | 16 ++++++
 10 files changed, 216 insertions(+), 27 deletions(-)
 create mode 100644 modules/nf-core/modules/diamond/blastx/main.nf
 create mode 100644 modules/nf-core/modules/diamond/blastx/meta.yml

diff --git a/CITATIONS.md b/CITATIONS.md
index 02621d9..fd8c52a 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -52,6 +52,10 @@
 
   > Kim, Daehwan, Li Song, Florian P. Breitwieser, and Steven L. Salzberg. 2016. “Centrifuge: Rapid and Sensitive Classification of Metagenomic Sequences.” Genome Research 26 (12): 1721-29. doi: 10.1101/gr.210641.116.
 
+- [DIAMOND](https://doi.org/10.1038/nmeth.3176)
+
+  > Buchfink, Benjamin, Chao Xie, and Daniel H. Huson. 2015. “Fast and Sensitive Protein Alignment Using DIAMOND.” Nature Methods 12 (1): 59-60. doi: 10.1038/nmeth.3176.
+
 ## Software packaging/containerisation tools
 
 - [Anaconda](https://anaconda.com)
diff --git a/conf/modules.config b/conf/modules.config
index a72561d..9b081b5 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -264,6 +264,36 @@ process {
         ]
     }
 
+    withName: KAIJU_KAIJU {
+        publishDir = [
+            path: { "${params.outdir}/kaiju/${meta.db_name}" },
+            mode: params.publish_dir_mode,
+            pattern: '*.tsv'
+        ]
+        ext.args = { "${meta.db_params}" }
+        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+    }
+
+   withName: KAIJU_KAIJU2TABLE {
+        ext.args = { "${meta.db_params}" }
+        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+        publishDir = [
+            path: { "${params.outdir}/kaiju/${meta.db_name}" },
+            mode: params.publish_dir_mode,
+            pattern: '*.{txt}'
+        ]
+    }
+
+   withName: DIAMOND_BLASTX {
+        ext.args = { "${meta.db_params}" }
+        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+        publishDir = [
+            path: { "${params.outdir}/diamond/${meta.db_name}" },
+            mode: params.publish_dir_mode,
+            pattern: '*.{blast,xml,txt,daa,sam,tsv,paf}'
+        ]
+    }
+
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
         publishDir = [
             path: { "${params.outdir}/pipeline_info" },
@@ -279,24 +309,4 @@ process {
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
-
-    withName: KAIJU_KAIJU {
-        publishDir = [
-            path: { "${params.outdir}/kaiju/${meta.db_name}" },
-            mode: params.publish_dir_mode,
-            pattern: '*.tsv'
-        ]
-        ext.args = { "${meta.db_params}" }
-        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
-    }
-
-        withName: KAIJU_KAIJU2TABLE {
-        ext.args = { "${meta.db_params}" }
-        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
-        publishDir = [
-            path: { "${params.outdir}/kaiju/${meta.db_name}" },
-            mode: params.publish_dir_mode,
-            pattern: '*.{txt}'
-        ]
-    }
 }
diff --git a/conf/test.config b/conf/test.config
index ecf55bd..35d3539 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -34,6 +34,11 @@ params {
     run_malt                              = true
     run_metaphlan3                        = true
     run_centrifuge                        = true
+    run_diamond                           = true
+    // TODO: setting to txt here as does not require taxonomy in database.
+    // Should consider re-building our test database but with the required
+    // taxonomy files, but this may make large files (prot2access: 9GB)
+    diamond_output_format                 = 'txt'
 }
 
 process {
diff --git a/docs/usage.md b/docs/usage.md
index 5d3268b..002f4f2 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -128,6 +128,9 @@ Expected (uncompressed) database files for each tool are as follows:
   - `kaiju_db_*.fmi`
   - `nodes.dmp`
   - `names.dmp`
+- **DIAMOND** output of `diamond makedb`. Note: requires building with taxonomy files
+  to generate taxonomic profile. See [DIAMOND documentation](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#makedb-options).  A file named:
+  - `<database_name>.dmnd`
 
 ## Running the pipeline
 
diff --git a/modules.json b/modules.json
index ffcde5d..7b659c1 100644
--- a/modules.json
+++ b/modules.json
@@ -27,6 +27,9 @@
             "custom/dumpsoftwareversions": {
                 "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
             },
+            "diamond/blastx": {
+                "git_sha": "42564565b934eeb2449e35ec97ed13ff2a67f1de"
+            },
             "fastp": {
                 "git_sha": "d0a1cbb703a130c19f6796c3fce24fbe7dfce789"
             },
@@ -65,4 +68,4 @@
             }
         }
     }
-}
+}
\ No newline at end of file
diff --git a/modules/nf-core/modules/diamond/blastx/main.nf b/modules/nf-core/modules/diamond/blastx/main.nf
new file mode 100644
index 0000000..6703c1e
--- /dev/null
+++ b/modules/nf-core/modules/diamond/blastx/main.nf
@@ -0,0 +1,60 @@
+process DIAMOND_BLASTX {
+    tag "$meta.id"
+    label 'process_medium'
+
+    // DIAMOND is pinned to v2.0.9 because there is no
+    // singularity image for a newer version at the current time.
+    conda (params.enable_conda ? "bioconda::diamond=2.0.9" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/diamond:2.0.9--hdcc8f71_0' :
+        'quay.io/biocontainers/diamond:2.0.9--hdcc8f71_0' }"
+
+    input:
+    tuple val(meta), path(fasta)
+    path db
+    val outext
+
+    output:
+    tuple val(meta), path('*.{blast,xml,txt,daa,sam,tsv,paf}'), emit: output
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // Translate the requested output extension into DIAMOND's numeric --outfmt code.
+    // Declared with `def` so the value is local to this task and not shared via the script binding.
+    def outfmt
+    switch ( outext ) {
+        case "blast": outfmt = 0; break
+        case "xml": outfmt = 5; break
+        case "txt": outfmt = 6; break
+        case "daa": outfmt = 100; break
+        case "sam": outfmt = 101; break
+        case "tsv": outfmt = 102; break
+        case "paf": outfmt = 103; break
+        default:
+            error "DIAMOND_BLASTX: unsupported output extension '${outext}'. Expected one of: blast, xml, txt, daa, sam, tsv, paf"
+    }
+    // The database is located by searching the staged inputs for a *.dmnd file;
+    // only the trailing ".dmnd" extension is stripped before passing it to --db.
+    """
+    DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'`
+
+    diamond \\
+        blastx \\
+        --threads $task.cpus \\
+        --db \$DB \\
+        --query $fasta \\
+        --outfmt ${outfmt} \\
+        $args \\
+        --out ${prefix}.${outext}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/modules/diamond/blastx/meta.yml b/modules/nf-core/modules/diamond/blastx/meta.yml
new file mode 100644
index 0000000..5ee2d55
--- /dev/null
+++ b/modules/nf-core/modules/diamond/blastx/meta.yml
@@ -0,0 +1,52 @@
+name: diamond_blastx
+description: Queries a DIAMOND database using blastx mode
+keywords:
+  - fasta
+  - diamond
+  - blastx
+  - DNA sequence
+tools:
+  - diamond:
+      description: Accelerated BLAST compatible local sequence aligner
+      homepage: https://github.com/bbuchfink/diamond
+      documentation: https://github.com/bbuchfink/diamond/wiki
+      tool_dev_url: https://github.com/bbuchfink/diamond
+      doi: "doi:10.1038/s41592-021-01101-x"
+      licence: ["GPL v3.0"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fasta:
+      type: file
+      description: Input fasta file containing query sequences
+      pattern: "*.{fa,fasta}"
+  - db:
+      type: directory
+      description: Directory containing the DIAMOND protein database (`*.dmnd` file)
+      pattern: "*"
+  - outext:
+      type: string
+      description: |
+        Specify the type of output file to be generated. `blast` corresponds to
+        BLAST pairwise format. `xml` corresponds to BLAST xml format.
+        `txt` corresponds to BLAST tabular format. `tsv` corresponds to
+        taxonomic classification format.
+      pattern: "blast|xml|txt|daa|sam|tsv|paf"
+
+output:
+  - output:
+      type: file
+      description: File containing blastx hits in the format selected via `outext`
+      pattern: "*.{blast,xml,txt,daa,sam,tsv,paf}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@spficklin"
+  - "@jfy133"
diff --git a/nextflow.config b/nextflow.config
index 909da25..963a4a5 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -108,6 +108,10 @@ params {
     // kaiju
     run_kaiju                  = false
     kaiju_taxon_name           = 'species'
+
+    // diamond
+    run_diamond                = false
+    diamond_output_format      = 'tsv'
 }
 
 // Load base.config by default for all pipelines
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 83793e8..fc516d6 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,7 +10,10 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir"],
+            "required": [
+                "input",
+                "outdir"
+            ],
             "properties": {
                 "input": {
                     "type": "string",
@@ -173,7 +176,14 @@
                     "description": "Method used to save pipeline results to output directory.",
                     "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                     "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
+                        "symlink",
+                        "rellink",
+                        "link",
+                        "copy",
+                        "copyNoFollow",
+                        "move"
+                    ],
                     "hidden": true
                 },
                 "email_on_fail": {
@@ -294,7 +304,10 @@
         "shortread_clipmerge_tool": {
             "type": "string",
             "default": "fastp",
-            "enum": ["fastp", "adapterremoval"]
+            "enum": [
+                "fastp",
+                "adapterremoval"
+            ]
         },
         "shortread_clipmerge_skipadaptertrim": {
             "type": "boolean"
@@ -335,7 +348,10 @@
         "shortread_complexityfilter_prinseqplusplus_mode": {
             "type": "string",
             "default": "entropy",
-            "enum": ["entropy", "dust"]
+            "enum": [
+                "entropy",
+                "dust"
+            ]
         },
         "shortread_complexityfilter_prinseqplusplus_dustscore": {
             "type": "number",
@@ -388,7 +404,30 @@
         "kaiju_taxon_name": {
             "type": "string",
             "default": "species",
-            "enum": ["phylum", "class", "order", "family", "genus", "species"]
+            "enum": [
+                "phylum",
+                "class",
+                "order",
+                "family",
+                "genus",
+                "species"
+            ]
+        },
+        "run_diamond": {
+            "type": "boolean"
+        },
+        "diamond_output_format": {
+            "type": "string",
+            "default": "tsv",
+            "enum": [
+                "blast",
+                "xml",
+                "txt",
+                "daa",
+                "sam",
+                "tsv",
+                "paf"
+            ]
         }
     }
-}
+}
\ No newline at end of file
diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index 1f1d4da..9389e19 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -10,6 +10,8 @@ include { CENTRIFUGE_KREPORT          } from '../../modules/nf-core/modules/cent
 include { METAPHLAN3                  } from '../../modules/nf-core/modules/metaphlan3/main'
 include { KAIJU_KAIJU                 } from '../../modules/nf-core/modules/kaiju/kaiju/main'
 include { KAIJU_KAIJU2TABLE           } from '../../modules/nf-core/modules/kaiju/kaiju2table/main'
+include { DIAMOND_BLASTX              } from '../../modules/nf-core/modules/diamond/blastx/main'
+
 
 workflow PROFILING {
     take:
@@ -41,6 +43,7 @@ workflow PROFILING {
                 metaphlan3: it[2]['tool'] == 'metaphlan3'
                 centrifuge: it[2]['tool'] == 'centrifuge'
                 kaiju: it[2]['tool'] == 'kaiju'
+                diamond: it[2]['tool'] == 'diamond'
                 unknown: true
             }
 
@@ -109,6 +112,13 @@ workflow PROFILING {
                                     db: it[3]
                             }
 
+    ch_input_for_diamond = ch_input_for_profiling.diamond
+                            .multiMap {
+                                it ->
+                                    reads: [it[0] + it[2], it[1]]
+                                    db: it[3]
+                            }
+
     /*
         RUN PROFILING
     */
@@ -163,6 +173,12 @@ workflow PROFILING {
         ch_raw_profiles = ch_raw_profiles.mix( KAIJU_KAIJU2TABLE.out.summary )
     }
 
+    if ( params.run_diamond ) {
+        DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, params.diamond_output_format )
+        ch_versions        = ch_versions.mix( DIAMOND_BLASTX.out.versions.first() )
+        ch_raw_profiles    = ch_raw_profiles.mix( DIAMOND_BLASTX.out.output )
+    }
+
     emit:
     profiles = ch_raw_profiles    // channel: [ val(meta), [ reads ] ] - should be text files or biom
     versions = ch_versions          // channel: [ versions.yml ]

From a4a9b161d80914f7f1964b4a86c58229d4d884b3 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Fri, 29 Apr 2022 22:02:44 +0200
Subject: [PATCH 2/3] Linting

---
 conf/modules.config  |  4 ++--
 docs/usage.md        |  2 +-
 modules.json         |  2 +-
 nextflow_schema.json | 45 +++++++-------------------------------------
 4 files changed, 11 insertions(+), 42 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 9b081b5..d8fb382 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -274,7 +274,7 @@ process {
         ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
     }
 
-   withName: KAIJU_KAIJU2TABLE {
+    withName: KAIJU_KAIJU2TABLE {
         ext.args = { "${meta.db_params}" }
         ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
         publishDir = [
@@ -284,7 +284,7 @@ process {
         ]
     }
 
-   withName: DIAMOND_BLASTX {
+    withName: DIAMOND_BLASTX {
         ext.args = { "${meta.db_params}" }
         ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
         publishDir = [
diff --git a/docs/usage.md b/docs/usage.md
index 002f4f2..cee2bb6 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -129,7 +129,7 @@ Expected (uncompressed) database files for each tool are as follows:
   - `nodes.dmp`
   - `names.dmp`
 - **DIAMOND** output of `diamond makedb`. Note: requires building with taxonomy files
-  to generate taxonomic profile. See [DIAMOND documentation](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#makedb-options).  A file named:
+  to generate taxonomic profile. See [DIAMOND documentation](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#makedb-options). A file named:
   - `<database_name>.dmnd`
 
 ## Running the pipeline
diff --git a/modules.json b/modules.json
index 7b659c1..a65926c 100644
--- a/modules.json
+++ b/modules.json
@@ -68,4 +68,4 @@
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/nextflow_schema.json b/nextflow_schema.json
index fc516d6..f429d1b 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,10 +10,7 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": [
-                "input",
-                "outdir"
-            ],
+            "required": ["input", "outdir"],
             "properties": {
                 "input": {
                     "type": "string",
@@ -176,14 +173,7 @@
                     "description": "Method used to save pipeline results to output directory.",
                     "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                     "fa_icon": "fas fa-copy",
-                    "enum": [
-                        "symlink",
-                        "rellink",
-                        "link",
-                        "copy",
-                        "copyNoFollow",
-                        "move"
-                    ],
+                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
                     "hidden": true
                 },
                 "email_on_fail": {
@@ -304,10 +294,7 @@
         "shortread_clipmerge_tool": {
             "type": "string",
             "default": "fastp",
-            "enum": [
-                "fastp",
-                "adapterremoval"
-            ]
+            "enum": ["fastp", "adapterremoval"]
         },
         "shortread_clipmerge_skipadaptertrim": {
             "type": "boolean"
@@ -348,10 +335,7 @@
         "shortread_complexityfilter_prinseqplusplus_mode": {
             "type": "string",
             "default": "entropy",
-            "enum": [
-                "entropy",
-                "dust"
-            ]
+            "enum": ["entropy", "dust"]
         },
         "shortread_complexityfilter_prinseqplusplus_dustscore": {
             "type": "number",
@@ -404,14 +388,7 @@
         "kaiju_taxon_name": {
             "type": "string",
             "default": "species",
-            "enum": [
-                "phylum",
-                "class",
-                "order",
-                "family",
-                "genus",
-                "species"
-            ]
+            "enum": ["phylum", "class", "order", "family", "genus", "species"]
         },
         "run_diamond": {
             "type": "boolean"
@@ -419,15 +396,7 @@
         "diamond_output_format": {
             "type": "string",
             "default": "tsv",
-            "enum": [
-                "blast",
-                "xml",
-                "txt",
-                "daa",
-                "sam",
-                "tsv",
-                "paf"
-            ]
+            "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"]
         }
     }
-}
\ No newline at end of file
+}

From 0630fce3b5ddb4db1b1932b2405e11ba9bd321e2 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Sat, 30 Apr 2022 08:11:40 +0200
Subject: [PATCH 3/3] Tweak based on official DIAMOND test-data

---
 conf/test.config | 4 ----
 nextflow.config  | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index 35d3539..a2464b2 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -35,10 +35,6 @@ params {
     run_metaphlan3                        = true
     run_centrifuge                        = true
     run_diamond                           = true
-    // TODO: setting to txt here as does not require taxonomy in database.
-    // Should consider re-building our test database but with the required
-    // taxonomy files, but this may make large files (prot2access: 9GB)
-    diamond_output_format                 = 'txt'
 }
 
 process {
diff --git a/nextflow.config b/nextflow.config
index 963a4a5..5644786 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -111,7 +111,7 @@ params {
 
     // diamond
     run_diamond                = false
-    diamond_output_format      = 'tsv'
+    diamond_output_format      = 'txt'
 }
 
 // Load base.config by default for all pipelines