Add draft version of DIAMOND

2024-11-22 06:59:54 +00:00 · 2022-04-29 21:59:42 +02:00 · 2022-04-29 21:59:42 +02:00 · 8126d16dee
commit 8126d16dee
parent 120f86e5c7
10 changed files with 216 additions and 27 deletions
--- a/CITATIONS.md
+++ b/CITATIONS.md
@ -52,6 +52,10 @@

  > Kim, Daehwan, Li Song, Florian P. Breitwieser, and Steven L. Salzberg. 2016. “Centrifuge: Rapid and Sensitive Classification of Metagenomic Sequences.” Genome Research 26 (12): 1721-29. doi: 10.1101/gr.210641.116.

+- [DIAMOND](https://doi.org/10.1038/nmeth.3176)
+
+  > Buchfink, Benjamin, Chao Xie, and Daniel H. Huson. 2015. “Fast and Sensitive Protein Alignment Using DIAMOND.” Nature Methods 12 (1): 59-60. doi: 10.1038/nmeth.3176.
+
 ## Software packaging/containerisation tools

 - [Anaconda](https://anaconda.com)
--- a/conf/modules.config
+++ b/conf/modules.config
@ -264,22 +264,6 @@ process {
        ]
    }

-    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
-        publishDir = [
-            path: { "${params.outdir}/pipeline_info" },
-            mode: params.publish_dir_mode,
-            pattern: '*_versions.yml'
-        ]
-    }
-
-    withName: MULTIQC {
-        publishDir = [
-            path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
    withName: KAIJU_KAIJU {
        publishDir = [
            path: { "${params.outdir}/kaiju/${meta.db_name}" },
@ -299,4 +283,30 @@ process {
            pattern: '*.{txt}'
        ]
    }
+
+    withName: DIAMOND_BLASTX {  // DIAMOND blastx: extra CLI args, output naming and publishing
+        ext.args = { "${meta.db_params}" }  // database-specific parameters carried in the meta map
+        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }  // run accession is left out of the name when run merging is enabled
+        publishDir = [
+            path: { "${params.outdir}/diamond/${meta.db_name}" },
+            mode: params.publish_dir_mode,
+            pattern: '*.{blast,xml,txt,daa,sam,tsv,paf}'
+        ]
+    }
+
+    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
+        publishDir = [
+            path: { "${params.outdir}/pipeline_info" },
+            mode: params.publish_dir_mode,
+            pattern: '*_versions.yml'
+        ]
+    }
+
+    withName: MULTIQC {
+        publishDir = [
+            path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
 }
--- a/conf/test.config
+++ b/conf/test.config
@ -34,6 +34,11 @@ params {
    run_malt                              = true
    run_metaphlan3                        = true
    run_centrifuge                        = true
+    run_diamond                           = true
+    // TODO: set to 'txt' here because this format does not require taxonomy
+    // information in the database. We should consider re-building our test database
+    // with the required taxonomy files, but these may be large (prot2access: 9GB)
+    diamond_output_format                 = 'txt'
 }

 process {
--- a/docs/usage.md
+++ b/docs/usage.md
@ -128,6 +128,9 @@ Expected (uncompressed) database files for each tool are as follows:
  - `kaiju_db_*.fmi`
  - `nodes.dmp`
  - `names.dmp`
+- **DIAMOND** output of `diamond makedb`. Note: the database must be built with taxonomy files
+  in order to generate a taxonomic profile. See the [DIAMOND documentation](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#makedb-options). A file named:
+  - `<database_name>.dmnd`

 ## Running the pipeline

--- a/modules.json
+++ b/modules.json
@ -27,6 +27,9 @@
            "custom/dumpsoftwareversions": {
                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
            },
+            "diamond/blastx": {
+                "git_sha": "42564565b934eeb2449e35ec97ed13ff2a67f1de"
+            },
            "fastp": {
                "git_sha": "d0a1cbb703a130c19f6796c3fce24fbe7dfce789"
            },
--- a/modules/nf-core/modules/diamond/blastx/main.nf
+++ b/modules/nf-core/modules/diamond/blastx/main.nf
@ -0,0 +1,53 @@
+// Runs `diamond blastx` on one query file against a DIAMOND database, writing
+// `<prefix>.<outext>` plus a versions.yml that records the DIAMOND version.
+// `outext` selects both the output file extension and the `--outfmt` code.
+process DIAMOND_BLASTX {
+    tag "$meta.id"
+    label 'process_medium'
+
+    // DIAMOND is limited to v2.0.9 because there is not a
+    // singularity version higher than this at the current time.
+    conda (params.enable_conda ? "bioconda::diamond=2.0.9" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/diamond:2.0.9--hdcc8f71_0' :
+        'quay.io/biocontainers/diamond:2.0.9--hdcc8f71_0' }"
+
+    input:
+    tuple val(meta), path(fasta)
+    path db
+    val outext
+
+    output:
+    tuple val(meta), path('*.{blast,xml,txt,daa,sam,tsv,paf}'), emit: output
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // Translate the requested extension into DIAMOND's numeric --outfmt code.
+    def formats = [blast: 0, xml: 5, txt: 6, daa: 100, sam: 101, tsv: 102, paf: 103]
+    def outfmt = formats[outext as String]
+    // Fail with a readable message rather than an undefined `outfmt` variable.
+    if (outfmt == null) error "DIAMOND_BLASTX: unsupported outext '${outext}' (expected one of: ${formats.keySet().join(', ')})"
+    // The sed expression below strips only a literal, trailing '.dmnd' suffix.
+    """
+    DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'`
+
+    diamond \\
+        blastx \\
+        --threads $task.cpus \\
+        --db \$DB \\
+        --query $fasta \\
+        --outfmt ${outfmt} \\
+        $args \\
+        --out ${prefix}.${outext}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //')
+    END_VERSIONS
+    """
+}
--- a/modules/nf-core/modules/diamond/blastx/meta.yml
+++ b/modules/nf-core/modules/diamond/blastx/meta.yml
@ -0,0 +1,52 @@
+name: diamond_blastx
+description: Queries a DIAMOND database using blastx mode
+keywords:
+  - fasta
+  - diamond
+  - blastx
+  - DNA sequence
+tools:
+  - diamond:
+      description: Accelerated BLAST compatible local sequence aligner
+      homepage: https://github.com/bbuchfink/diamond
+      documentation: https://github.com/bbuchfink/diamond/wiki
+      tool_dev_url: https://github.com/bbuchfink/diamond
+      doi: "doi:10.1038/s41592-021-01101-x"
+      licence: ["GPL v3.0"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fasta:
+      type: file
+      description: Input fasta file containing query sequences
+      pattern: "*.{fa,fasta}"
+  - db:
+      type: directory
+      description: Directory containing the DIAMOND protein database (`.dmnd` file)
+      pattern: "*"
+  - outext:
+      type: string
+      description: |
+        Specify the type of output file to be generated. `blast` corresponds to
+        BLAST pairwise format. `xml` corresponds to BLAST xml format.
+        `txt` corresponds to BLAST tabular format. `tsv` corresponds to
+        taxonomic classification format.
+      pattern: "blast|xml|txt|daa|sam|tsv|paf"
+
+output:
+  - output:
+      type: file
+      description: File containing blastx hits
+      pattern: "*.{blast,xml,txt,daa,sam,tsv,paf}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@spficklin"
+  - "@jfy133"
--- a/nextflow.config
+++ b/nextflow.config
@ -108,6 +108,10 @@ params {
    // kaiju
    run_kaiju                  = false
    kaiju_taxon_name           = 'species'
+
+    // diamond
+    run_diamond                = false
+    diamond_output_format      = 'tsv'
 }

 // Load base.config by default for all pipelines
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -10,7 +10,10 @@
            "type": "object",
            "fa_icon": "fas fa-terminal",
            "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir"],
+            "required": [
+                "input",
+                "outdir"
+            ],
            "properties": {
                "input": {
                    "type": "string",
@ -173,7 +176,14 @@
                    "description": "Method used to save pipeline results to output directory.",
                    "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                    "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
+                        "symlink",
+                        "rellink",
+                        "link",
+                        "copy",
+                        "copyNoFollow",
+                        "move"
+                    ],
                    "hidden": true
                },
                "email_on_fail": {
@ -294,7 +304,10 @@
        "shortread_clipmerge_tool": {
            "type": "string",
            "default": "fastp",
-            "enum": ["fastp", "adapterremoval"]
+            "enum": [
+                "fastp",
+                "adapterremoval"
+            ]
        },
        "shortread_clipmerge_skipadaptertrim": {
            "type": "boolean"
@ -335,7 +348,10 @@
        "shortread_complexityfilter_prinseqplusplus_mode": {
            "type": "string",
            "default": "entropy",
-            "enum": ["entropy", "dust"]
+            "enum": [
+                "entropy",
+                "dust"
+            ]
        },
        "shortread_complexityfilter_prinseqplusplus_dustscore": {
            "type": "number",
@ -388,7 +404,30 @@
        "kaiju_taxon_name": {
            "type": "string",
            "default": "species",
-            "enum": ["phylum", "class", "order", "family", "genus", "species"]
+            "enum": [
+                "phylum",
+                "class",
+                "order",
+                "family",
+                "genus",
+                "species"
+            ]
+        },
+        "run_diamond": {
+            "type": "boolean"
+        },
+        "diamond_output_format": {
+            "type": "string",
+            "default": "tsv",
+            "enum": [
+                "blast",
+                "xml",
+                "txt",
+                "daa",
+                "sam",
+                "tsv",
+                "paf"
+            ]
        }
    }
 }
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@ -10,6 +10,8 @@ include { CENTRIFUGE_KREPORT          } from '../../modules/nf-core/modules/cent
 include { METAPHLAN3                  } from '../../modules/nf-core/modules/metaphlan3/main'
 include { KAIJU_KAIJU                 } from '../../modules/nf-core/modules/kaiju/kaiju/main'
 include { KAIJU_KAIJU2TABLE           } from '../../modules/nf-core/modules/kaiju/kaiju2table/main'
+include { DIAMOND_BLASTX              } from '../../modules/nf-core/modules/diamond/blastx/main'
+

 workflow PROFILING {
    take:
@ -41,6 +43,7 @@ workflow PROFILING {
                metaphlan3: it[2]['tool'] == 'metaphlan3'
                centrifuge: it[2]['tool'] == 'centrifuge'
                kaiju: it[2]['tool'] == 'kaiju'
+                diamond: it[2]['tool'] == 'diamond'
                unknown: true
            }

@ -109,6 +112,13 @@ workflow PROFILING {
                                    db: it[3]
                            }

+    ch_input_for_diamond = ch_input_for_profiling.diamond
+                            .multiMap {
+                                it ->
+                                    reads: [it[0] + it[2], it[1]]
+                                    db: it[3]
+                            }
+
    /*
        RUN PROFILING
    */
@ -163,6 +173,12 @@ workflow PROFILING {
        ch_raw_profiles = ch_raw_profiles.mix( KAIJU_KAIJU2TABLE.out.summary )
    }

+    if ( params.run_diamond ) {
+        DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, params.diamond_output_format )
+        ch_versions        = ch_versions.mix( DIAMOND_BLASTX.out.versions.first() )
+        ch_raw_profiles    = ch_raw_profiles.mix( DIAMOND_BLASTX.out.output )
+    }
+
    emit:
    profiles = ch_raw_profiles    // channel: [ val(meta), [ reads ] ] - should be text files or biom
    versions = ch_versions          // channel: [ versions.yml ]