Add draft version of DIAMOND

2024-11-22 13:19:54 +00:00 · 2022-04-29 21:59:42 +02:00 · 2022-04-29 21:59:42 +02:00 · 8126d16dee
commit 8126d16dee
parent 120f86e5c7
10 changed files with 216 additions and 27 deletions
--- a/CITATIONS.md
+++ b/CITATIONS.md
@ -52,6 +52,10 @@
  > Kim, Daehwan, Li Song, Florian P. Breitwieser, and Steven L. Salzberg. 2016. “Centrifuge: Rapid and Sensitive Classification of Metagenomic Sequences.” Genome Research 26 (12): 1721-29. doi: 10.1101/gr.210641.116.
 - [DIAMOND](https://doi.org/10.1038/nmeth.3176)
 > Buchfink, Benjamin, Chao Xie, and Daniel H. Huson. 2015. “Fast and Sensitive Protein Alignment Using DIAMOND.” Nature Methods 12 (1): 59-60. doi: 10.1038/nmeth.3176.
 ## Software packaging/containerisation tools
 - [Anaconda](https://anaconda.com)
--- a/conf/modules.config
+++ b/conf/modules.config
@ -264,22 +264,6 @@ process {
        ]
    }
    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
        publishDir = [
            path: { "${params.outdir}/pipeline_info" },
            mode: params.publish_dir_mode,
            pattern: '*_versions.yml'
        ]
    }
    withName: MULTIQC {
        publishDir = [
            path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }
    withName: KAIJU_KAIJU {
        publishDir = [
            path: { "${params.outdir}/kaiju/${meta.db_name}" },
@ -299,4 +283,30 @@ process {
            pattern: '*.{txt}'
        ]
    }
   // DIAMOND blastx: per-database arguments, output naming and publishing rules.
   withName: DIAMOND_BLASTX {
        // Extra command-line arguments come from the `db_params` field of the task meta map.
        ext.args = { "${meta.db_params}" }
        // The run accession is left out of the prefix when `params.perform_runmerging` is set.
        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
        publishDir = [
            // Results are published into one sub-directory per database name.
            path: { "${params.outdir}/diamond/${meta.db_name}" },
            mode: params.publish_dir_mode,
            // Same extension list as the `output` glob of the DIAMOND_BLASTX process.
            pattern: '*.{blast,xml,txt,daa,sam,tsv,paf}'
        ]
    }
    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
        publishDir = [
            path: { "${params.outdir}/pipeline_info" },
            mode: params.publish_dir_mode,
            pattern: '*_versions.yml'
        ]
    }
    withName: MULTIQC {
        publishDir = [
            path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }
 }
--- a/conf/test.config
+++ b/conf/test.config
@ -34,6 +34,11 @@ params {
    run_malt                              = true
    run_metaphlan3                        = true
    run_centrifuge                        = true
    run_diamond                           = true
    // TODO: using 'txt' here because it does not require taxonomy in the database.
    // We should consider re-building our test database with the required
    // taxonomy files, although this may produce large files (prot2access: 9GB)
    diamond_output_format                 = 'txt'
 }
 process {
--- a/docs/usage.md
+++ b/docs/usage.md
@ -128,6 +128,9 @@ Expected (uncompressed) database files for each tool are as follows:
  - `kaiju_db_*.fmi`
  - `nodes.dmp`
  - `names.dmp`
 - **DIAMOND** output of `diamond makedb`. Note: the database must be built with taxonomy files
  to generate a taxonomic profile. See the [DIAMOND documentation](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#makedb-options). A file named:
  - `<database_name>.dmnd`
 ## Running the pipeline
--- a/modules.json
+++ b/modules.json
@ -27,6 +27,9 @@
            "custom/dumpsoftwareversions": {
                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
            },
            "diamond/blastx": {
                "git_sha": "42564565b934eeb2449e35ec97ed13ff2a67f1de"
            },
            "fastp": {
                "git_sha": "d0a1cbb703a130c19f6796c3fce24fbe7dfce789"
            },
--- a/modules/nf-core/modules/diamond/blastx/main.nf
+++ b/modules/nf-core/modules/diamond/blastx/main.nf
@ -0,0 +1,53 @@
 // Aligns query sequences against a DIAMOND database using `diamond blastx`.
 //
 // Inputs:
 //   meta/fasta - sample meta map and the query sequence file passed to --query
 //   db         - staged path that contains the `*.dmnd` database file
 //   outext     - output extension; selects the DIAMOND --outfmt code (see below)
 // Outputs:
 //   output     - the alignment file named `<prefix>.<outext>`
 //   versions   - versions.yml recording the DIAMOND version
 process DIAMOND_BLASTX {
    tag "$meta.id"
    label 'process_medium'
    // DIAMOND is limited to v2.0.9 because there is not a
    // singularity version higher than this at the current time.
    conda (params.enable_conda ? "bioconda::diamond=2.0.9" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/diamond:2.0.9--hdcc8f71_0' :
        'quay.io/biocontainers/diamond:2.0.9--hdcc8f71_0' }"
    input:
    tuple val(meta), path(fasta)
    path db
    val outext
    output:
    tuple val(meta), path('*.{blast,xml,txt,daa,sam,tsv,paf}'), emit: output
    path "versions.yml"           , emit: versions
    when:
    task.ext.when == null || task.ext.when
    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Translate the requested file extension into the numeric DIAMOND --outfmt code.
    def outfmt
    switch ( outext ) {
        case "blast": outfmt = 0; break
        case "xml": outfmt = 5; break
        case "txt": outfmt = 6; break
        case "daa": outfmt = 100; break
        case "sam": outfmt = 101; break
        case "tsv": outfmt = 102; break
        case "paf": outfmt = 103; break
        default:
            // Fail early with a clear message instead of an undefined `outfmt` later on.
            throw new IllegalArgumentException("Unsupported DIAMOND output format '${outext}'. Valid options: blast, xml, txt, daa, sam, tsv, paf")
    }
    """
    # Strip only the literal '.dmnd' suffix: the dot is escaped and the match is
    # anchored so that 'dmnd' occurring elsewhere in the path is left untouched.
    DB=\$(find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//')
    diamond \\
        blastx \\
        --threads $task.cpus \\
        --db \$DB \\
        --query $fasta \\
        --outfmt ${outfmt} \\
        $args \\
        --out ${prefix}.${outext}
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //')
    END_VERSIONS
    """
 }
--- a/modules/nf-core/modules/diamond/blastx/meta.yml
+++ b/modules/nf-core/modules/diamond/blastx/meta.yml
@ -0,0 +1,52 @@
 name: diamond_blastx
 description: Queries a DIAMOND database using blastx mode
 keywords:
  - fasta
  - diamond
  - blastx
  - DNA sequence
 tools:
  - diamond:
      description: Accelerated BLAST compatible local sequence aligner
      homepage: https://github.com/bbuchfink/diamond
      documentation: https://github.com/bbuchfink/diamond/wiki
      tool_dev_url: https://github.com/bbuchfink/diamond
      doi: "doi:10.1038/s41592-021-01101-x"
      licence: ["GPL v3.0"]
 input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - fasta:
      type: file
      description: Input fasta file containing query sequences
      pattern: "*.{fa,fasta}"
  - db:
      type: directory
      description: Directory containing the DIAMOND protein database (a .dmnd file)
      pattern: "*"
  - outext:
      type: string
      description: |
        Specify the type of output file to be generated. `blast` corresponds to
        BLAST pairwise format. `xml` corresponds to BLAST xml format.
        `txt` corresponds to BLAST tabular format. `tsv` corresponds to
        taxonomic classification format.
      pattern: "blast|xml|txt|daa|sam|tsv|paf"
 # Output channels; names and patterns mirror the `emit:` declarations in main.nf.
 output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - output:
      type: file
      description: File containing blastx hits, in the format selected with `outext`
      pattern: "*.{blast,xml,txt,daa,sam,tsv,paf}"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
 authors:
  - "@spficklin"
  - "@jfy133"
--- a/nextflow.config
+++ b/nextflow.config
@ -108,6 +108,10 @@ params {
    // kaiju
    run_kaiju                  = false
    kaiju_taxon_name           = 'species'
    // diamond
    run_diamond                = false
    diamond_output_format      = 'tsv'
 }
 // Load base.config by default for all pipelines
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -10,7 +10,10 @@
            "type": "object",
            "fa_icon": "fas fa-terminal",
            "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir"],
+            "required": [
                "input",
                "outdir"
            ],
            "properties": {
                "input": {
                    "type": "string",
@ -173,7 +176,14 @@
                    "description": "Method used to save pipeline results to output directory.",
                    "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                    "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
                        "symlink",
                        "rellink",
                        "link",
                        "copy",
                        "copyNoFollow",
                        "move"
                    ],
                    "hidden": true
                },
                "email_on_fail": {
@ -294,7 +304,10 @@
        "shortread_clipmerge_tool": {
            "type": "string",
            "default": "fastp",
-            "enum": ["fastp", "adapterremoval"]
+            "enum": [
                "fastp",
                "adapterremoval"
            ]
        },
        "shortread_clipmerge_skipadaptertrim": {
            "type": "boolean"
@ -335,7 +348,10 @@
        "shortread_complexityfilter_prinseqplusplus_mode": {
            "type": "string",
            "default": "entropy",
-            "enum": ["entropy", "dust"]
+            "enum": [
                "entropy",
                "dust"
            ]
        },
        "shortread_complexityfilter_prinseqplusplus_dustscore": {
            "type": "number",
@ -388,7 +404,30 @@
        "kaiju_taxon_name": {
            "type": "string",
            "default": "species",
-            "enum": ["phylum", "class", "order", "family", "genus", "species"]
+            "enum": [
                "phylum",
                "class",
                "order",
                "family",
                "genus",
                "species"
            ]
        },
        "run_diamond": {
            "type": "boolean"
        },
        "diamond_output_format": {
            "type": "string",
            "default": "tsv",
            "enum": [
                "blast",
                "xml",
                "txt",
                "daa",
                "sam",
                "tsv",
                "paf"
            ]
        }
    }
 }
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@ -10,6 +10,8 @@ include { CENTRIFUGE_KREPORT          } from '../../modules/nf-core/modules/cent
 include { METAPHLAN3                  } from '../../modules/nf-core/modules/metaphlan3/main'
 include { KAIJU_KAIJU                 } from '../../modules/nf-core/modules/kaiju/kaiju/main'
 include { KAIJU_KAIJU2TABLE           } from '../../modules/nf-core/modules/kaiju/kaiju2table/main'
 include { DIAMOND_BLASTX              } from '../../modules/nf-core/modules/diamond/blastx/main'
 workflow PROFILING {
    take:
@ -41,6 +43,7 @@ workflow PROFILING {
                metaphlan3: it[2]['tool'] == 'metaphlan3'
                centrifuge: it[2]['tool'] == 'centrifuge'
                kaiju: it[2]['tool'] == 'kaiju'
                diamond: it[2]['tool'] == 'diamond'
                unknown: true
            }
@ -109,6 +112,13 @@ workflow PROFILING {
                                    db: it[3]
                            }
    // Split the `diamond` branch into separate `reads` and `db` outputs via multiMap.
    // NOTE(review): presumably it[0] is the sample meta map and it[2] the database
    // meta map (combined with `+`), it[1] the reads and it[3] the database path —
    // confirm against the upstream channel structure.
    ch_input_for_diamond = ch_input_for_profiling.diamond
                            .multiMap {
                                it ->
                                    reads: [it[0] + it[2], it[1]]
                                    db: it[3]
                            }
    /*
        RUN PROFILING
    */
@ -163,6 +173,12 @@ workflow PROFILING {
        ch_raw_profiles = ch_raw_profiles.mix( KAIJU_KAIJU2TABLE.out.summary )
    }
    // DIAMOND blastx is executed only when enabled through `params.run_diamond`.
    if ( params.run_diamond ) {
        // The third argument selects the output file format/extension for the module.
        DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, params.diamond_output_format )
        // Only the first emitted versions file is mixed into the versions channel.
        ch_versions        = ch_versions.mix( DIAMOND_BLASTX.out.versions.first() )
        // The DIAMOND output files are added to the collection of raw profiles.
        ch_raw_profiles    = ch_raw_profiles.mix( DIAMOND_BLASTX.out.output )
    }
    emit:
    profiles = ch_raw_profiles    // channel: [ val(meta), [ reads ] ] - should be text files or biom
    versions = ch_versions          // channel: [ versions.yml ]