From 8126d16dee7e60f80d9cfb159db9199435bccd03 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Apr 2022 21:59:42 +0200 Subject: [PATCH 1/3] Add draft version of DIAMOND --- CITATIONS.md | 4 ++ conf/modules.config | 50 ++++++++++------- conf/test.config | 5 ++ docs/usage.md | 3 ++ modules.json | 5 +- .../nf-core/modules/diamond/blastx/main.nf | 53 +++++++++++++++++++ .../nf-core/modules/diamond/blastx/meta.yml | 52 ++++++++++++++++++ nextflow.config | 4 ++ nextflow_schema.json | 51 +++++++++++++++--- subworkflows/local/profiling.nf | 16 ++++++ 10 files changed, 216 insertions(+), 27 deletions(-) create mode 100644 modules/nf-core/modules/diamond/blastx/main.nf create mode 100644 modules/nf-core/modules/diamond/blastx/meta.yml diff --git a/CITATIONS.md b/CITATIONS.md index 02621d9..fd8c52a 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -52,6 +52,10 @@ > Kim, Daehwan, Li Song, Florian P. Breitwieser, and Steven L. Salzberg. 2016. “Centrifuge: Rapid and Sensitive Classification of Metagenomic Sequences.” Genome Research 26 (12): 1721-29. doi: 10.1101/gr.210641.116. +- [DIAMOND](https://doi.org/10.1038/nmeth.3176) + +> Buchfink, Benjamin, Chao Xie, and Daniel H. Huson. 2015. “Fast and Sensitive Protein Alignment Using DIAMOND.” Nature Methods 12 (1): 59-60. doi: 10.1038/nmeth.3176. + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/conf/modules.config b/conf/modules.config index a72561d..9b081b5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -264,6 +264,36 @@ process { ] } + withName: KAIJU_KAIJU { + publishDir = [ + path: { "${params.outdir}/kaiju/${meta.db_name}" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ] + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? 
{ "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + } + + withName: KAIJU_KAIJU2TABLE { + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/kaiju/${meta.db_name}" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } + + withName: DIAMOND_BLASTX { + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/diamond/${meta.db_name}" }, + mode: params.publish_dir_mode, + pattern: '*.{blast,xml,txt,daa,sam,tsv,paf}' + ] + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -279,24 +309,4 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - - withName: KAIJU_KAIJU { - publishDir = [ - path: { "${params.outdir}/kaiju/${meta.db_name}" }, - mode: params.publish_dir_mode, - pattern: '*.tsv' - ] - ext.args = { "${meta.db_params}" } - ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } - } - - withName: KAIJU_KAIJU2TABLE { - ext.args = { "${meta.db_params}" } - ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } - publishDir = [ - path: { "${params.outdir}/kaiju/${meta.db_name}" }, - mode: params.publish_dir_mode, - pattern: '*.{txt}' - ] - } } diff --git a/conf/test.config b/conf/test.config index ecf55bd..35d3539 100644 --- a/conf/test.config +++ b/conf/test.config @@ -34,6 +34,11 @@ params { run_malt = true run_metaphlan3 = true run_centrifuge = true + run_diamond = true + // TODO: setting to txt here as does not require taxonomy in database. 
+ // Should consider re-building our test database but with the required + // taxonomy files, but this may make large files (prot2access: 9GB) + diamond_output_format = 'txt' } process { diff --git a/docs/usage.md b/docs/usage.md index 5d3268b..002f4f2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -128,6 +128,9 @@ Expected (uncompressed) database files for each tool are as follows: - `kaiju_db_*.fmi` - `nodes.dmp` - `names.dmp` +- **DIAMOND** output of `diamond makedb`. Note: requires building with taxonomy files + to generate taxonomic profile. See [DIAMOND documentation](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#makedb-options). A file named: + - `.dmnd` ## Running the pipeline diff --git a/modules.json b/modules.json index ffcde5d..7b659c1 100644 --- a/modules.json +++ b/modules.json @@ -27,6 +27,9 @@ "custom/dumpsoftwareversions": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, + "diamond/blastx": { + "git_sha": "42564565b934eeb2449e35ec97ed13ff2a67f1de" + }, "fastp": { "git_sha": "d0a1cbb703a130c19f6796c3fce24fbe7dfce789" }, @@ -65,4 +68,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/diamond/blastx/main.nf b/modules/nf-core/modules/diamond/blastx/main.nf new file mode 100644 index 0000000..6703c1e --- /dev/null +++ b/modules/nf-core/modules/diamond/blastx/main.nf @@ -0,0 +1,53 @@ +process DIAMOND_BLASTX { + tag "$meta.id" + label 'process_medium' + + // Dimaond is limited to v2.0.9 because there is not a + // singularity version higher than this at the current time. + conda (params.enable_conda ? "bioconda::diamond=2.0.9" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/diamond:2.0.9--hdcc8f71_0' : + 'quay.io/biocontainers/diamond:2.0.9--hdcc8f71_0' }" + + input: + tuple val(meta), path(fasta) + path db + val outext + + output: + tuple val(meta), path('*.{blast,xml,txt,daa,sam,tsv,paf}'), emit: output + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + switch ( outext ) { + case "blast": outfmt = 0; break + case "xml": outfmt = 5; break + case "txt": outfmt = 6; break + case "daa": outfmt = 100; break + case "sam": outfmt = 101; break + case "tsv": outfmt = 102; break + case "paf": outfmt = 103; break + } + """ + DB=`find -L ./ -name "*.dmnd" | sed 's/.dmnd//'` + + diamond \\ + blastx \\ + --threads $task.cpus \\ + --db \$DB \\ + --query $fasta \\ + --outfmt ${outfmt} \\ + $args \\ + --out ${prefix}.${outext} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/diamond/blastx/meta.yml b/modules/nf-core/modules/diamond/blastx/meta.yml new file mode 100644 index 0000000..5ee2d55 --- /dev/null +++ b/modules/nf-core/modules/diamond/blastx/meta.yml @@ -0,0 +1,52 @@ +name: diamond_blastx +description: Queries a DIAMOND database using blastx mode +keywords: + - fasta + - diamond + - blastx + - DNA sequence +tools: + - diamond: + description: Accelerated BLAST compatible local sequence aligner + homepage: https://github.com/bbuchfink/diamond + documentation: https://github.com/bbuchfink/diamond/wiki + tool_dev_url: https://github.com/bbuchfink/diamond + doi: "doi:10.1038/s41592-021-01101-x" + licence: ["GPL v3.0"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file containing query sequences + pattern: "*.{fa,fasta}" + - db: + type: directory + description: Directory containing the nucelotide blast database + pattern: "*" + - outext: + type: string + description: | + Specify the type of output file to be generated. `blast` corresponds to + BLAST pairwise format. `xml` corresponds to BLAST xml format. + `txt` corresponds to to BLAST tabular format. `tsv` corresponds to + taxonomic classification format. + pattern: "blast|xml|txt|daa|sam|tsv|paf" + +output: + - txt: + type: file + description: File containing blastx hits + pattern: "*.{blastx.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@spficklin" + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 909da25..963a4a5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -108,6 +108,10 @@ params { // kaiju run_kaiju = false kaiju_taxon_name = 'species' + + // diamond + run_diamond = false + diamond_output_format = 'tsv' } // Load base.config by default for all pipelines diff --git a/nextflow_schema.json b/nextflow_schema.json index 83793e8..fc516d6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -294,7 +304,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -335,7 +348,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -388,7 +404,30 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": ["phylum", "class", "order", "family", "genus", "species"] + "enum": [ + "phylum", + "class", + "order", + "family", + "genus", + "species" + ] + }, + "run_diamond": { + "type": "boolean" + }, + "diamond_output_format": { + "type": "string", + "default": "tsv", + "enum": [ + "blast", + "xml", + "txt", + "daa", + "sam", + "tsv", + "paf" + ] } } -} +} \ No newline at end of file diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 1f1d4da..9389e19 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -10,6 +10,8 @@ include { CENTRIFUGE_KREPORT } from '../../modules/nf-core/modules/cent include { METAPHLAN3 } from '../../modules/nf-core/modules/metaphlan3/main' include { KAIJU_KAIJU } from '../../modules/nf-core/modules/kaiju/kaiju/main' include { KAIJU_KAIJU2TABLE } from '../../modules/nf-core/modules/kaiju/kaiju2table/main' +include { DIAMOND_BLASTX } from '../../modules/nf-core/modules/diamond/blastx/main' + workflow PROFILING { take: @@ -41,6 +43,7 @@ workflow PROFILING { metaphlan3: it[2]['tool'] == 
'metaphlan3' centrifuge: it[2]['tool'] == 'centrifuge' kaiju: it[2]['tool'] == 'kaiju' + diamond: it[2]['tool'] == 'diamond' unknown: true } @@ -109,6 +112,13 @@ workflow PROFILING { db: it[3] } + ch_input_for_diamond = ch_input_for_profiling.diamond + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + /* RUN PROFILING */ @@ -163,6 +173,12 @@ workflow PROFILING { ch_raw_profiles = ch_raw_profiles.mix( KAIJU_KAIJU2TABLE.out.summary ) } + if ( params.run_diamond ) { + DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, params.diamond_output_format ) + ch_versions = ch_versions.mix( DIAMOND_BLASTX.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( DIAMOND_BLASTX.out.output ) + } + emit: profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom versions = ch_versions // channel: [ versions.yml ] From a4a9b161d80914f7f1964b4a86c58229d4d884b3 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Apr 2022 22:02:44 +0200 Subject: [PATCH 2/3] Linting --- conf/modules.config | 4 ++-- docs/usage.md | 2 +- modules.json | 2 +- nextflow_schema.json | 45 +++++++------------------------------------- 4 files changed, 11 insertions(+), 42 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 9b081b5..d8fb382 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -274,7 +274,7 @@ process { ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } - withName: KAIJU_KAIJU2TABLE { + withName: KAIJU_KAIJU2TABLE { ext.args = { "${meta.db_params}" } ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ @@ -284,7 +284,7 @@ process { ] } - withName: DIAMOND_BLASTX { + withName: DIAMOND_BLASTX { ext.args = { "${meta.db_params}" } ext.prefix = params.perform_runmerging ? 
{ "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" } publishDir = [ diff --git a/docs/usage.md b/docs/usage.md index 002f4f2..cee2bb6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -129,7 +129,7 @@ Expected (uncompressed) database files for each tool are as follows: - `nodes.dmp` - `names.dmp` - **DIAMOND** output of `diamond makedb`. Note: requires building with taxonomy files - to generate taxonomic profile. See [DIAMOND documentation](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#makedb-options). A file named: + to generate taxonomic profile. See [DIAMOND documentation](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#makedb-options). A file named: - `.dmnd` ## Running the pipeline diff --git a/modules.json b/modules.json index 7b659c1..a65926c 100644 --- a/modules.json +++ b/modules.json @@ -68,4 +68,4 @@ } } } -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index fc516d6..f429d1b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -304,10 +294,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -348,10 +335,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -404,14 +388,7 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": [ - "phylum", - "class", - "order", - "family", - "genus", - "species" - ] + "enum": ["phylum", "class", "order", "family", "genus", "species"] }, "run_diamond": { "type": "boolean" @@ -419,15 +396,7 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": [ - "blast", - "xml", - "txt", - "daa", - "sam", - "tsv", - "paf" - ] + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] } } -} \ No newline at end of file +} From 0630fce3b5ddb4db1b1932b2405e11ba9bd321e2 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 30 Apr 2022 08:11:40 +0200 Subject: [PATCH 3/3] Tweak based on official DIAMOND test-data --- conf/test.config | 4 ---- nextflow.config | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/conf/test.config b/conf/test.config index 35d3539..a2464b2 100644 --- a/conf/test.config +++ b/conf/test.config @@ -35,10 +35,6 @@ params { run_metaphlan3 = true run_centrifuge = true run_diamond = true - // TODO: setting to txt here as does not require taxonomy in database. 
- // Should consider re-building our test database but with the required - // taxonomy files, but this may make large files (prot2access: 9GB) - diamond_output_format = 'txt' } process { diff --git a/nextflow.config b/nextflow.config index 963a4a5..5644786 100644 --- a/nextflow.config +++ b/nextflow.config @@ -111,7 +111,7 @@ params { // diamond run_diamond = false - diamond_output_format = 'tsv' + diamond_output_format = 'txt' } // Load base.config by default for all pipelines