From 3868c3ab4bce121bde7f15b9137a0beef59e5d98 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Tue, 5 Oct 2021 22:23:01 +0200 Subject: [PATCH] Add gtdbtk/classifywf module (#765) * initial commit [ci skip] * reuse the modules code from nf-core/mag [ci skip] * add contextual information for the module [ci skip] * add stubs to avoid downloading db [ci skip] * trigger test * iterate on tests [ci skip] * iterate tests [ci skip] * add bins [ci skip] * fix stubs [ci skip] * iteration on tests with stubs [ci skip] * use the existing pattern and fasta for input * accommodate the new version file format * use variable for the stub [ci skip] * update the versions file in meta.yml * Accommodate code review regarding publishDir function [ci skip] Co-authored-by: Harshil Patel * remove extra newline * use bioconda channel * update the description for filtered file * Apply suggestions from code review * Update main.nf * Update main.nf * Update modules/gtdbtk/classifywf/meta.yml Co-authored-by: Harshil Patel Co-authored-by: Robert A. 
Petit III Co-authored-by: Harshil Patel --- modules/gtdbtk/classifywf/functions.nf | 78 ++++++++++++++++++++++ modules/gtdbtk/classifywf/main.nf | 83 ++++++++++++++++++++++++ modules/gtdbtk/classifywf/meta.yml | 78 ++++++++++++++++++++++ tests/config/pytest_modules.yml | 4 ++ tests/modules/gtdbtk/classifywf/main.nf | 32 +++++++++ tests/modules/gtdbtk/classifywf/test.yml | 8 +++ 6 files changed, 283 insertions(+) create mode 100644 modules/gtdbtk/classifywf/functions.nf create mode 100644 modules/gtdbtk/classifywf/main.nf create mode 100644 modules/gtdbtk/classifywf/meta.yml create mode 100644 tests/modules/gtdbtk/classifywf/main.nf create mode 100644 tests/modules/gtdbtk/classifywf/test.yml diff --git a/modules/gtdbtk/classifywf/functions.nf b/modules/gtdbtk/classifywf/functions.nf new file mode 100644 index 00000000..85628ee0 --- /dev/null +++ b/modules/gtdbtk/classifywf/functions.nf @@ -0,0 +1,78 @@ +// +// Utility functions used in nf-core DSL2 module files +// + +// +// Extract name of software tool from process name using $task.process (lower-cased text before the first '_' of the last ':'-separated element) +// +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +// +// Extract name of module from process name using $task.process (the last ':'-separated element) +// +def getProcessName(task_process) { + return task_process.tokenize(':')[-1] +} + +// +// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules (publish_files is copied as-is, without a default) +// +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.args3 = args.args3 ?: '' + options.publish_by_meta = args.publish_by_meta ?: [] + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +// +// Tidy up and join elements of a list with '/' to return a path string +// +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty 
entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and leading/trailing slashes + return paths.join('/') +} + +// +// Function to save/publish module results: returns the publish path for a file (publish dir, optional meta-derived sub-directories, filename) or null to skip it +// +def saveFiles(Map args) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + + // Do not publish versions.yml unless running from pytest workflow + if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { + return null + } + if (ioptions.publish_by_meta) { + def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta + for (key in key_list) { + if (args.meta && key instanceof String) { + def path = key + if (args.meta.containsKey(key)) { + path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] + } + path = path instanceof String ? path : '' + path_list.add(path) + } + } + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } +} diff --git a/modules/gtdbtk/classifywf/main.nf b/modules/gtdbtk/classifywf/main.nf new file mode 100644 index 00000000..fdcef76a --- /dev/null +++ b/modules/gtdbtk/classifywf/main.nf @@ -0,0 +1,83 @@ +include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +def VERSION = '1.5.0' // When using stubs for the GTDB database, the version info isn't printed. 
+ +process GTDBTK_CLASSIFYWF { // Runs 'gtdbtk classify_wf' on the staged bins/ directory against the staged database/ directory + tag "${meta.assembler}-${meta.id}" + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? "bioconda::gtdbtk=1.5.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/gtdbtk:1.5.0--pyhdfd78af_0" + } else { + container "quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0" + } + + input: + tuple val(meta), path("bins/*") + tuple val(db_name), path("database/*") + + output: + path "gtdbtk.${meta.assembler}-${meta.id}.*.summary.tsv" , emit: summary + path "gtdbtk.${meta.assembler}-${meta.id}.*.classify.tree.gz" , emit: tree + path "gtdbtk.${meta.assembler}-${meta.id}.*.markers_summary.tsv", emit: markers + path "gtdbtk.${meta.assembler}-${meta.id}.*.msa.fasta.gz" , emit: msa + path "gtdbtk.${meta.assembler}-${meta.id}.*.user_msa.fasta" , emit: user_msa + path "gtdbtk.${meta.assembler}-${meta.id}.*.filtered.tsv" , emit: filtered + path "gtdbtk.${meta.assembler}-${meta.id}.log" , emit: log + path "gtdbtk.${meta.assembler}-${meta.id}.warnings.log" , emit: warnings + path "gtdbtk.${meta.assembler}-${meta.id}.failed_genomes.tsv" , emit: failed + path "versions.yml" , emit: versions + + script: + def pplacer_scratch = params.gtdbtk_pplacer_scratch ? 
"--scratch_dir pplacer_tmp" : "" // Optional gtdbtk flag; it is either empty or contains a space, so it must be quoted in the shell test below + """ + export GTDBTK_DATA_PATH="\${PWD}/database" + if [ "${pplacer_scratch}" != "" ] ; then + mkdir pplacer_tmp + fi + + gtdbtk classify_wf \\ + $options.args \\ + --genome_dir bins \\ + --prefix "gtdbtk.${meta.assembler}-${meta.id}" \\ + --out_dir "\${PWD}" \\ + --cpus $task.cpus \\ + --pplacer_cpus $params.gtdbtk_pplacer_cpus \\ + $pplacer_scratch \\ + --min_perc_aa $params.gtdbtk_min_perc_aa \\ + --min_af $params.gtdbtk_min_af + + gzip "gtdbtk.${meta.assembler}-${meta.id}".*.classify.tree "gtdbtk.${meta.assembler}-${meta.id}".*.msa.fasta + mv gtdbtk.log "gtdbtk.${meta.assembler}-${meta.id}.log" + mv gtdbtk.warnings.log "gtdbtk.${meta.assembler}-${meta.id}.warnings.log" + + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + ${getSoftwareName(task.process)}: \$(echo \$(gtdbtk --version -v 2>&1) | sed "s/gtdbtk: version //; s/ Copyright.*//") + END_VERSIONS + """ + + stub: + """ + touch gtdbtk.${meta.assembler}-${meta.id}.stub.summary.tsv + touch gtdbtk.${meta.assembler}-${meta.id}.stub.classify.tree.gz + touch gtdbtk.${meta.assembler}-${meta.id}.stub.markers_summary.tsv + touch gtdbtk.${meta.assembler}-${meta.id}.stub.msa.fasta.gz + touch gtdbtk.${meta.assembler}-${meta.id}.stub.user_msa.fasta + touch gtdbtk.${meta.assembler}-${meta.id}.stub.filtered.tsv + touch gtdbtk.${meta.assembler}-${meta.id}.log + touch gtdbtk.${meta.assembler}-${meta.id}.warnings.log + touch gtdbtk.${meta.assembler}-${meta.id}.failed_genomes.tsv + + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + ${getSoftwareName(task.process)}: \$(echo "$VERSION") + END_VERSIONS + """ +} diff --git a/modules/gtdbtk/classifywf/meta.yml b/modules/gtdbtk/classifywf/meta.yml new file mode 100644 index 00000000..d70de362 --- /dev/null +++ b/modules/gtdbtk/classifywf/meta.yml @@ -0,0 +1,78 @@ +name: gtdbtk_classifywf +description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal 
genomes based on the Genome Database Taxonomy GTDB. +keywords: + - GTDB taxonomy + - taxonomic classification +tools: + - gtdbtk: + description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. + homepage: https://ecogenomics.github.io/GTDBTk/ + documentation: https://ecogenomics.github.io/GTDBTk/ + tool_dev_url: https://github.com/Ecogenomics/GTDBTk + doi: "10.1093/bioinformatics/btz848" + licence: ['GNU General Public v3 (GPL v3)'] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, assembler:'spades' ] + - bins: + type: file + description: The binned fasta files from the assembler + pattern: "*.{fasta,fa}" + - database: + type: file + description: The local copy of the taxonomic database used by GTDB-tk (unzipped copy) + pattern: "*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - summary: + type: file + description: A TSV summary file for the classification + pattern: "*.{summary.tsv}" + - tree: + type: file + description: NJ or UPGMA tree in Newick format produced from a multiple sequence alignment + pattern: "*.{classify.tree.gz}" + - markers: + type: file + description: A TSV summary file of the lineage markers used for the classification. + pattern: "*.{markers_summary.tsv}" + - msa: + type: file + description: Multiple sequence alignments file. + pattern: "*.{msa.fasta.gz}" + - user_msa: + type: file + description: Multiple sequence alignments file for the user-provided files. + pattern: "*.{user_msa.fasta}" + - filtered: + type: file + description: A list of genomes with an insufficient number of amino acids in MSA. 
+ pattern: "*.{filtered.tsv}" + - log: + type: file + description: GTDB-tk log file + pattern: "*.{log}" + - warnings: + type: file + description: GTDB-tk warnings log file + pattern: "*.{warnings.log}" + - failed: + type: file + description: A TSV summary of the genomes which GTDB-tk failed to classify. + pattern: "*.{failed_genomes.tsv}" +authors: + - "@skrakau" + - "@abhi18av" diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml index af3645df..34a3889b 100644 --- a/tests/config/pytest_modules.yml +++ b/tests/config/pytest_modules.yml @@ -458,6 +458,10 @@ graphmap2/index: - modules/graphmap2/index/** - tests/modules/graphmap2/index/** +gtdbtk/classifywf: + - modules/gtdbtk/classifywf/** + - tests/modules/gtdbtk/classifywf/** + gubbins: - modules/gubbins/** - tests/modules/gubbins/** diff --git a/tests/modules/gtdbtk/classifywf/main.nf b/tests/modules/gtdbtk/classifywf/main.nf new file mode 100644 index 00000000..f52b0ccc --- /dev/null +++ b/tests/modules/gtdbtk/classifywf/main.nf @@ -0,0 +1,32 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +include { GTDBTK_CLASSIFYWF } from '../../../../modules/gtdbtk/classifywf/main.nf' addParams( options: [:] ) + +process STUB_GTDBTK_DATABASE { // Stub-only process: creates a placeholder database/gtdbtk_r202_data file in place of real reference data + output: + tuple val("gtdbtk_r202_data"), path("database/*"), emit: database + + stub: + """ + mkdir database + touch database/gtdbtk_r202_data + """ +} + +workflow test_gtdbtk_classifywf { // Passes three sarscov2 fasta files as bins together with the stubbed database + + STUB_GTDBTK_DATABASE() + + input = [ + [ id:'test', single_end:false, assembler:'SPADES' ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['scaffolds_fasta'], checkIfExists: true) + ] + ] + + GTDBTK_CLASSIFYWF ( input, STUB_GTDBTK_DATABASE.out.database ) +} diff --git a/tests/modules/gtdbtk/classifywf/test.yml b/tests/modules/gtdbtk/classifywf/test.yml new file mode 100644 index 
00000000..6d0f055e --- /dev/null +++ b/tests/modules/gtdbtk/classifywf/test.yml @@ -0,0 +1,8 @@ +- name: gtdbtk classifywf + command: nextflow run ./tests/modules/gtdbtk/classifywf -entry test_gtdbtk_classifywf -c tests/config/nextflow.config -stub-run + tags: + - gtdbtk + - gtdbtk/classifywf + files: + - path: output/gtdbtk/gtdbtk.SPADES-test.stub.summary.tsv + md5sum: d41d8cd98f00b204e9800998ecf8427e # MD5 of an empty file (the stub creates it with touch)