From 9d7208504d1898757cfdbb3065718e6545940408 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Wed, 22 Jun 2022 13:43:45 +0200
Subject: [PATCH] Add gecco/run module (#1790)

* Add gecco/run module

* Fix container URLs

* Apply suggestions from code review

Co-authored-by: Jasmin F <73216762+jasmezz@users.noreply.github.com>
---
Review notes (ignored by `git am`, not part of the commit message):
- meta.yml: inputs `hmm` and `model_dir` now declare their kind with the
  `type:` key (was `file:`), matching every other entry in the file.
- meta.yml: output `json` pattern corrected to "*.json" (was "*.gbk");
  output `gbk` pattern aligned with the process glob "*_cluster_*.gbk".
- meta.yml: wording fixes in the `model_dir` and `json` descriptions.
- All fixes are one-for-one line substitutions, so hunk sizes and the
  diffstat below are unchanged.
- NOTE(review): the post-image blob id on the meta.yml `index` line was
  computed for the pre-fix content; it presumably only matters for a 3-way
  fallback -- confirm before relying on `git am -3`.
- NOTE(review): `def prefix` in main.nf is assigned but never used; outputs
  are named after the input file. Left untouched to keep behaviour and the
  test expectations identical.

 modules/gecco/run/main.nf               | 47 +++++++++++++++++
 modules/gecco/run/meta.yml              | 67 +++++++++++++++++++++++++
 tests/config/pytest_modules.yml         |  4 ++
 tests/modules/gecco/run/main.nf         | 17 +++++++
 tests/modules/gecco/run/nextflow.config |  5 ++
 tests/modules/gecco/run/test.yml        | 13 +++++
 6 files changed, 153 insertions(+)
 create mode 100644 modules/gecco/run/main.nf
 create mode 100644 modules/gecco/run/meta.yml
 create mode 100644 tests/modules/gecco/run/main.nf
 create mode 100644 tests/modules/gecco/run/nextflow.config
 create mode 100644 tests/modules/gecco/run/test.yml

diff --git a/modules/gecco/run/main.nf b/modules/gecco/run/main.nf
new file mode 100644
index 00000000..7103250c
--- /dev/null
+++ b/modules/gecco/run/main.nf
@@ -0,0 +1,47 @@
+process GECCO_RUN {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda (params.enable_conda ? "bioconda::gecco=0.9.2" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gecco:0.9.2--pyhdfd78af_0':
+        'quay.io/biocontainers/gecco:0.9.2--pyhdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(input), path(hmm)
+    path model_dir
+
+
+    output:
+    tuple val(meta), path("*.genes.tsv")    , emit: genes
+    tuple val(meta), path("*.features.tsv") , emit: features
+    tuple val(meta), path("*.clusters.tsv") , emit: clusters
+    tuple val(meta), path("*_cluster_*.gbk"), optional: true, emit: gbk
+    tuple val(meta), path("*.json")         , optional: true, emit: json
+
+    path "versions.yml"                     , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def custom_model = model_dir ? "--model ${model_dir}" : ""
+    def custom_hmm = hmm ? "--hmm ${hmm}" : ""
+    """
+    gecco \\
+        run \\
+        $args \\
+        -j $task.cpus \\
+        -o ./ \\
+        -g ${input} \\
+        $custom_model \\
+        $custom_hmm
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gecco: \$(echo \$(gecco --version) | cut -f 2 -d ' ' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/gecco/run/meta.yml b/modules/gecco/run/meta.yml
new file mode 100644
index 00000000..bfa55586
--- /dev/null
+++ b/modules/gecco/run/meta.yml
@@ -0,0 +1,67 @@
+name: "gecco_run"
+description: GECCO is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).
+keywords:
+  - bgc
+  - detection
+  - metagenomics
+  - contigs
+tools:
+  - "gecco":
+      description: "Biosynthetic Gene Cluster prediction with Conditional Random Fields."
+      homepage: "https://gecco.embl.de"
+      documentation: "https://gecco.embl.de"
+      tool_dev_url: "https://github.com/zellerlab/GECCO"
+      doi: "10.1101/2021.05.03.442509"
+      licence: "['GPL v3']"
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input:
+      type: file
+      description: A genomic file containing one or more sequences as input. Input type is any supported by Biopython (fasta, gbk, etc.)
+      pattern: "*"
+  - hmm:
+      type: file
+      description: Alternative HMM file(s) to use in HMMER format
+      pattern: "*.hmm"
+  - model_dir:
+      type: directory
+      description: Path to an alternative CRF (Conditional Random Fields) model to use
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - genes:
+      type: file
+      description: TSV file containing detected/predicted genes with BGC probability scores
+      pattern: "*.genes.tsv"
+  - features:
+      type: file
+      description: TSV file containing identified domains
+      pattern: "*.features.tsv"
+  - clusters:
+      type: file
+      description: TSV file containing coordinates of predicted clusters and BGC types
+      pattern: "*.clusters.tsv"
+  - gbk:
+      type: file
+      description: Per cluster GenBank file (if found) containing sequence with annotations
+      pattern: "*_cluster_*.gbk"
+  - json:
+      type: file
+      description: AntiSMASH v6 sideload JSON file (if --antismash-sideload supplied)
+      pattern: "*.json"
+
+authors:
+  - "@jfy133"
diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml
index 6bd5b230..017e7752 100644
--- a/tests/config/pytest_modules.yml
+++ b/tests/config/pytest_modules.yml
@@ -915,6 +915,10 @@ gatk4/variantrecalibrator:
   - modules/gatk4/variantrecalibrator/**
   - tests/modules/gatk4/variantrecalibrator/**
 
+gecco/run:
+  - modules/gecco/run/**
+  - tests/modules/gecco/run/**
+
 genescopefk:
   - modules/genescopefk/**
   - tests/modules/genescopefk/**
diff --git a/tests/modules/gecco/run/main.nf b/tests/modules/gecco/run/main.nf
new file mode 100644
index 00000000..42ff5f3d
--- /dev/null
+++ b/tests/modules/gecco/run/main.nf
@@ -0,0 +1,17 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { GECCO_RUN } from '../../../../modules/gecco/run/main.nf'
+
+workflow test_gecco_run {
+
+    input = [
+        [ id:'test', single_end:false ], // meta map
+        file(params.test_data['candidatus_portiera_aleyrodidarum']['genome']['genome_fasta'], checkIfExists: true),
+        []
+    ]
+    model_dir = []
+
+    GECCO_RUN ( input, model_dir )
+}
diff --git a/tests/modules/gecco/run/nextflow.config b/tests/modules/gecco/run/nextflow.config
new file mode 100644
index 00000000..50f50a7a
--- /dev/null
+++ b/tests/modules/gecco/run/nextflow.config
@@ -0,0 +1,5 @@
+process {
+
+    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
+
+}
\ No newline at end of file
diff --git a/tests/modules/gecco/run/test.yml b/tests/modules/gecco/run/test.yml
new file mode 100644
index 00000000..0fe67e68
--- /dev/null
+++ b/tests/modules/gecco/run/test.yml
@@ -0,0 +1,13 @@
+- name: gecco run test_gecco_run
+  command: nextflow run ./tests/modules/gecco/run -entry test_gecco_run -c ./tests/config/nextflow.config -c ./tests/modules/gecco/run/nextflow.config
+  tags:
+    - gecco/run
+    - gecco
+  files:
+    - path: output/gecco/NC_018507.1_cluster_1.gbk
+    - path: output/gecco/genome.clusters.tsv
+      md5sum: 6560ec765f6bbae50645896fd93b35cd
+    - path: output/gecco/genome.features.tsv
+      md5sum: 07492cf186003311ba9829056c65787f
+    - path: output/gecco/genome.genes.tsv
+      md5sum: 031a37dc603f8a5296f462608fd7bcc3