Add gecco/run module (#1790)

* Add gecco/run module

* Fix container URLs

* Apply suggestions from code review

Co-authored-by: Jasmin F <73216762+jasmezz@users.noreply.github.com>
This commit is contained in:
James A. Fellows Yates 2022-06-22 13:43:45 +02:00 committed by GitHub
parent 280eec5317
commit 9d7208504d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 153 additions and 0 deletions

47
modules/gecco/run/main.nf Normal file
View file

@ -0,0 +1,47 @@
// Runs `gecco run` on one input sequence file and collects the gene, feature
// and cluster tables, plus optional per-cluster GenBank and JSON files.
process GECCO_RUN {
    tag "${meta.id}"
    label 'process_low'

    conda (params.enable_conda ? "bioconda::gecco=0.9.2" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/gecco:0.9.2--pyhdfd78af_0':
        'quay.io/biocontainers/gecco:0.9.2--pyhdfd78af_0' }"

    input:
    // hmm and model_dir may be passed as empty values; their flags are then omitted
    tuple val(meta), path(input), path(hmm)
    path model_dir

    output:
    tuple val(meta), path("*.genes.tsv")                     , emit: genes
    tuple val(meta), path("*.features.tsv")                  , emit: features
    tuple val(meta), path("*.clusters.tsv")                  , emit: clusters
    tuple val(meta), path("*_cluster_*.gbk"), optional: true , emit: gbk
    tuple val(meta), path("*.json")         , optional: true , emit: json
    path "versions.yml"                                      , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Only add --hmm / --model when a non-empty value was staged for them
    def custom_hmm   = hmm ? "--hmm ${hmm}" : ""
    def custom_model = model_dir ? "--model ${model_dir}" : ""
    def args         = task.ext.args ?: ''
    // NOTE(review): prefix is resolved here but not referenced by the command
    // below, so task.ext.prefix currently has no effect on output file names.
    def prefix       = task.ext.prefix ?: "${meta.id}"
    """
    gecco \\
        run \\
        ${args} \\
        -j ${task.cpus} \\
        -o ./ \\
        -g ${input} \\
        ${custom_model} \\
        ${custom_hmm}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        gecco: \$(echo \$(gecco --version) | cut -f 2 -d ' ' )
    END_VERSIONS
    """
}

View file

@ -0,0 +1,67 @@
# Module metadata for gecco/run: describes the tool and every input/output
# channel element declared by the GECCO_RUN process.
name: "gecco_run"
description: GECCO is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).
keywords:
  - bgc
  - detection
  - metagenomics
  - contigs
tools:
  - "gecco":
      description: "Biosynthetic Gene Cluster prediction with Conditional Random Fields."
      homepage: "https://gecco.embl.de"
      documentation: "https://gecco.embl.de"
      tool_dev_url: "https://github.com/zellerlab/GECCO"
      doi: "10.1101/2021.05.03.442509"
      # A real YAML list, not a string that merely looks like one
      licence: ["GPL v3"]
input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - input:
      type: file
      description: A genomic file containing one or more sequences as input. Input type is any supported by Biopython (fasta, gbk, etc.)
      pattern: "*"
  # hmm and model_dir are optional: the process only passes --hmm / --model
  # when a non-empty value is supplied.
  - hmm:
      type: file
      description: Alternative HMM file(s) to use in HMMER format
      pattern: "*.hmm"
  - model_dir:
      type: directory
      description: Path to an alternative CRF (Conditional Random Fields) model to use
output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - genes:
      type: file
      description: TSV file containing detected/predicted genes with BGC probability scores
      pattern: "*.genes.tsv"
  - features:
      type: file
      description: TSV file containing identified domains
      pattern: "*.features.tsv"
  - clusters:
      type: file
      description: TSV file containing coordinates of predicted clusters and BGC types
      pattern: "*.clusters.tsv"
  - gbk:
      type: file
      description: Per cluster GenBank file (if found) containing sequence with annotations
      # Same glob as the process output declaration
      pattern: "*_cluster_*.gbk"
  - json:
      type: file
      description: AntiSMASH v6 sideload JSON file (if --antismash-sideload supplied)
      pattern: "*.json"
authors:
  - "@jfy133"

View file

@ -915,6 +915,10 @@ gatk4/variantrecalibrator:
- modules/gatk4/variantrecalibrator/**
- tests/modules/gatk4/variantrecalibrator/**
gecco/run:
- modules/gecco/run/**
- tests/modules/gecco/run/**
genescopefk:
- modules/genescopefk/**
- tests/modules/genescopefk/**

View file

@ -0,0 +1,17 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { GECCO_RUN } from '../../../../modules/gecco/run/main.nf'
// Smoke test: run GECCO_RUN on a single genome FASTA without a custom HMM or model.
workflow test_gecco_run {

    genome_fasta = file(params.test_data['candidatus_portiera_aleyrodidarum']['genome']['genome_fasta'], checkIfExists: true)

    // [ meta map, sequence file, hmm ] -- the empty list means no --hmm option is passed
    sample = [ [ id:'test', single_end:false ], genome_fasta, [] ]

    // Second argument is model_dir; the empty list means no --model option is passed
    GECCO_RUN ( sample, [] )
}

View file

@ -0,0 +1,5 @@
process {
// Publish every process's outputs to "<params.outdir>/<tool>", where <tool> is
// derived lazily per task from the process name: take the last ':'-separated
// segment, keep the part before its first '_', and lower-case it
// (a process named GECCO_RUN therefore publishes into "<params.outdir>/gecco").
publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
}

View file

@ -0,0 +1,13 @@
# Expected results for the test_gecco_run workflow entry point.
- name: gecco run test_gecco_run
  # Folded scalar: the lines below are joined with single spaces into one command
  command: >-
    nextflow run ./tests/modules/gecco/run
    -entry test_gecco_run
    -c ./tests/config/nextflow.config
    -c ./tests/modules/gecco/run/nextflow.config
  tags:
    - gecco/run
    - gecco
  files:
    # Existence check only -- no checksum is pinned for the GenBank output
    - path: output/gecco/NC_018507.1_cluster_1.gbk
    - path: output/gecco/genome.clusters.tsv
      md5sum: 6560ec765f6bbae50645896fd93b35cd
    - path: output/gecco/genome.features.tsv
      md5sum: 07492cf186003311ba9829056c65787f
    - path: output/gecco/genome.genes.tsv
      md5sum: 031a37dc603f8a5296f462608fd7bcc3