new module: deepbgc/pipeline (#2014)

* not working yet (db not found)

* modify deepbgc/download module to return db-path

* 🪄

* Prettier

* add test.yml

* much prettier

* test.yml: delete md5 checks for potentially empty files

* adapt test.yml

* test.yml again

* Apply suggestions from code review

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>
This commit is contained in:
louperelo 2022-09-07 14:59:01 +02:00 committed by GitHub
parent f2264c1052
commit eae945721d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 275 additions and 1 deletions

View file

@ -8,7 +8,7 @@ process DEEPBGC_DOWNLOAD {
'quay.io/biocontainers/deepbgc:0.1.30--pyhb7b1952_1' }"
output:
path "deepbgc_db" , emit: db
path "deepbgc_db/" , emit: db
path "versions.yml" , emit: versions
when:

View file

@ -0,0 +1,47 @@
// Runs `deepbgc pipeline` on one genome file, using a previously downloaded
// DeepBGC data directory.
//
// Inputs:
//   meta   - sample meta map; meta.id is used as the task tag
//   genome - input sequence file handed to `deepbgc pipeline`
//   db     - DeepBGC downloads directory, exported as DEEPBGC_DOWNLOADS_DIR
//
// Outputs: every result file is expected inside a directory named after the
// base name of the input file (`genome.baseName`). Everything except LOG.txt
// and versions.yml is optional.
process DEEPBGC_PIPELINE {
    tag "$meta.id"
    label 'process_low'

    conda (params.enable_conda ? "bioconda::deepbgc=0.1.30" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/deepbgc:0.1.30--pyhb7b1952_1':
        'quay.io/biocontainers/deepbgc:0.1.30--pyhb7b1952_1' }"

    input:
    tuple val(meta), path(genome)
    path(db)

    output:
    tuple val(meta), path("${genome.baseName}/README.txt")                              , optional: true, emit: readme
    tuple val(meta), path("${genome.baseName}/LOG.txt")                                 ,                 emit: log
    tuple val(meta), path("${genome.baseName}/${genome.baseName}.antismash.json")       , optional: true, emit: json
    tuple val(meta), path("${genome.baseName}/${genome.baseName}.bgc.gbk")              , optional: true, emit: bgc_gbk
    tuple val(meta), path("${genome.baseName}/${genome.baseName}.bgc.tsv")              , optional: true, emit: bgc_tsv
    tuple val(meta), path("${genome.baseName}/${genome.baseName}.full.gbk")             , optional: true, emit: full_gbk
    tuple val(meta), path("${genome.baseName}/${genome.baseName}.pfam.tsv")             , optional: true, emit: pfam_tsv
    tuple val(meta), path("${genome.baseName}/evaluation/${genome.baseName}.bgc.png")   , optional: true, emit: bgc_png
    tuple val(meta), path("${genome.baseName}/evaluation/${genome.baseName}.pr.png")    , optional: true, emit: pr_png
    tuple val(meta), path("${genome.baseName}/evaluation/${genome.baseName}.roc.png")   , optional: true, emit: roc_png
    tuple val(meta), path("${genome.baseName}/evaluation/${genome.baseName}.score.png") , optional: true, emit: score_png
    path "versions.yml"                                                                 ,                 emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Extra command-line options supplied through the module configuration.
    def args = task.ext.args ?: ''
    // No `prefix` is defined here: output paths are derived from
    // `genome.baseName`, so a prefix variable would have no effect.
    // In the version command below, stderr is merged into stdout before the
    // pipe. The former stray `/dev/null/` token was passed to `deepbgc info`
    // as a positional argument rather than acting as a redirect, so it is gone.
    """
    export DEEPBGC_DOWNLOADS_DIR=${db}

    deepbgc \\
        pipeline \\
        $args \\
        $genome

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        deepbgc: \$(echo \$(deepbgc info 2>&1 | grep 'version' | cut -d " " -f3) )
    END_VERSIONS
    """
}

View file

@ -0,0 +1,88 @@
# Module metadata for DEEPBGC_PIPELINE: describes the tool, the process inputs
# and every named output channel.
name: "deepbgc_pipeline"
description: detect BGCs in bacterial and fungal genomes using deep learning
keywords:
  - Biosynthetic Gene Cluster
  - deep learning
  - neural network
  - random forest
  - genomes
  - bacteria
  - fungi
tools:
  - "deepbgc":
      description: "DeepBGC - Biosynthetic Gene Cluster detection and classification"
      homepage: "https://github.com/Merck/deepbgc"
      documentation: "https://github.com/Merck/deepbgc"
      tool_dev_url: "https://github.com/Merck/deepbgc"
      doi: "10.1093/nar/gkz654"
      # A real YAML list rather than a quoted string that merely looks like one.
      licence: ["MIT"]

input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test' ]
  - genome:
      type: file
      description: FASTA/GenBank/Pfam CSV file
      pattern: "*.{fasta,fa,fna,gbk,csv}"
  # Second process input: the directory emitted as `db` by DEEPBGC_DOWNLOAD.
  - db:
      type: directory
      description: Directory containing the downloaded DeepBGC database (models and Pfam files)
      pattern: "deepbgc_db"

output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test']
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - readme:
      type: file
      description: txt file containing description of output files
      pattern: "*.{txt}"
  - log:
      type: file
      description: Log output of DeepBGC
      pattern: "*.{txt}"
  - json:
      type: file
      description: AntiSMASH JSON file for sideloading.
      pattern: "*.{json}"
  - bgc_gbk:
      type: file
      description: Sequences and features of all detected BGCs in GenBank format.
      pattern: "*.{bgc.gbk}"
  - bgc_tsv:
      type: file
      description: Table of detected BGCs and their properties.
      pattern: "*.{bgc.tsv}"
  - full_gbk:
      type: file
      description: Fully annotated input sequence with proteins, Pfam domains (PFAM_domain features) and BGCs (cluster features)
      pattern: "*.{full.gbk}"
  - pfam_tsv:
      type: file
      description: Table of Pfam domains (pfam_id) from given sequence (sequence_id) in genomic order, with BGC detection scores.
      pattern: "*.{pfam.tsv}"
  - bgc_png:
      type: file
      description: Detected BGCs plotted by their nucleotide coordinates.
      pattern: "*.{bgc.png}"
  - pr_png:
      type: file
      description: Precision-Recall curve based on predicted per-Pfam BGC scores.
      pattern: "*.{pr.png}"
  - roc_png:
      type: file
      description: ROC curve based on predicted per-Pfam BGC scores.
      pattern: "*.{roc.png}"
  - score_png:
      type: file
      description: BGC detection scores of each Pfam domain in genomic order.
      pattern: "*.{score.png}"

authors:
  - "@louperelo"
  - "@jfy133"

View file

@ -607,6 +607,10 @@ deepbgc/download:
- modules/deepbgc/download/**
- tests/modules/deepbgc/download/**
deepbgc/pipeline:
- modules/deepbgc/pipeline/**
- tests/modules/deepbgc/pipeline/**
deeptools/bamcoverage:
- modules/deeptools/bamcoverage/**
- tests/modules/deeptools/bamcoverage/**

View file

@ -0,0 +1,33 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
// Include paths are resolved relative to this test script, which lives in
// tests/modules/deepbgc/pipeline/. The repository-level modules/ directory is
// therefore four levels up. Three levels up would resolve to tests/modules/,
// i.e. the test workflows rather than the modules themselves.
include { GUNZIP           } from '../../../../modules/gunzip/main.nf'
include { PRODIGAL         } from '../../../../modules/prodigal/main.nf'
include { DEEPBGC_DOWNLOAD } from '../../../../modules/deepbgc/download/main.nf'
include { DEEPBGC_PIPELINE } from '../../../../modules/deepbgc/pipeline/main.nf'
// Test entry point: decompress the contigs, annotate them with PRODIGAL in
// 'gbk' output mode, and run DEEPBGC_PIPELINE on the resulting annotations.
workflow test_deepbgc_pipeline_gbk {

    // Sample description and the gzipped contig FASTA from the shared test data
    sample_meta   = [ id:'test_gbk', single_end:false ]
    contigs_fa_gz = file(params.test_data['bacteroides_fragilis']['illumina']['test1_contigs_fa_gz'], checkIfExists: true)

    // Prepare the genome input
    GUNZIP ( [ sample_meta, contigs_fa_gz ] )
    PRODIGAL ( GUNZIP.out.gunzip, 'gbk' )

    // Fetch the database, then run the module under test
    DEEPBGC_DOWNLOAD ()
    DEEPBGC_PIPELINE ( PRODIGAL.out.gene_annotations, DEEPBGC_DOWNLOAD.out.db )
}
// Test entry point: decompress the contigs and run DEEPBGC_PIPELINE directly
// on the plain FASTA file.
workflow test_deepbgc_pipeline_fa {

    // Sample description and the gzipped contig FASTA from the shared test data
    sample_meta   = [ id:'test_fa', single_end:false ]
    contigs_fa_gz = file(params.test_data['bacteroides_fragilis']['illumina']['test1_contigs_fa_gz'], checkIfExists: true)

    // Prepare the genome input
    GUNZIP ( [ sample_meta, contigs_fa_gz ] )

    // Fetch the database, then run the module under test
    DEEPBGC_DOWNLOAD ()
    DEEPBGC_PIPELINE ( GUNZIP.out.gunzip, DEEPBGC_DOWNLOAD.out.db )
}

View file

@ -0,0 +1,5 @@
process {

    // Publish each process's results under <outdir>/<tool>, where <tool> is the
    // lower-cased part of the simple process name before the first underscore
    // (e.g. a process called DEEPBGC_PIPELINE publishes to "<outdir>/deepbgc").
    publishDir = {
        def simple_name = task.process.tokenize(':')[-1]
        def tool_name   = simple_name.tokenize('_')[0]
        "${params.outdir}/${tool_name.toLowerCase()}"
    }

}

View file

@ -0,0 +1,97 @@
# Expected outputs for the GenBank-input test (entry workflow test_deepbgc_pipeline_gbk).
- name: deepbgc pipeline test_deepbgc_pipeline_gbk
  command: nextflow run ./tests/modules/deepbgc/pipeline -entry test_deepbgc_pipeline_gbk -c ./tests/config/nextflow.config -c ./tests/modules/deepbgc/pipeline/nextflow.config
  tags:
    - deepbgc/pipeline
    - deepbgc
  files:
    # Database files published by DEEPBGC_DOWNLOAD
    - path: output/deepbgc/deepbgc_db/0.1.0/classifier/product_activity.pkl
      md5sum: 90f0c010460e9df882cb057664a49f30
    - path: output/deepbgc/deepbgc_db/0.1.0/classifier/product_class.pkl
      md5sum: f78a2eda240403d2f40643d42202f3ac
    - path: output/deepbgc/deepbgc_db/0.1.0/detector/clusterfinder_geneborder.pkl
      md5sum: ca4be7031ae9f70780f17c616a4fa5b5
    - path: output/deepbgc/deepbgc_db/0.1.0/detector/clusterfinder_original.pkl
      md5sum: 2ca2429bb9bc99a401d1093c376b37aa
    - path: output/deepbgc/deepbgc_db/0.1.0/detector/clusterfinder_retrained.pkl
      md5sum: 65679a3b61c562ff4b84bdb574bb6d93
    - path: output/deepbgc/deepbgc_db/0.1.0/detector/deepbgc.pkl
      md5sum: 7e9218be79ba45bc9adb23bed3845dc1
    - path: output/deepbgc/deepbgc_db/common/Pfam-A.31.0.clans.tsv
      md5sum: a0a4590ffb2b33b83ef2b28f6ead886b
    - path: output/deepbgc/deepbgc_db/common/Pfam-A.31.0.hmm
      md5sum: 79a3328e4c95b13949a4489b19959fc5
    - path: output/deepbgc/deepbgc_db/common/Pfam-A.31.0.hmm.h3f
      md5sum: cbca323cf8dd4e5e7c109114ec444162
    - path: output/deepbgc/deepbgc_db/common/Pfam-A.31.0.hmm.h3i
      md5sum: 5242332a3f6a60cd1ab634cd9331afd6
    - path: output/deepbgc/deepbgc_db/common/Pfam-A.31.0.hmm.h3m
      md5sum: 1fe946fa2b3bcde1d4b2bad732bce612
    - path: output/deepbgc/deepbgc_db/common/Pfam-A.31.0.hmm.h3p
      md5sum: 27b98a1ded123b6a1ef72db01927017c
    # DEEPBGC_PIPELINE results; the output directory is named after the input file (test_gbk.gbk).
    # Entries without md5sum/contains are only checked for existence.
    # NOTE(review): presumably their content is not reproducible or may be empty - confirm.
    - path: output/deepbgc/test_gbk/LOG.txt
      contains: ["Saved DeepBGC result to:"]
    - path: output/deepbgc/test_gbk/README.txt
    - path: output/deepbgc/test_gbk/test_gbk.antismash.json
      md5sum: 7dba3996cf38756b05e7612de8433c23
    - path: output/deepbgc/test_gbk/test_gbk.bgc.gbk
    - path: output/deepbgc/test_gbk/test_gbk.full.gbk
    # Upstream helper outputs (GUNZIP, PRODIGAL)
    - path: output/gunzip/test1.contigs.fa
      md5sum: 80c4d78f2810f6d9e90fa6da9bb9c4f9
    - path: output/prodigal/test_gbk.faa
      md5sum: b140ca303ff9ee32e615bfcc4b05038c
    - path: output/prodigal/test_gbk.fna
      md5sum: 28232dd696754fb95308874c9528296f
    - path: output/prodigal/test_gbk.gbk
      md5sum: b5c309b0296e7cdc21d1e71f33400f20
    - path: output/prodigal/test_gbk_all.txt
      md5sum: 8fe56fcf4d9e839e83be7523cd3efa02
# Expected outputs for the FASTA-input test (entry workflow test_deepbgc_pipeline_fa).
- name: deepbgc pipeline test_deepbgc_pipeline_fa
  command: nextflow run ./tests/modules/deepbgc/pipeline -entry test_deepbgc_pipeline_fa -c ./tests/config/nextflow.config -c ./tests/modules/deepbgc/pipeline/nextflow.config
  tags:
    - deepbgc/pipeline
    - deepbgc
  files:
    # Database files published by DEEPBGC_DOWNLOAD
    - path: output/deepbgc/deepbgc_db/0.1.0/classifier/product_activity.pkl
      md5sum: 90f0c010460e9df882cb057664a49f30
    - path: output/deepbgc/deepbgc_db/0.1.0/classifier/product_class.pkl
      md5sum: f78a2eda240403d2f40643d42202f3ac
    - path: output/deepbgc/deepbgc_db/0.1.0/detector/clusterfinder_geneborder.pkl
      md5sum: ca4be7031ae9f70780f17c616a4fa5b5
    - path: output/deepbgc/deepbgc_db/0.1.0/detector/clusterfinder_original.pkl
      md5sum: 2ca2429bb9bc99a401d1093c376b37aa
    - path: output/deepbgc/deepbgc_db/0.1.0/detector/clusterfinder_retrained.pkl
      md5sum: 65679a3b61c562ff4b84bdb574bb6d93
    - path: output/deepbgc/deepbgc_db/0.1.0/detector/deepbgc.pkl
      md5sum: 7e9218be79ba45bc9adb23bed3845dc1
    - path: output/deepbgc/deepbgc_db/common/Pfam-A.31.0.clans.tsv
      md5sum: a0a4590ffb2b33b83ef2b28f6ead886b
    - path: output/deepbgc/deepbgc_db/common/Pfam-A.31.0.hmm
      md5sum: 79a3328e4c95b13949a4489b19959fc5
    - path: output/deepbgc/deepbgc_db/common/Pfam-A.31.0.hmm.h3f
      md5sum: cbca323cf8dd4e5e7c109114ec444162
    - path: output/deepbgc/deepbgc_db/common/Pfam-A.31.0.hmm.h3i
      md5sum: 5242332a3f6a60cd1ab634cd9331afd6
    - path: output/deepbgc/deepbgc_db/common/Pfam-A.31.0.hmm.h3m
      md5sum: 1fe946fa2b3bcde1d4b2bad732bce612
    - path: output/deepbgc/deepbgc_db/common/Pfam-A.31.0.hmm.h3p
      md5sum: 27b98a1ded123b6a1ef72db01927017c
    # DEEPBGC_PIPELINE results; the output directory is named after the input file (test1.contigs.fa).
    # Some files are checked with `contains` instead of md5sum.
    # NOTE(review): presumably their full content is not byte-reproducible - confirm.
    - path: output/deepbgc/test1.contigs/LOG.txt
      contains: ["Saved DeepBGC result to:"]
    - path: output/deepbgc/test1.contigs/README.txt
    - path: output/deepbgc/test1.contigs/evaluation/test1.contigs.bgc.png
      md5sum: f4a0fc6cd260e2d7ad16f7a1fa103f96
    - path: output/deepbgc/test1.contigs/evaluation/test1.contigs.score.png
      md5sum: 572e8882031f667580d8c8e13c2cbb91
    - path: output/deepbgc/test1.contigs/test1.contigs.antismash.json
      contains: ['"name": "DeepBGC"']
    - path: output/deepbgc/test1.contigs/test1.contigs.bgc.gbk
      md5sum: 7fc70dd034903622dae273bf71b402f2
    - path: output/deepbgc/test1.contigs/test1.contigs.bgc.tsv
      contains: ["sequence_id"]
    - path: output/deepbgc/test1.contigs/test1.contigs.full.gbk
      contains: ["LOCUS"]
    - path: output/deepbgc/test1.contigs/test1.contigs.pfam.tsv
      md5sum: 1179eb4e6df0c83aaeec18d7d34e7524
    # Upstream helper output (GUNZIP)
    - path: output/gunzip/test1.contigs.fa
      md5sum: 80c4d78f2810f6d9e90fa6da9bb9c4f9