From 8ce68107871c96519b3eb0095d97896e34ef4489 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 1 Apr 2022 11:33:07 +0200 Subject: [PATCH] Update DASTool to 1.1.4 (#1471) * fix: remove left-over unnecessary code * Update DASTool * Fix tests * Fix test.ymls * Fix container build version * Make tests less strict to account for variability * Apply suggestions from code review Co-authored-by: Daniel Straub <42973691+d4straub@users.noreply.github.com> * Add missing description Co-authored-by: Sateesh Peri <33637490+sateeshperi@users.noreply.github.com> Co-authored-by: Daniel Straub <42973691+d4straub@users.noreply.github.com> --- modules/dastool/dastool/main.nf | 29 ++++------ modules/dastool/dastool/meta.yml | 18 +++--- modules/dastool/fastatocontig2bin/main.nf | 41 ++++++++++++++ modules/dastool/fastatocontig2bin/meta.yml | 56 +++++++++++++++++++ tests/config/pytest_modules.yml | 4 ++ tests/modules/dastool/dastool/main.nf | 10 ++-- tests/modules/dastool/dastool/test.yml | 29 +++++----- .../modules/dastool/fastatocontig2bin/main.nf | 48 ++++++++++++++++ .../dastool/fastatocontig2bin/nextflow.config | 5 ++ .../dastool/fastatocontig2bin/test.yml | 20 +++++++ 10 files changed, 216 insertions(+), 44 deletions(-) create mode 100644 modules/dastool/fastatocontig2bin/main.nf create mode 100644 modules/dastool/fastatocontig2bin/meta.yml create mode 100644 tests/modules/dastool/fastatocontig2bin/main.nf create mode 100644 tests/modules/dastool/fastatocontig2bin/nextflow.config create mode 100644 tests/modules/dastool/fastatocontig2bin/test.yml diff --git a/modules/dastool/dastool/main.nf b/modules/dastool/dastool/main.nf index 53dfea19..968f85de 100644 --- a/modules/dastool/dastool/main.nf +++ b/modules/dastool/dastool/main.nf @@ -2,27 +2,28 @@ process DASTOOL_DASTOOL { tag "$meta.id" label 'process_medium' - conda (params.enable_conda ? "bioconda::das_tool=1.1.3" : null) + conda (params.enable_conda ? 
"bioconda::das_tool=1.1.4" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/das_tool:1.1.3--r41hdfd78af_0' : - 'quay.io/biocontainers/das_tool:1.1.3--r41hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/das_tool:1.1.4--r41hdfd78af_1' : + 'quay.io/biocontainers/das_tool:1.1.4--r41hdfd78af_1' }" input: tuple val(meta), path(contigs), path(bins) path(proteins) path(db_directory) - val(search_engine) output: tuple val(meta), path("*.log") , emit: log - tuple val(meta), path("*_summary.txt") , emit: summary - tuple val(meta), path("*_DASTool_scaffolds2bin.txt") , emit: scaffolds2bin + tuple val(meta), path("*_summary.tsv") , emit: summary + tuple val(meta), path("*_DASTool_contig2bin.tsv") , emit: contig2bin tuple val(meta), path("*.eval") , optional: true, emit: eval tuple val(meta), path("*_DASTool_bins/*.fa") , optional: true, emit: bins tuple val(meta), path("*.pdf") , optional: true, emit: pdfs - tuple val(meta), path("*.proteins.faa") , optional: true, emit: fasta_proteins + tuple val(meta), path("*.candidates.faa") , optional: true, emit: fasta_proteins + tuple val(meta), path("*.faa") , optional: true, emit: candidates_faa tuple val(meta), path("*.archaea.scg") , optional: true, emit: fasta_archaea_scg tuple val(meta), path("*.bacteria.scg") , optional: true, emit: fasta_bacteria_scg + tuple val(meta), path("*.b6") , optional: true, emit: b6 tuple val(meta), path("*.seqlength") , optional: true, emit: seqlength path "versions.yml" , emit: versions @@ -33,17 +34,12 @@ process DASTOOL_DASTOOL { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def bin_list = bins instanceof List ? bins.join(",") : "$bins" - def engine = search_engine ? "--search_engine $search_engine" : "--search_engine diamond" def db_dir = db_directory ? 
"--db_directory $db_directory" : "" def clean_contigs = contigs.toString() - ".gz" def decompress_contigs = contigs.toString() == clean_contigs ? "" : "gunzip -q -f $contigs" - def decompress_proteins = proteins ? "gunzip -f $proteins" : "" def clean_proteins = proteins ? proteins.toString() - ".gz" : "" - def proteins_pred = proteins ? "--proteins $clean_proteins" : "" - - if (! search_engine) { - log.info('[DAS_Tool] Default search engine (USEARCH) is proprietary software and not available in bioconda. Using DIAMOND as alternative.') - } + def decompress_proteins = proteins ? "gunzip -f $proteins" : "" + def proteins_pred = proteins ? "-p $clean_proteins" : "" """ $decompress_proteins @@ -53,15 +49,14 @@ process DASTOOL_DASTOOL { $args \\ $proteins_pred \\ $db_dir \\ - $engine \\ -t $task.cpus \\ - --bins $bin_list \\ + -i $bin_list \\ -c $clean_contigs \\ -o $prefix cat <<-END_VERSIONS > versions.yml "${task.process}": - dastool: \$( DAS_Tool --version 2>&1 | grep "DAS Tool" | sed 's/DAS Tool version //' ) + dastool: \$( DAS_Tool --version 2>&1 | grep "DAS Tool" | sed 's/DAS Tool //' ) END_VERSIONS """ } diff --git a/modules/dastool/dastool/meta.yml b/modules/dastool/dastool/meta.yml index a77df9bd..0889ca47 100644 --- a/modules/dastool/dastool/meta.yml +++ b/modules/dastool/dastool/meta.yml @@ -34,8 +34,8 @@ input: pattern: "*.{fa.gz,fas.gz,fasta.gz}" - bins: type: file - description: "Scaffolds2bin tabular file generated with dastool/scaffolds2bin" - pattern: "*.scaffolds2bin.tsv" + description: "FastaToContig2Bin tabular file generated with dastool/fastatocontig2bin" + pattern: "*.tsv" - proteins: type: file description: Predicted proteins in prodigal fasta format (>scaffoldID_geneNo) @@ -43,9 +43,6 @@ input: - db_directory: type: file description: (optional) Directory of single copy gene database. - - search_engine: - type: val - description: Engine used for single copy gene identification. 
USEARCH is not supported due to it being proprietary [blast/diamond] output: - meta: @@ -65,14 +62,17 @@ output: type: file description: Summary of output bins including quality and completeness estimates pattern: "*summary.txt" - - scaffolds2bin: + - contig2bin: type: file description: Scaffolds to bin file of output bins - pattern: "*.scaffolds2bin.txt" + pattern: "*_DASTool_contig2bin.tsv" - eval: type: file description: Quality and completeness estimates of input bin sets pattern: "*.eval" + - bins: + description: Final refined bins in fasta format + pattern: "*.fa" - pdfs: type: file description: Plots showing the amount of high quality bins and score distribution of bins per method @@ -89,6 +89,10 @@ output: type: file description: Results of bacterial single-copy-gene prediction pattern: "*.bacteria.scg" + - b6: + type: file + description: Results in b6 format + pattern: "*.b6" - seqlength: type: file description: Summary of contig lengths diff --git a/modules/dastool/fastatocontig2bin/main.nf b/modules/dastool/fastatocontig2bin/main.nf new file mode 100644 index 00000000..8bb13380 --- /dev/null +++ b/modules/dastool/fastatocontig2bin/main.nf @@ -0,0 +1,41 @@ +process DASTOOL_FASTATOCONTIG2BIN { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::das_tool=1.1.4" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/das_tool:1.1.4--r41hdfd78af_1' : + 'quay.io/biocontainers/das_tool:1.1.4--r41hdfd78af_1' }" + + input: + tuple val(meta), path(fasta) + val(extension) + + output: + tuple val(meta), path("*.tsv"), emit: fastatocontig2bin + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def file_extension = extension ? 
extension : "fasta" + def clean_fasta = fasta.toString() - ".gz" + def decompress_fasta = fasta.toString() == clean_fasta ? "" : "gunzip -q -f $fasta" + """ + $decompress_fasta + + Fasta_to_Contig2Bin.sh \\ + $args \\ + -i . \\ + -e $file_extension \\ + > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dastool: \$( DAS_Tool --version 2>&1 | grep "DAS Tool" | sed 's/DAS Tool //' ) + END_VERSIONS + """ +} diff --git a/modules/dastool/fastatocontig2bin/meta.yml b/modules/dastool/fastatocontig2bin/meta.yml new file mode 100644 index 00000000..1176ae96 --- /dev/null +++ b/modules/dastool/fastatocontig2bin/meta.yml @@ -0,0 +1,56 @@ +name: dastool_fastatocontig2bin +description: Helper script to convert a set of bins in fasta format to tabular contig2bin format +keywords: + - binning + - das tool + - table + - de novo + - bins + - contigs + - assembly + - das_tool +tools: + - dastool: + description: | + DAS Tool is an automated method that integrates the results + of a flexible number of binning algorithms to calculate an optimized, non-redundant + set of bins from a single assembly. + + homepage: https://github.com/cmks/DAS_Tool + documentation: https://github.com/cmks/DAS_Tool + tool_dev_url: https://github.com/cmks/DAS_Tool + doi: "10.1038/s41564-018-0171-1" + licence: ["BSD"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Fasta file or list of fasta files of bins, recommended to be gathered with .collect() + pattern: "*.{fa,fa.gz,fas,fas.gz,fna,fna.gz,fasta,fasta.gz}" + - extension: + type: val + description: Fasta file extension (fa | fas | fasta | ...), without .gz suffix, if gzipped input. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastatocontig2bin: + type: file + description: tabular contig2bin file for DAS tool input + pattern: "*.tsv" + +authors: + - "@maxibor" + - "@jfy133" diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml index 8425b16c..24bfe641 100644 --- a/tests/config/pytest_modules.yml +++ b/tests/config/pytest_modules.yml @@ -487,6 +487,10 @@ dastool/dastool: - modules/dastool/dastool/** - tests/modules/dastool/dastool/** +dastool/fastatocontig2bin: + - modules/dastool/fastatocontig2bin/** + - tests/modules/dastool/fastatocontig2bin/** + dastool/scaffolds2bin: - modules/dastool/scaffolds2bin/** - tests/modules/dastool/scaffolds2bin/** diff --git a/tests/modules/dastool/dastool/main.nf b/tests/modules/dastool/dastool/main.nf index f6f6becf..9853e724 100644 --- a/tests/modules/dastool/dastool/main.nf +++ b/tests/modules/dastool/dastool/main.nf @@ -3,7 +3,7 @@ nextflow.enable.dsl = 2 include { METABAT2_METABAT2 } from '../../../../modules/metabat2/metabat2/main.nf' include { METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS } from '../../../../modules/metabat2/jgisummarizebamcontigdepths/main.nf' -include { DASTOOL_SCAFFOLDS2BIN } from '../../../../modules/dastool/scaffolds2bin/main.nf' +include { DASTOOL_FASTATOCONTIG2BIN } from '../../../../modules/dastool/fastatocontig2bin/main.nf' include { DASTOOL_DASTOOL } from '../../../../modules/dastool/dastool/main.nf' workflow test_dastool_dastool { @@ -21,13 +21,13 @@ workflow test_dastool_dastool { METABAT2_METABAT2 ( input_metabat2 ) - DASTOOL_SCAFFOLDS2BIN ( METABAT2_METABAT2.out.fasta.collect(), "fa") + DASTOOL_FASTATOCONTIG2BIN ( METABAT2_METABAT2.out.fasta.collect(), "fa") Channel.of([ [ id:'test', single_end:false ], // meta map file(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true)]) - .join(DASTOOL_SCAFFOLDS2BIN.out.scaffolds2bin) + .join( 
DASTOOL_FASTATOCONTIG2BIN.out.fastatocontig2bin ) .set {input_dastool} - - DASTOOL_DASTOOL ( input_dastool, [], [], [] ) + + DASTOOL_DASTOOL ( input_dastool, [], [] ) } diff --git a/tests/modules/dastool/dastool/test.yml b/tests/modules/dastool/dastool/test.yml index 7f7eb19c..cda17bda 100644 --- a/tests/modules/dastool/dastool/test.yml +++ b/tests/modules/dastool/dastool/test.yml @@ -1,29 +1,28 @@ - name: dastool dastool test_dastool_dastool - command: nextflow run ./tests/modules/dastool/dastool -entry test_dastool_dastool -c ./tests/config/nextflow.config -c ./tests/modules/dastool/dastool/nextflow.config + command: nextflow run tests/modules/dastool/dastool -entry test_dastool_dastool -c tests/config/nextflow.config tags: - - dastool - dastool/dastool + - dastool files: - path: output/dastool/test.seqlength md5sum: b815a5811008c36808a59b1d0dcfab24 - path: output/dastool/test.tsv md5sum: 6e46c0be14dded7cb13af38f54feea47 - path: output/dastool/test_DASTool.log - contains: - - "DAS Tool run on" - - path: output/dastool/test_DASTool_scaffolds2bin.txt + - path: output/dastool/test_DASTool_contig2bin.tsv md5sum: 6e46c0be14dded7cb13af38f54feea47 - - path: output/dastool/test_DASTool_summary.txt - md5sum: a3efa8717b30dfada78dc5ae9a3dc396 + - path: output/dastool/test_DASTool_summary.tsv + md5sum: ab9dd3709a59a69bc66030b9e0ff3d5b + - path: output/dastool/test_proteins.faa + - path: output/dastool/test_proteins.faa.all.b6 + md5sum: 39c11237ef22ac73109aaac267e185d0 - path: output/dastool/test_proteins.faa.archaea.scg md5sum: e79d82eecee25821d1658ea4f082601d - path: output/dastool/test_proteins.faa.bacteria.scg md5sum: 8132cfb17cf398d41c036ead55c96ffe - - path: output/dastool/test_test.tsv.eval - md5sum: a3efa8717b30dfada78dc5ae9a3dc396 - - path: output/metabat2/bins/test.1.fa.gz - md5sum: 2b297bf557cc3831b800348859331268 - - path: output/metabat2/test.tsv.gz - md5sum: 619338fa5019e361d5545ce385a6961f - - path: output/metabat2/test.txt.gz - md5sum: 
745a0446af6ef68b930975e9ce5a95d6 + - path: output/dastool/test_proteins.faa.findSCG.b6 + md5sum: 48e90e12cd6c88d00608777dbc48a82a + - path: output/dastool/test_proteins.faa.scg.candidates.faa + md5sum: d94b7bed0f8aa9cf2824d72c548c537c + - path: output/dastool/versions.yml + md5sum: 004e04c6a38652df2e0c59c44e29c9de diff --git a/tests/modules/dastool/fastatocontig2bin/main.nf b/tests/modules/dastool/fastatocontig2bin/main.nf new file mode 100644 index 00000000..0178dbf9 --- /dev/null +++ b/tests/modules/dastool/fastatocontig2bin/main.nf @@ -0,0 +1,48 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +include { GUNZIP } from '../../../../modules/gunzip/main.nf' +include { METABAT2_METABAT2 } from '../../../../modules/metabat2/metabat2/main.nf' +include { METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS } from '../../../../modules/metabat2/jgisummarizebamcontigdepths/main.nf' +include { DASTOOL_FASTATOCONTIG2BIN } from '../../../../modules/dastool/fastatocontig2bin/main.nf' + +workflow test_dastool_fastatocontig2bin { + + input_depth = [ [ id:'test', single_end:false ], // meta map + file(params.test_data['bacteroides_fragilis']['illumina']['test1_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['bacteroides_fragilis']['illumina']['test1_paired_end_sorted_bam_bai'], checkIfExists: true) ] + + METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS ( input_depth ) + + Channel.fromPath(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true) + .map { it -> [[ id:'test', single_end:false ], it] } + .join(METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS.out.depth) + .set { input_metabat2 } + + METABAT2_METABAT2 ( input_metabat2 ) + + DASTOOL_FASTATOCONTIG2BIN ( METABAT2_METABAT2.out.fasta.collect(), "fa") +} + +workflow test_dastool_fastatocontig2bin_ungzipped { + + input_depth = [ [ id:'test', single_end:false ], // meta map + file(params.test_data['bacteroides_fragilis']['illumina']['test1_paired_end_sorted_bam'], checkIfExists: true), + 
file(params.test_data['bacteroides_fragilis']['illumina']['test1_paired_end_sorted_bam_bai'], checkIfExists: true) ] + + + METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS ( input_depth ) + + Channel.fromPath(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true) + .map { it -> [[ id:'test', single_end:false ], it] } + .join(METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS.out.depth) + .set { input_metabat2 } + + METABAT2_METABAT2 ( input_metabat2 ) + + // TODO test unzipped input files + ch_input_2_fastatocontig2bin = GUNZIP( METABAT2_METABAT2.out.fasta ).gunzip + + DASTOOL_FASTATOCONTIG2BIN ( ch_input_2_fastatocontig2bin, "fa") +} diff --git a/tests/modules/dastool/fastatocontig2bin/nextflow.config b/tests/modules/dastool/fastatocontig2bin/nextflow.config new file mode 100644 index 00000000..50f50a7a --- /dev/null +++ b/tests/modules/dastool/fastatocontig2bin/nextflow.config @@ -0,0 +1,5 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + +} \ No newline at end of file diff --git a/tests/modules/dastool/fastatocontig2bin/test.yml b/tests/modules/dastool/fastatocontig2bin/test.yml new file mode 100644 index 00000000..94881438 --- /dev/null +++ b/tests/modules/dastool/fastatocontig2bin/test.yml @@ -0,0 +1,20 @@ +- name: dastool fastatocontig2bin test_dastool_fastatocontig2bin + command: nextflow run tests/modules/dastool/fastatocontig2bin -entry test_dastool_fastatocontig2bin -c tests/config/nextflow.config + tags: + - dastool + - dastool/fastatocontig2bin + files: + - path: output/dastool/test.tsv + md5sum: 6e46c0be14dded7cb13af38f54feea47 + - path: output/dastool/versions.yml + md5sum: ff4b6f14bee4548bf09b5e602c306595 + +- name: dastool fastatocontig2bin test_dastool_fastatocontig2bin_ungzipped + command: nextflow run tests/modules/dastool/fastatocontig2bin -entry test_dastool_fastatocontig2bin_ungzipped -c tests/config/nextflow.config + tags: + - dastool + - dastool/fastatocontig2bin 
+ files: + - path: output/dastool/test.tsv + md5sum: 6e46c0be14dded7cb13af38f54feea47 + - path: output/dastool/versions.yml