Add new module: Flye (#1164)

* changing mv by gzip

* changing mv by gzip

* first module creation

* add test.yml

* add flye to pytest_modules.yml

* update flye module

* delete functions.nf

* generate test.yml

* fix contains from test.yml

* test file assembly_info.txt with regex

* check that file contains at least contig_1

* fix typo in contains

* update version

* split fastq file for raw runs

* use asm-coverage to reduce memory usage

* fix module name error

* add genome-size

* decrease coverage

* change test data for raw runs

* add coverage and genome size

* Apply comments from code review

Co-authored-by: SusiJo <43847534+SusiJo@users.noreply.github.com>

* after many tries, add a stub run

* remove md5sum for stub run

* Apply suggestions from code review

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>

* fix review comments

* Apply suggestions from code review

Co-authored-by: SusiJo <43847534+SusiJo@users.noreply.github.com>

* no hardcoded version in stub run

* Update modules/flye/main.nf

Co-authored-by: Mahesh Binzer-Panchal <mahesh.binzer-panchal@nbis.se>

Co-authored-by: SusiJo <43847534+SusiJo@users.noreply.github.com>
Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>
Co-authored-by: Mahesh Binzer-Panchal <mahesh.binzer-panchal@nbis.se>
master
Júlia Mir Pedrol 2 years ago committed by GitHub
parent 031fbd37aa
commit bd0fa881f6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,68 @@
// Runs the Flye assembler on one set of reads and collects its outputs.
//
// Inputs:
//   meta  - sample map; `meta.id` is used for the task tag and as the default output prefix
//   reads - read file(s) passed to flye after the mode flag
//   mode  - one of the flye read-type flags listed in `valid_mode` below
//
// Outputs (all prefixed with `task.ext.prefix`, or `meta.id` when unset):
//   gzip-compressed assembly FASTA, GFA and GV graphs, the assembly info table,
//   the flye log, the params JSON, and versions.yml.
process FLYE {
    tag "$meta.id"
    label 'process_high'

    conda (params.enable_conda ? "bioconda::flye=2.9" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/flye:2.9--py39h6935b12_1' :
        'quay.io/biocontainers/flye:2.9--py39h6935b12_1' }"

    input:
    tuple val(meta), path(reads)
    val mode

    output:
    tuple val(meta), path("*.fasta.gz"), emit: fasta
    tuple val(meta), path("*.gfa.gz")  , emit: gfa
    tuple val(meta), path("*.gv.gz")   , emit: gv
    tuple val(meta), path("*.txt")     , emit: txt
    tuple val(meta), path("*.log")     , emit: log
    tuple val(meta), path("*.json")    , emit: json
    path "versions.yml"                , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Reject anything that is not a known flye read-type flag before building the command.
    def valid_mode = ["--pacbio-raw", "--pacbio-corr", "--pacbio-hifi", "--nano-raw", "--nano-corr", "--nano-hq"]
    if ( !valid_mode.contains(mode) )  { error "Unrecognised mode to run Flye. Options: ${valid_mode.join(', ')}" }
    """
    flye \\
        $mode \\
        $reads \\
        --out-dir . \\
        --threads $task.cpus \\
        $args

    gzip -c assembly.fasta > ${prefix}.assembly.fasta.gz
    gzip -c assembly_graph.gfa > ${prefix}.assembly_graph.gfa.gz
    gzip -c assembly_graph.gv > ${prefix}.assembly_graph.gv.gz
    mv assembly_info.txt ${prefix}.assembly_info.txt
    mv flye.log ${prefix}.flye.log
    mv params.json ${prefix}.params.json

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        flye: \$( flye --version )
    END_VERSIONS
    """

    stub:
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Placeholder outputs with the same names as a real run. The text is piped
    // straight into gzip: the previous `echo > file | gzip -c file` form ran both
    // commands concurrently, so gzip could read the file before it was written.
    """
    echo stub | gzip -c > ${prefix}.assembly.fasta.gz
    echo stub | gzip -c > ${prefix}.assembly_graph.gfa.gz
    echo stub | gzip -c > ${prefix}.assembly_graph.gv.gz
    echo contig_1 > ${prefix}.assembly_info.txt
    echo stub > ${prefix}.flye.log
    echo stub > ${prefix}.params.json

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        flye: \$( flye --version )
    END_VERSIONS
    """
}

@ -0,0 +1,69 @@
# Module metadata for the FLYE process: tool references plus a description of
# every input and output channel.
name: "flye"
description: De novo assembler for single molecule sequencing reads
keywords:
  - assembly
  - genome
  - de novo
  - genome assembler
  - single molecule
tools:
  - "flye":
      description: "Fast and accurate de novo assembler for single molecule sequencing reads"
      homepage: "https://github.com/fenderglass/Flye"
      documentation: "https://github.com/fenderglass/Flye/blob/flye/docs/USAGE.md"
      tool_dev_url: "https://github.com/fenderglass/Flye"
      doi: "10.1038/s41592-020-00971-x"
      # A real YAML list, not a string that merely looks like one.
      licence: ["BSD-3-clause"]

input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test' ]
  - reads:
      type: file
      description: Input reads from Oxford Nanopore or PacBio data in FASTA/FASTQ format.
      pattern: "*.{fasta,fastq,fasta.gz,fastq.gz,fa,fq,fa.gz,fq.gz}"
  - mode:
      type: value
      description: Flye mode depending on the input data (source and error rate)
      pattern: "--pacbio-raw|--pacbio-corr|--pacbio-hifi|--nano-raw|--nano-corr|--nano-hq"

output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test' ]
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - fasta:
      type: file
      description: Assembled FASTA file
      pattern: "*.fasta.gz"
  - gfa:
      type: file
      description: Repeat graph in gfa format
      pattern: "*.gfa.gz"
  - gv:
      type: file
      description: Repeat graph in gv format
      pattern: "*.gv.gz"
  - txt:
      type: file
      description: Extra information and statistics about resulting contigs
      pattern: "*.txt"
  - log:
      type: file
      description: Flye log file
      pattern: "*.log"
  - json:
      type: file
      description: Flye parameters
      pattern: "*.json"

authors:
  - "@mirpedrol"

@ -739,6 +739,10 @@ flash:
- modules/flash/**
- tests/modules/flash/**
# Path globs registered under the `flye` key: the module's sources and its tests.
flye:
- modules/flye/**
- tests/modules/flye/**
freebayes:
- modules/freebayes/**
- tests/modules/freebayes/**

@ -0,0 +1,71 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { FLYE } from '../../../modules/flye/main.nf'
// Runs FLYE in --pacbio-raw mode on the homo_sapiens/pacbio/hifi test-data entry.
workflow test_flye_pacbio_raw {

    // One sample: meta map followed by the reads file (must exist).
    reads_tuple = [
        [ id:'test' ],
        file(params.test_data['homo_sapiens']['pacbio']['hifi'], checkIfExists: true)
    ]

    FLYE ( reads_tuple, "--pacbio-raw" )
}
// Runs FLYE in --pacbio-corr mode on the homo_sapiens/pacbio/hifi test-data entry.
workflow test_flye_pacbio_corr {

    // One sample: meta map followed by the reads file (must exist).
    reads_tuple = [
        [ id:'test' ],
        file(params.test_data['homo_sapiens']['pacbio']['hifi'], checkIfExists: true)
    ]

    FLYE ( reads_tuple, "--pacbio-corr" )
}
// Runs FLYE in --pacbio-hifi mode on the homo_sapiens/pacbio/hifi test-data entry.
workflow test_flye_pacbio_hifi {

    // One sample: meta map followed by the reads file (must exist).
    reads_tuple = [
        [ id:'test' ],
        file(params.test_data['homo_sapiens']['pacbio']['hifi'], checkIfExists: true)
    ]

    FLYE ( reads_tuple, "--pacbio-hifi" )
}
// Runs FLYE in --nano-raw mode on the homo_sapiens/pacbio/hifi test-data entry.
workflow test_flye_nano_raw {

    // One sample: meta map followed by the reads file (must exist).
    reads_tuple = [
        [ id:'test' ],
        file(params.test_data['homo_sapiens']['pacbio']['hifi'], checkIfExists: true)
    ]

    FLYE ( reads_tuple, "--nano-raw" )
}
// Runs FLYE in --nano-corr mode on the homo_sapiens/pacbio/hifi test-data entry.
workflow test_flye_nano_corr {

    // One sample: meta map followed by the reads file (must exist).
    reads_tuple = [
        [ id:'test' ],
        file(params.test_data['homo_sapiens']['pacbio']['hifi'], checkIfExists: true)
    ]

    FLYE ( reads_tuple, "--nano-corr" )
}
// Runs FLYE in --nano-hq mode on the homo_sapiens/pacbio/hifi test-data entry.
workflow test_flye_nano_hq {

    // One sample: meta map followed by the reads file (must exist).
    reads_tuple = [
        [ id:'test' ],
        file(params.test_data['homo_sapiens']['pacbio']['hifi'], checkIfExists: true)
    ]

    FLYE ( reads_tuple, "--nano-hq" )
}

@ -0,0 +1,5 @@
process {
    // Publish each task's outputs under <outdir>/<tool>, where <tool> is the
    // lower-cased first '_'-separated token of the last ':'-separated component
    // of the process name.
    publishDir = {
        def simple_name = task.process.tokenize(':')[-1]
        def tool_name   = simple_name.tokenize('_')[0].toLowerCase()
        "${params.outdir}/${tool_name}"
    }
}

@ -0,0 +1,85 @@
# According to the issue https://github.com/fenderglass/Flye/issues/164
# Some fluctuations are expected because of the heuristics
# Here we check that test.assembly_info.txt contains at least one contig
- name: flye test_flye_pacbio_raw
  # Executed with -stub-run; no checksums are asserted for this entry.
  command: >-
    nextflow run ./tests/modules/flye
    -entry test_flye_pacbio_raw
    -c ./tests/config/nextflow.config
    -c ./tests/modules/flye/nextflow.config
    -stub-run
  tags:
    - flye
  files:
    - path: output/flye/test.assembly.fasta.gz
    - path: output/flye/test.assembly_graph.gfa.gz
    - path: output/flye/test.assembly_graph.gv.gz
    - path: output/flye/test.assembly_info.txt
      contains:
        - "contig_1"
    - path: output/flye/test.flye.log
    - path: output/flye/test.params.json
- name: flye test_flye_pacbio_corr
  command: >-
    nextflow run ./tests/modules/flye
    -entry test_flye_pacbio_corr
    -c ./tests/config/nextflow.config
    -c ./tests/modules/flye/nextflow.config
  tags:
    - flye
  files:
    - path: output/flye/test.assembly.fasta.gz
    - path: output/flye/test.assembly_graph.gfa.gz
    - path: output/flye/test.assembly_graph.gv.gz
    - path: output/flye/test.assembly_info.txt
      contains:
        - "contig_1"
    - path: output/flye/test.flye.log
    # Only the params file is pinned by checksum in this entry.
    - path: output/flye/test.params.json
      md5sum: 54b576cb6d4d27656878a7fd3657bde9
- name: flye test_flye_pacbio_hifi
  command: >-
    nextflow run ./tests/modules/flye
    -entry test_flye_pacbio_hifi
    -c ./tests/config/nextflow.config
    -c ./tests/modules/flye/nextflow.config
  tags:
    - flye
  files:
    - path: output/flye/test.assembly.fasta.gz
    - path: output/flye/test.assembly_graph.gfa.gz
    - path: output/flye/test.assembly_graph.gv.gz
    - path: output/flye/test.assembly_info.txt
      contains:
        - "contig_1"
    - path: output/flye/test.flye.log
    # Only the params file is pinned by checksum in this entry.
    - path: output/flye/test.params.json
      md5sum: 54b576cb6d4d27656878a7fd3657bde9
- name: flye test_flye_nano_raw
  # Executed with -stub-run; no checksums are asserted for this entry.
  command: >-
    nextflow run ./tests/modules/flye
    -entry test_flye_nano_raw
    -c ./tests/config/nextflow.config
    -c ./tests/modules/flye/nextflow.config
    -stub-run
  tags:
    - flye
  files:
    - path: output/flye/test.assembly.fasta.gz
    - path: output/flye/test.assembly_graph.gfa.gz
    - path: output/flye/test.assembly_graph.gv.gz
    - path: output/flye/test.assembly_info.txt
      contains:
        - "contig_1"
    - path: output/flye/test.flye.log
    - path: output/flye/test.params.json
- name: flye test_flye_nano_corr
  command: >-
    nextflow run ./tests/modules/flye
    -entry test_flye_nano_corr
    -c ./tests/config/nextflow.config
    -c ./tests/modules/flye/nextflow.config
  tags:
    - flye
  files:
    - path: output/flye/test.assembly.fasta.gz
    - path: output/flye/test.assembly_graph.gfa.gz
    - path: output/flye/test.assembly_graph.gv.gz
    - path: output/flye/test.assembly_info.txt
      contains:
        - "contig_1"
    - path: output/flye/test.flye.log
    # Only the params file is pinned by checksum in this entry.
    - path: output/flye/test.params.json
      md5sum: 54b576cb6d4d27656878a7fd3657bde9
- name: flye test_flye_nano_hq
  command: >-
    nextflow run ./tests/modules/flye
    -entry test_flye_nano_hq
    -c ./tests/config/nextflow.config
    -c ./tests/modules/flye/nextflow.config
  tags:
    - flye
  files:
    - path: output/flye/test.assembly.fasta.gz
    - path: output/flye/test.assembly_graph.gfa.gz
    - path: output/flye/test.assembly_graph.gv.gz
    - path: output/flye/test.assembly_info.txt
      contains:
        - "contig_1"
    - path: output/flye/test.flye.log
    # Only the params file is pinned by checksum in this entry.
    - path: output/flye/test.params.json
      md5sum: 54b576cb6d4d27656878a7fd3657bde9
Loading…
Cancel
Save