From 5b1cea7f7f38b78a8fd8e2b90b1b877bd117ed96 Mon Sep 17 00:00:00 2001
From: Jose Espinosa-Carrasco
Date: Thu, 30 Sep 2021 14:37:35 +0200
Subject: [PATCH] Add bbmap/bbsplit module (#771)

* Add bbmap/bbsplit module

* Conda complains about md5sum

* Apply suggestions from code review

Co-authored-by: Harshil Patel
---
 modules/bbmap/bbsplit/functions.nf   | 78 ++++++++++++++++++++++
 modules/bbmap/bbsplit/main.nf        | 96 ++++++++++++++++++++++++++++
 modules/bbmap/bbsplit/meta.yml       | 75 ++++++++++++++++++++++
 tests/config/pytest_modules.yml      |  4 ++
 tests/modules/bbmap/bbsplit/main.nf  | 22 +++++++
 tests/modules/bbmap/bbsplit/test.yml | 24 +++++++
 6 files changed, 299 insertions(+)
 create mode 100644 modules/bbmap/bbsplit/functions.nf
 create mode 100644 modules/bbmap/bbsplit/main.nf
 create mode 100644 modules/bbmap/bbsplit/meta.yml
 create mode 100644 tests/modules/bbmap/bbsplit/main.nf
 create mode 100644 tests/modules/bbmap/bbsplit/test.yml

diff --git a/modules/bbmap/bbsplit/functions.nf b/modules/bbmap/bbsplit/functions.nf
new file mode 100644
index 00000000..85628ee0
--- /dev/null
+++ b/modules/bbmap/bbsplit/functions.nf
@@ -0,0 +1,78 @@
+//
+// Utility functions used in nf-core DSL2 module files
+//
+
+//
+// Extract name of software tool from process name using $task.process
+//
+def getSoftwareName(task_process) {
+    return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()
+}
+
+//
+// Extract name of module from process name using $task.process
+//
+def getProcessName(task_process) {
+    return task_process.tokenize(':')[-1]
+}
+
+//
+// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules
+//
+def initOptions(Map args) {
+    def Map options = [:]
+    options.args            = args.args ?: ''
+    options.args2           = args.args2 ?: ''
+    options.args3           = args.args3 ?: ''
+    options.publish_by_meta = args.publish_by_meta ?: []
+    options.publish_dir     = args.publish_dir ?: ''
+    options.publish_files   = args.publish_files
+    options.suffix          = args.suffix ?: ''
+    return options
+}
+
+//
+// Tidy up and join elements of a list to return a path string
+//
+def getPathFromList(path_list) {
+    def paths = path_list.findAll { item -> !item?.trim().isEmpty() }      // Remove empty entries
+    paths     = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes
+    return paths.join('/')
+}
+
+//
+// Function to save/publish module results
+//
+def saveFiles(Map args) {
+    def ioptions  = initOptions(args.options)
+    def path_list = [ ioptions.publish_dir ?: args.publish_dir ]
+
+    // Do not publish versions.yml unless running from pytest workflow
+    if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) {
+        return null
+    }
+    if (ioptions.publish_by_meta) {
+        def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta
+        for (key in key_list) {
+            if (args.meta && key instanceof String) {
+                def path = key
+                if (args.meta.containsKey(key)) {
+                    path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key]
+                }
+                path = path instanceof String ? path : ''
+                path_list.add(path)
+            }
+        }
+    }
+    if (ioptions.publish_files instanceof Map) {
+        for (ext in ioptions.publish_files) {
+            if (args.filename.endsWith(ext.key)) {
+                def ext_list = path_list.collect()
+                ext_list.add(ext.value)
+                return "${getPathFromList(ext_list)}/$args.filename"
+            }
+        }
+    } else if (ioptions.publish_files == null) {
+        return "${getPathFromList(path_list)}/$args.filename"
+    }
+}
diff --git a/modules/bbmap/bbsplit/main.nf b/modules/bbmap/bbsplit/main.nf
new file mode 100644
index 00000000..614a4c02
--- /dev/null
+++ b/modules/bbmap/bbsplit/main.nf
@@ -0,0 +1,96 @@
+// Import generic module functions
+include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions'
+
+params.options = [:]
+options        = initOptions(params.options)
+
+process BBMAP_BBSPLIT {
+    tag "$meta.id"
+    label 'process_high'
+    publishDir "${params.outdir}",
+        mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) }
+
+    conda (params.enable_conda ? "bioconda::bbmap=38.93" : null)
+    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+        container "https://depot.galaxyproject.org/singularity/bbmap:38.93--he522d1c_0"
+    } else {
+        container "quay.io/biocontainers/bbmap:38.93--he522d1c_0"
+    }
+
+    input:
+    tuple val(meta), path(reads)
+    path  index
+    path  primary_ref
+    tuple val(other_ref_names), path (other_ref_paths)
+    val   only_build_index
+
+    output:
+    path "bbsplit"                            , optional:true, emit: index
+    tuple val(meta), path('*primary*fastq.gz'), optional:true, emit: primary_fastq
+    tuple val(meta), path('*fastq.gz')        , optional:true, emit: all_fastq
+    tuple val(meta), path('*txt')             , optional:true, emit: stats
+    path "versions.yml"                       , emit: version
+
+    script:
+    def software = getSoftwareName(task.process)
+    def prefix   = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}"
+
+    def avail_mem = 3 // Heap size in GB handed to bbsplit.sh via -Xmx; replaced by task.memory when it is set
+    if (!task.memory) {
+        log.info '[BBSplit] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = task.memory.giga
+    }
+
+    def other_refs = []
+    other_ref_names.eachWithIndex { name, idx -> // 'idx', not 'index', so the 'index' process input is not shadowed
+        other_refs << "ref_${name}=${other_ref_paths[idx]}"
+    }
+    if (only_build_index) { // Index-only mode: write the index to ./bbsplit, no reads are passed to bbsplit.sh
+        if (primary_ref && other_ref_names && other_ref_paths) {
+            """
+            bbsplit.sh \\
+                -Xmx${avail_mem}g \\
+                ref_primary=$primary_ref \\
+                ${other_refs.join(' ')} \\
+                path=bbsplit \\
+                threads=$task.cpus \\
+                $options.args
+
+            cat <<-END_VERSIONS > versions.yml
+            ${getProcessName(task.process)}:
+                ${getSoftwareName(task.process)}: \$(bbversion.sh 2>&1)
+            END_VERSIONS
+            """
+        } else {
+            log.error 'ERROR: Please specify as input a primary fasta file along with names and paths to non-primary fasta files.'
+        }
+    } else {
+        def index_files = ''
+        if (index) { // Prefer a pre-built index; otherwise pass the reference FASTA files directly
+            index_files = "path=$index"
+        } else if (primary_ref && other_ref_names && other_ref_paths) {
+            index_files = "ref_primary=${primary_ref} ${other_refs.join(' ')}"
+        } else {
+            log.error 'ERROR: Please either specify a BBSplit index as input or a primary fasta file along with names and paths to non-primary fasta files.'
+        }
+        def fastq_in  = meta.single_end ? "in=${reads}" : "in=${reads[0]} in2=${reads[1]}"
+        def fastq_out = meta.single_end ? "basename=${prefix}_%.fastq.gz" : "basename=${prefix}_%_#.fastq.gz"
+        """
+        bbsplit.sh \\
+            -Xmx${avail_mem}g \\
+            $index_files \\
+            threads=$task.cpus \\
+            $fastq_in \\
+            $fastq_out \\
+            refstats=${prefix}.stats.txt \\
+            $options.args
+
+        cat <<-END_VERSIONS > versions.yml
+        ${getProcessName(task.process)}:
+            ${getSoftwareName(task.process)}: \$(bbversion.sh 2>&1)
+        END_VERSIONS
+        """
+    }
+}
diff --git a/modules/bbmap/bbsplit/meta.yml b/modules/bbmap/bbsplit/meta.yml
new file mode 100644
index 00000000..2eb3a6c9
--- /dev/null
+++ b/modules/bbmap/bbsplit/meta.yml
@@ -0,0 +1,75 @@
+name: bbmap_bbsplit
+description: Split sequencing reads by mapping them to multiple references simultaneously
+keywords:
+  - align
+  - map
+  - genome
+  - reference
+tools:
+  - bbmap:
+      description: BBMap is a short read aligner, as well as various other bioinformatic tools.
+      homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/
+      documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/
+      tool_dev_url: None
+      doi: ""
+      licence: ['UC-LBL license (see package)']
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+        respectively.
+  - index:
+      type: directory
+      description: Directory containing a pre-built BBSplit index, if available
+      pattern: "*"
+  - primary_ref:
+      type: path
+      description: Path to the primary reference
+      pattern: "*"
+  - other_ref_names:
+      type: list
+      description: List of other reference ids apart from the primary
+  - other_ref_paths:
+      type: list
+      description: Paths to the other references, in the same order as "other_ref_names"
+  - only_build_index:
+      type: boolean
+      description: true = only build index; false = mapping
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - version:
+      type: file
+      description: File containing software version
+      pattern: "versions.yml"
+  - index:
+      type: directory
+      description: Directory with index files
+      pattern: "bbsplit"
+  - primary_fastq:
+      type: file
+      description: Output reads that map to the primary reference
+      pattern: "*primary*fastq.gz"
+  - all_fastq:
+      type: file
+      description: All reads mapping to any of the references
+      pattern: "*fastq.gz"
+  - stats:
+      type: file
+      description: Tab-delimited text file containing mapping statistics
+      pattern: "*.txt"
+
+authors:
+  - "@joseespinosa"
+  - "@drpatelh"
diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml
index 3dda1d94..63152fe0 100644
--- a/tests/config/pytest_modules.yml
+++ b/tests/config/pytest_modules.yml
@@ -46,6 +46,10 @@ bbmap/bbduk:
   - modules/bbmap/bbduk/**
   - tests/modules/bbmap/bbduk/**
 
+bbmap/bbsplit:
+  - modules/bbmap/bbsplit/**
+  - tests/modules/bbmap/bbsplit/**
+
 bbmap/index:
   - modules/bbmap/index/**
   - tests/modules/bbmap/index/**
diff --git a/tests/modules/bbmap/bbsplit/main.nf b/tests/modules/bbmap/bbsplit/main.nf
new file mode 100644
index 00000000..1d3c30c1
--- /dev/null
+++ b/tests/modules/bbmap/bbsplit/main.nf
@@ -0,0 +1,22 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { BBMAP_BBSPLIT as BBMAP_BBSPLIT_INDEX } from '../../../../modules/bbmap/bbsplit/main.nf' addParams( options: [:] )
+include { BBMAP_BBSPLIT as BBMAP_BBSPLIT_SPLIT } from '../../../../modules/bbmap/bbsplit/main.nf' addParams( options: [:] )
+
+workflow test_bbmap_bbsplit {
+
+    input = [
+        [ id:'test', single_end:true ], // meta map
+        file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)
+    ]
+    bbsplit_fasta_list = [
+        ['human'],
+        file('https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/chr22_23800000-23980000.fa', checkIfExists: true)
+    ]
+    fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+
+    BBMAP_BBSPLIT_INDEX ( [ [:], [] ], [], fasta, bbsplit_fasta_list, true )
+    BBMAP_BBSPLIT_SPLIT ( input, BBMAP_BBSPLIT_INDEX.out.index, fasta, bbsplit_fasta_list, true )
+}
diff --git a/tests/modules/bbmap/bbsplit/test.yml b/tests/modules/bbmap/bbsplit/test.yml
new file mode 100644
index 00000000..87bdebea
--- /dev/null
+++ b/tests/modules/bbmap/bbsplit/test.yml
@@ -0,0 +1,24 @@
+- name: bbmap bbsplit test_bbmap_bbsplit
+  command: nextflow run tests/modules/bbmap/bbsplit -entry test_bbmap_bbsplit -c tests/config/nextflow.config
+  tags:
+    - bbmap/bbsplit
+    - bbmap
+  files:
+    - path: output/bbmap/bbsplit/ref/genome/1/chr1.chrom.gz
+    - path: output/bbmap/bbsplit/ref/genome/1/info.txt
+      contains:
+        - 'Chromosome'
+    - path: output/bbmap/bbsplit/ref/genome/1/merged_ref_9222711925172838098.fa.gz
+    - path: output/bbmap/bbsplit/ref/genome/1/namelist.txt
+      md5sum: 45e7a4cdc7a11a39ada56844ca3a1e30
+    - path: output/bbmap/bbsplit/ref/genome/1/reflist.txt
+      contains:
+        - 'genome.fasta'
+    - path: output/bbmap/bbsplit/ref/genome/1/scaffolds.txt.gz
+    - path: output/bbmap/bbsplit/ref/genome/1/summary.txt
+      contains:
+        - 'scaffolds'
+    - path: output/bbmap/bbsplit/ref/index/1/chr1_index_k13_c13_b1.block
+      md5sum: 385913c1e84b77dc7bf36288ee1c8706
+    - path: output/bbmap/bbsplit/ref/index/1/chr1_index_k13_c13_b1.block2.gz
+      md5sum: 9de572b603abe5b6540056db8dee05a5