From 207930139a314b04ce8b9cac4090df31fad30257 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Wed, 26 May 2021 06:10:48 +0900 Subject: [PATCH] New last/lastal module to align query sequences on a target index (#510) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * New last/lastal to align query sequences on a target index `lastal` is the main program of the [LAST](https://gitlab.com/mcfrith/last) suite. It align query DNA sequences in FASTA or FASTQ format to a target index of DNA or protein sequences. The index is produced by the `lastdb` program (module `last/lastdb`). The score matrix for evaluating the alignment can be chosen among preset ones or computed iteratively by the `last-train` program (module `last/train`). For this reason, the `last/lastal` module proposed here has one input channel containing an optional file, that has to be dummy when not used. The LAST aligner outputs MAF files that can be very large (up to hundreds of gigabytes), therefore this module unconditionally compresses its output with gzip. This new module is part of the work described in Issue #464. During this development, we fix the version of LAST to 1219 to ensure consistency (hence ignore lint's version warning). * Apply suggestions from code review Co-authored-by: Harshil Patel * Un-hardcode the path to the LAST index. Among multiple alternatives I have chosen the following command to detect the sample name of the index, because it fails in situations where there is no index files in the index folder, and in situations were there are two indexes files in the folder. Not failing would result in feeding garbage information in the INDEX_NAME variable. basename \$(ls $index/*.bck) .bck In case of missing file, a clear error message is given by `ls`. In case of more than one file, the error message of `basename` is more cryptic, unfortunately. (`basename: extra operand ‘.bck’`) Alternatives that do not fail if there is no .bck file: basename $index/*bck .bck find $index -name '*bck' | sed 's/.bck//' Alternatives that do not fail if there are more than one .bck file: basename -s .bck $index/*bck ls $index/*.bck | xargs basename -s .bck find $index -name '*bck' | sed 's/.bck//' Co-authored-by: Harshil Patel --- software/last/lastal/functions.nf | 70 +++++++++++++++++++++++++++++ software/last/lastal/main.nf | 48 ++++++++++++++++++++ software/last/lastal/meta.yml | 52 +++++++++++++++++++++ tests/config/pytest_software.yml | 4 ++ tests/config/test_data.config | 1 + tests/software/last/lastal/main.nf | 27 +++++++++++ tests/software/last/lastal/test.yml | 45 +++++++++++++++++++ 7 files changed, 247 insertions(+) create mode 100644 software/last/lastal/functions.nf create mode 100644 software/last/lastal/main.nf create mode 100644 software/last/lastal/meta.yml create mode 100644 tests/software/last/lastal/main.nf create mode 100644 tests/software/last/lastal/test.yml diff --git a/software/last/lastal/functions.nf b/software/last/lastal/functions.nf new file mode 100644 index 00000000..9d0137e3 --- /dev/null +++ b/software/last/lastal/functions.nf @@ -0,0 +1,70 @@ +/* + * ----------------------------------------------------- + * Utility functions used in nf-core DSL2 module files + * ----------------------------------------------------- + */ + +/* + * Extract name of software tool from process name using $task.process + */ +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +/* + * Function to initialise default values and to generate a Groovy Map of available options for nf-core modules + */ +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.args3 = args.args3 ?: '' + options.publish_by_meta = args.publish_by_meta ?: [] + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +/* + * Tidy up and join elements of a list to return a path string + */ +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +/* + * Function to save/publish module results + */ +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_meta) { + def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta + for (key in key_list) { + if (args.meta && key instanceof String) { + def path = key + if (args.meta.containsKey(key)) { + path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] + } + path = path instanceof String ? path : '' + path_list.add(path) + } + } + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } + } +} diff --git a/software/last/lastal/main.nf b/software/last/lastal/main.nf new file mode 100644 index 00000000..83a24d6c --- /dev/null +++ b/software/last/lastal/main.nf @@ -0,0 +1,48 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process LAST_LASTAL { + tag "$meta.id" + label 'process_high' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? "bioconda::last=1219" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/last:1219--h2e03b76_0" + } else { + container "quay.io/biocontainers/last:1219--h2e03b76_0" + } + + input: + tuple val(meta), path(fastx) + path index + path param_file + + output: + tuple val(meta), path("*.maf.gz"), emit: maf + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" + def trained_params = param_file ? "-p ${param_file}" : '' + """ + INDEX_NAME=\$(basename \$(ls $index/*.bck) .bck) + lastal \\ + $trained_params \\ + $options.args \\ + -P $task.cpus \\ + ${index}/\$INDEX_NAME \\ + $fastx \\ + | gzip --no-name > ${prefix}.\$INDEX_NAME.maf.gz + # gzip needs --no-name otherwise it puts a timestamp in the file, + # which makes its checksum non-reproducible. + + echo \$(lastal --version 2>&1) | sed 's/lastal //' > ${software}.version.txt + """ +} diff --git a/software/last/lastal/meta.yml b/software/last/lastal/meta.yml new file mode 100644 index 00000000..2237c75a --- /dev/null +++ b/software/last/lastal/meta.yml @@ -0,0 +1,52 @@ +name: last_lastal +description: Find suitable score parameters for sequence alignment +keywords: + - LAST + - align + - fastq + - fasta +tools: + - last: + description: LAST finds & aligns related regions of sequences. + homepage: https://gitlab.com/mcfrith/last + documentation: https://gitlab.com/mcfrith/last/-/blob/main/doc/last-train.rst + tool_dev_url: https://gitlab.com/mcfrith/last + doi: "" + licence: ['GPL v3-or-later'] + +input: + - index: + type: directory + description: Directory containing the files of the LAST index + pattern: "lastdb/" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastx: + type: file + description: FASTA/FASTQ file + pattern: "*.{fasta,fastq}" + - param_file: + type: file + description: Trained parameter file + pattern: "*.par" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" + - maf: + type: file + description: Gzipped MAF (Multiple Alignment Format) file + pattern: "*.{maf.gz}" + +authors: + - "@charles-plessy" diff --git a/tests/config/pytest_software.yml b/tests/config/pytest_software.yml index b538f2b7..126368cc 100644 --- a/tests/config/pytest_software.yml +++ b/tests/config/pytest_software.yml @@ -370,6 +370,10 @@ kraken2/run: - software/untar/** - tests/software/kraken2/run/** +last/lastal: + - software/last/lastal/** + - tests/software/last/lastal/** + last/lastdb: - software/last/lastdb/** - tests/software/last/lastdb/** diff --git a/tests/config/test_data.config b/tests/config/test_data.config index 0b497ff9..228a327f 100644 --- a/tests/config/test_data.config +++ b/tests/config/test_data.config @@ -28,6 +28,7 @@ params { informative_sites_fas = "${test_data_dir}/genomics/sarscov2/genome/alignment/informative_sites.fas" contigs_genome_maf_gz = "${test_data_dir}/genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz" + contigs_genome_par = "${test_data_dir}/genomics/sarscov2/genome/alignment/last/contigs.genome.par" lastdb_tar_gz = "${test_data_dir}/genomics/sarscov2/genome/alignment/last/lastdb.tar.gz" } 'illumina' { diff --git a/tests/software/last/lastal/main.nf b/tests/software/last/lastal/main.nf new file mode 100644 index 00000000..68dd7ee9 --- /dev/null +++ b/tests/software/last/lastal/main.nf @@ -0,0 +1,27 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +include { UNTAR } from '../../../../software/untar/main.nf' addParams( options: [:] ) +include { LAST_LASTAL } from '../../../../software/last/lastal/main.nf' addParams( options: [:] ) + +workflow test_last_lastal_with_dummy_param_file { + + input = [ [ id:'contigs', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true) ] + db = [ file(params.test_data['sarscov2']['genome']['lastdb_tar_gz'], checkIfExists: true) ] + + UNTAR ( db ) + LAST_LASTAL ( input, UNTAR.out.untar, [] ) +} + +workflow test_last_lastal_with_real_param_file { + + input = [ [ id:'contigs', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true) ] + db = [ file(params.test_data['sarscov2']['genome']['lastdb_tar_gz'], checkIfExists: true) ] + param_file = [ file(params.test_data['sarscov2']['genome']['contigs_genome_par'], checkIfExists: true) ] + + UNTAR ( db ) + LAST_LASTAL ( input, UNTAR.out.untar, param_file ) +} diff --git a/tests/software/last/lastal/test.yml b/tests/software/last/lastal/test.yml new file mode 100644 index 00000000..5bb41dc7 --- /dev/null +++ b/tests/software/last/lastal/test.yml @@ -0,0 +1,45 @@ +- name: last lastal test_last_lastal_with_dummy_param_file + command: nextflow run tests/software/last/lastal -entry test_last_lastal_with_dummy_param_file -c tests/config/nextflow.config + tags: + - last + - last/lastal + files: + - path: output/last/contigs.genome.maf.gz + md5sum: 33e9098d8242b0aac829339c03fe94aa + - path: output/untar/lastdb/genome.bck + md5sum: 5519879b9b6c4d1fc508da7f17f88f2e + - path: output/untar/lastdb/genome.des + md5sum: 3a9ea6d336e113a74d7fdca5e7b623fc + - path: output/untar/lastdb/genome.prj + md5sum: 489715f14b0fea6273822696e72357f9 + - path: output/untar/lastdb/genome.sds + md5sum: 2cd381f4f8a9c52cfcd323a2863eccb2 + - path: output/untar/lastdb/genome.ssp + md5sum: 4137fb6fe9df2b3d78d5b960390aac7b + - path: output/untar/lastdb/genome.suf + md5sum: 1895efa8653e8e9bd3605cff0408ed33 + - path: output/untar/lastdb/genome.tis + md5sum: b7c40f06b1309dc6f37849eeb86dfd22 + +- name: last lastal test_last_lastal_with_real_param_file + command: nextflow run tests/software/last/lastal -entry test_last_lastal_with_real_param_file -c tests/config/nextflow.config + tags: + - last + - last/lastal + files: + - path: output/last/contigs.genome.maf.gz + md5sum: 3a0f42e76da9549748983ac4d7ff7473 + - path: output/untar/lastdb/genome.bck + md5sum: 5519879b9b6c4d1fc508da7f17f88f2e + - path: output/untar/lastdb/genome.des + md5sum: 3a9ea6d336e113a74d7fdca5e7b623fc + - path: output/untar/lastdb/genome.prj + md5sum: 489715f14b0fea6273822696e72357f9 + - path: output/untar/lastdb/genome.sds + md5sum: 2cd381f4f8a9c52cfcd323a2863eccb2 + - path: output/untar/lastdb/genome.ssp + md5sum: 4137fb6fe9df2b3d78d5b960390aac7b + - path: output/untar/lastdb/genome.suf + md5sum: 1895efa8653e8e9bd3605cff0408ed33 + - path: output/untar/lastdb/genome.tis + md5sum: b7c40f06b1309dc6f37849eeb86dfd22