New last/lastal module to align query sequences on a target index (#510)

* New last/lastal to align query sequences on a target index

`lastal` is the main program of the [LAST](https://gitlab.com/mcfrith/last)
suite.  It align query DNA sequences in FASTA or FASTQ format to a
target index of DNA or protein sequences.  The index is produced by
the `lastdb` program (module `last/lastdb`).  The score matrix for
evaluating the alignment can be chosen among preset ones or computed
iteratively by the `last-train` program (module `last/train`).  For
this reason, the `last/lastal` module proposed here has one input
channel containing an optional file, that has to be dummy when not used.

The LAST aligner outputs MAF files that can be very large (up to
hundreds of gigabytes), therefore this module unconditionally compresses
its output with gzip.

This new module is part of the work described in Issue #464.  During
this development, we fix the version of LAST to 1219 to ensure
consistency (hence ignore lint's version warning).

* Apply suggestions from code review

Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>

* Un-hardcode the path to the LAST index.

Among multiple alternatives I have chosen the following command to
detect the sample name of the index, because it fails in situations
where there is no index files in the index folder, and in situations
were there are two indexes files in the folder.  Not failing would
result in feeding garbage information in the INDEX_NAME variable.

    basename \$(ls $index/*.bck) .bck

In case of missing file, a clear error message is given by `ls`.  In
case of more than one file, the error message of `basename` is more
cryptic, unfortunately.  (`basename: extra operand ‘.bck’`)

Alternatives that do not fail if there is no .bck file:

    basename $index/*bck .bck
    find $index -name '*bck' | sed 's/.bck//'

Alternatives that do not fail if there are more than one .bck file:

    basename -s .bck $index/*bck
    ls $index/*.bck | xargs basename -s .bck
    find $index -name '*bck' | sed 's/.bck//'

Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>
This commit is contained in:
Charles Plessy 2021-05-26 06:10:48 +09:00 committed by GitHub
parent 34f555a26a
commit 207930139a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 247 additions and 0 deletions

View file

@ -0,0 +1,70 @@
/*
* -----------------------------------------------------
* Utility functions used in nf-core DSL2 module files
* -----------------------------------------------------
*/
/*
* Extract name of software tool from process name using $task.process
*/
def getSoftwareName(task_process) {
return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()
}
/*
* Function to initialise default values and to generate a Groovy Map of available options for nf-core modules
*/
def initOptions(Map args) {
def Map options = [:]
options.args = args.args ?: ''
options.args2 = args.args2 ?: ''
options.args3 = args.args3 ?: ''
options.publish_by_meta = args.publish_by_meta ?: []
options.publish_dir = args.publish_dir ?: ''
options.publish_files = args.publish_files
options.suffix = args.suffix ?: ''
return options
}
/*
* Tidy up and join elements of a list to return a path string
*/
def getPathFromList(path_list) {
def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries
paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes
return paths.join('/')
}
/*
* Function to save/publish module results
*/
def saveFiles(Map args) {
if (!args.filename.endsWith('.version.txt')) {
def ioptions = initOptions(args.options)
def path_list = [ ioptions.publish_dir ?: args.publish_dir ]
if (ioptions.publish_by_meta) {
def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta
for (key in key_list) {
if (args.meta && key instanceof String) {
def path = key
if (args.meta.containsKey(key)) {
path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key]
}
path = path instanceof String ? path : ''
path_list.add(path)
}
}
}
if (ioptions.publish_files instanceof Map) {
for (ext in ioptions.publish_files) {
if (args.filename.endsWith(ext.key)) {
def ext_list = path_list.collect()
ext_list.add(ext.value)
return "${getPathFromList(ext_list)}/$args.filename"
}
}
} else if (ioptions.publish_files == null) {
return "${getPathFromList(path_list)}/$args.filename"
}
}
}

View file

@ -0,0 +1,48 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName } from './functions'
params.options = [:]
options = initOptions(params.options)
process LAST_LASTAL {
tag "$meta.id"
label 'process_high'
publishDir "${params.outdir}",
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) }
conda (params.enable_conda ? "bioconda::last=1219" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/last:1219--h2e03b76_0"
} else {
container "quay.io/biocontainers/last:1219--h2e03b76_0"
}
input:
tuple val(meta), path(fastx)
path index
path param_file
output:
tuple val(meta), path("*.maf.gz"), emit: maf
path "*.version.txt" , emit: version
script:
def software = getSoftwareName(task.process)
def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}"
def trained_params = param_file ? "-p ${param_file}" : ''
"""
INDEX_NAME=\$(basename \$(ls $index/*.bck) .bck)
lastal \\
$trained_params \\
$options.args \\
-P $task.cpus \\
${index}/\$INDEX_NAME \\
$fastx \\
| gzip --no-name > ${prefix}.\$INDEX_NAME.maf.gz
# gzip needs --no-name otherwise it puts a timestamp in the file,
# which makes its checksum non-reproducible.
echo \$(lastal --version 2>&1) | sed 's/lastal //' > ${software}.version.txt
"""
}

View file

@ -0,0 +1,52 @@
name: last_lastal
description: Find suitable score parameters for sequence alignment
keywords:
- LAST
- align
- fastq
- fasta
tools:
- last:
description: LAST finds & aligns related regions of sequences.
homepage: https://gitlab.com/mcfrith/last
documentation: https://gitlab.com/mcfrith/last/-/blob/main/doc/last-train.rst
tool_dev_url: https://gitlab.com/mcfrith/last
doi: ""
licence: ['GPL v3-or-later']
input:
- index:
type: directory
description: Directory containing the files of the LAST index
pattern: "lastdb/"
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- fastx:
type: file
description: FASTA/FASTQ file
pattern: "*.{fasta,fastq}"
- param_file:
type: file
description: Trained parameter file
pattern: "*.par"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- version:
type: file
description: File containing software version
pattern: "*.{version.txt}"
- maf:
type: file
description: Gzipped MAF (Multiple Alignment Format) file
pattern: "*.{maf.gz}"
authors:
- "@charles-plessy"

View file

@ -370,6 +370,10 @@ kraken2/run:
- software/untar/**
- tests/software/kraken2/run/**
last/lastal:
- software/last/lastal/**
- tests/software/last/lastal/**
last/lastdb:
- software/last/lastdb/**
- tests/software/last/lastdb/**

View file

@ -28,6 +28,7 @@ params {
informative_sites_fas = "${test_data_dir}/genomics/sarscov2/genome/alignment/informative_sites.fas"
contigs_genome_maf_gz = "${test_data_dir}/genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz"
contigs_genome_par = "${test_data_dir}/genomics/sarscov2/genome/alignment/last/contigs.genome.par"
lastdb_tar_gz = "${test_data_dir}/genomics/sarscov2/genome/alignment/last/lastdb.tar.gz"
}
'illumina' {

View file

@ -0,0 +1,27 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { UNTAR } from '../../../../software/untar/main.nf' addParams( options: [:] )
include { LAST_LASTAL } from '../../../../software/last/lastal/main.nf' addParams( options: [:] )
workflow test_last_lastal_with_dummy_param_file {
input = [ [ id:'contigs', single_end:false ], // meta map
file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true) ]
db = [ file(params.test_data['sarscov2']['genome']['lastdb_tar_gz'], checkIfExists: true) ]
UNTAR ( db )
LAST_LASTAL ( input, UNTAR.out.untar, [] )
}
workflow test_last_lastal_with_real_param_file {
input = [ [ id:'contigs', single_end:false ], // meta map
file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true) ]
db = [ file(params.test_data['sarscov2']['genome']['lastdb_tar_gz'], checkIfExists: true) ]
param_file = [ file(params.test_data['sarscov2']['genome']['contigs_genome_par'], checkIfExists: true) ]
UNTAR ( db )
LAST_LASTAL ( input, UNTAR.out.untar, param_file )
}

View file

@ -0,0 +1,45 @@
- name: last lastal test_last_lastal_with_dummy_param_file
command: nextflow run tests/software/last/lastal -entry test_last_lastal_with_dummy_param_file -c tests/config/nextflow.config
tags:
- last
- last/lastal
files:
- path: output/last/contigs.genome.maf.gz
md5sum: 33e9098d8242b0aac829339c03fe94aa
- path: output/untar/lastdb/genome.bck
md5sum: 5519879b9b6c4d1fc508da7f17f88f2e
- path: output/untar/lastdb/genome.des
md5sum: 3a9ea6d336e113a74d7fdca5e7b623fc
- path: output/untar/lastdb/genome.prj
md5sum: 489715f14b0fea6273822696e72357f9
- path: output/untar/lastdb/genome.sds
md5sum: 2cd381f4f8a9c52cfcd323a2863eccb2
- path: output/untar/lastdb/genome.ssp
md5sum: 4137fb6fe9df2b3d78d5b960390aac7b
- path: output/untar/lastdb/genome.suf
md5sum: 1895efa8653e8e9bd3605cff0408ed33
- path: output/untar/lastdb/genome.tis
md5sum: b7c40f06b1309dc6f37849eeb86dfd22
- name: last lastal test_last_lastal_with_real_param_file
command: nextflow run tests/software/last/lastal -entry test_last_lastal_with_real_param_file -c tests/config/nextflow.config
tags:
- last
- last/lastal
files:
- path: output/last/contigs.genome.maf.gz
md5sum: 3a0f42e76da9549748983ac4d7ff7473
- path: output/untar/lastdb/genome.bck
md5sum: 5519879b9b6c4d1fc508da7f17f88f2e
- path: output/untar/lastdb/genome.des
md5sum: 3a9ea6d336e113a74d7fdca5e7b623fc
- path: output/untar/lastdb/genome.prj
md5sum: 489715f14b0fea6273822696e72357f9
- path: output/untar/lastdb/genome.sds
md5sum: 2cd381f4f8a9c52cfcd323a2863eccb2
- path: output/untar/lastdb/genome.ssp
md5sum: 4137fb6fe9df2b3d78d5b960390aac7b
- path: output/untar/lastdb/genome.suf
md5sum: 1895efa8653e8e9bd3605cff0408ed33
- path: output/untar/lastdb/genome.tis
md5sum: b7c40f06b1309dc6f37849eeb86dfd22