New module last/mafswap to reorder sequences in alignments (#500)

* New module last/mafswap to reorder sequences in alignments

The `maf-swap` tool distributed with [LAST](https://gitlab.com/mcfrith/last)
reorders sequences in alignment files in Multiple Alignment Format.
When run without command-line arguments, it will swap the target and the
query sequences.  This is useful when turning a many-to-many alignment
into a many-to-one and then a one-to-one alignment in conjunction with
the `last-split` command (split, swap, split and swap again).

The LAST aligner outputs MAF files, but other tools also use this
format.  As MAF files can be very large (up to hundreds of gigabytes),
the module expects its input to be compressed with gzip and will
compress its output.

This new module is part of the work described in Issue #464.  During
this development, the version of LAST is pinned to 1219 to ensure
consistency (hence the lint warning about the version can be ignored).

* Update MD5 sum.

Actually, 7029066c27ac6f5ef18d660d5741979a is the MD5 sum of
an empty file compressed with `gzip --no-name`. This happened
because I forgot to update the config file after correcting the
module… sorry!

* Apply suggestions from code review

Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>

* Change name as suggested in pull request.

Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>
This commit is contained in:
Charles Plessy 2021-05-19 16:59:23 +09:00 committed by GitHub
parent b592cea30b
commit e75f88c68a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 172 additions and 0 deletions

View file

@ -0,0 +1,70 @@
/*
* -----------------------------------------------------
* Utility functions used in nf-core DSL2 module files
* -----------------------------------------------------
*/
/*
 * Derive the software tool name from a process name such as $task.process.
 *
 * The name is split on ':' and the last component is kept; that component is
 * then split on '_' and its first piece is returned in lower case.
 */
def getSoftwareName(task_process) {
    // Last ':'-separated component, i.e. the process name without its enclosing scopes
    def process_name = task_process.tokenize(':')[-1]
    // First '_'-separated piece of that name
    def tool_name = process_name.tokenize('_')[0]
    return tool_name.toLowerCase()
}
/*
 * Build the Groovy Map of options understood by nf-core modules, filling in
 * a default for every key that is missing (or falsy) in the supplied map.
 *
 * publish_files is copied through unchanged, so it stays null when absent.
 */
def initOptions(Map args) {
    return [
        args            : args.args ?: '',
        args2           : args.args2 ?: '',
        args3           : args.args3 ?: '',
        publish_by_meta : args.publish_by_meta ?: [],
        publish_dir     : args.publish_dir ?: '',
        publish_files   : args.publish_files,
        suffix          : args.suffix ?: ''
    ]
}
/*
 * Tidy up and join elements of a list to return a path string.
 *
 * Null, empty and whitespace-only entries are dropped. Each remaining entry is
 * trimmed of surrounding whitespace and of leading/trailing slashes, and the
 * results are joined with '/'.
 */
def getPathFromList(path_list) {
    // Keep only entries with visible content. Groovy truth treats both null and ''
    // as false, so a null entry is skipped instead of raising a NullPointerException
    // (safe navigation does not short-circuit a following .isEmpty() call).
    def paths = path_list.findAll { item -> item?.trim() }
    // Trim whitespace, then strip leading and trailing slashes
    paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") }
    return paths.join('/')
}
/*
 * Work out the relative path under which a module output file is published.
 *
 * Returns null (nothing is published) for '*.version.txt' files, for files that
 * match no suffix in a publish_files Map, and when publish_files is set to
 * something that is neither a Map nor null.
 */
def saveFiles(Map args) {
    // Version files are never published
    if (args.filename.endsWith('.version.txt')) {
        return null
    }

    def opts = initOptions(args.options)

    // Base directory: the module option wins over the caller-supplied default
    def segments = [ opts.publish_dir ?: args.publish_dir ]

    // Optionally append one sub-directory per requested meta key
    if (opts.publish_by_meta) {
        def meta_keys = opts.publish_by_meta instanceof List ? opts.publish_by_meta : args.publish_by_meta
        for (key in meta_keys) {
            if (!(args.meta && key instanceof String)) {
                continue
            }
            // Fall back to the key itself when meta has no such entry
            def segment = key
            if (args.meta.containsKey(key)) {
                segment = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key]
            }
            // Anything that is not a plain String contributes an empty segment
            segments.add(segment instanceof String ? segment : '')
        }
    }

    // publish_files as a Map: publish only files whose name ends with one of its keys,
    // placing them in the sub-directory given by the matching value
    if (opts.publish_files instanceof Map) {
        for (entry in opts.publish_files) {
            if (args.filename.endsWith(entry.key)) {
                def target = segments.collect()
                target.add(entry.value)
                return "${getPathFromList(target)}/$args.filename"
            }
        }
        return null
    }

    // publish_files unset: publish every file
    if (opts.publish_files == null) {
        return "${getPathFromList(segments)}/$args.filename"
    }

    return null
}

View file

@ -0,0 +1,37 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName } from './functions'
// Module options default to an empty map here; whatever ends up in params.options
// is normalised by initOptions() so that every expected key is present.
params.options = [:]
options = initOptions(params.options)
/*
 * LAST_MAFSWAP
 *
 * Decompresses the input MAF file, passes it through `maf-swap` with any extra
 * arguments from options.args, and writes the result gzip-compressed to
 * <prefix>.swapped.maf.gz. Also records the LAST version in <software>.version.txt.
 *
 * input : tuple of meta (meta.id is read for the tag and the output prefix)
 *         and the MAF file to process
 * output: maf     - tuple of meta and the files matching *.maf.gz
 *         version - the *.version.txt file
 */
process LAST_MAFSWAP {
tag "$meta.id"
label 'process_low'
// Published paths are computed by saveFiles(); the default publish directory is the
// software name and a sub-directory per meta 'id' is requested via publish_by_meta.
publishDir "${params.outdir}",
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) }
// LAST is pinned to version 1219 for conda and for both container images below.
conda (params.enable_conda ? "bioconda::last=1219" : null)
// Use the Singularity image when running under Singularity, unless
// params.singularity_pull_docker_container is set; otherwise use the quay.io image.
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/last:1219--h2e03b76_0"
} else {
container "quay.io/biocontainers/last:1219--h2e03b76_0"
}
input:
tuple val(meta), path(maf)
output:
tuple val(meta), path("*.maf.gz"), emit: maf
path "*.version.txt" , emit: version
script:
def software = getSoftwareName(task.process)
// Output prefix is meta.id, with options.suffix appended when one is set.
def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}"
// The input is read with zcat, so it is expected to be gzip-compressed.
// NOTE(review): `gzip --no-name` presumably keeps the output checksum reproducible
// for the md5sum-based test — confirm.
"""
zcat $maf | maf-swap $options.args | gzip --no-name > ${prefix}.swapped.maf.gz
# maf-swap has no --version option but lastdb, part of the same package, has.
echo \$(lastdb --version 2>&1) | sed 's/lastdb //' > ${software}.version.txt
"""
}

View file

@ -0,0 +1,39 @@
name: last_mafswap
description: Reorder alignments in a MAF file
keywords:
- LAST
- reorder
- alignment
- MAF
tools:
- last:
description: LAST finds & aligns related regions of sequences.
homepage: https://gitlab.com/mcfrith/last
documentation: https://gitlab.com/mcfrith/last/-/blob/main/doc/
tool_dev_url: https://gitlab.com/mcfrith/last
doi: ""
licence: ['GPL-3.0-or-later']
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- maf:
type: file
description: Multiple Alignment Format (MAF) file, compressed with gzip
pattern: "*.{maf.gz}"
output:
- maf:
type: file
description: Multiple Alignment Format (MAF) file, compressed with gzip
pattern: "*.{maf.gz}"
- version:
type: file
description: File containing software version
pattern: "*.{version.txt}"
authors:
- "@charles-plessy"

View file

@ -374,6 +374,10 @@ last/lastdb:
- software/last/lastdb/** - software/last/lastdb/**
- tests/software/last/lastdb/** - tests/software/last/lastdb/**
last/mafswap:
- software/last/mafswap/**
- tests/software/last/mafswap/**
last/train: last/train:
- software/last/train/** - software/last/train/**
- tests/software/last/train/** - tests/software/last/train/**

View file

@ -27,6 +27,7 @@ params {
all_sites_fas = "${test_data_dir}/genomics/sarscov2/genome/alignment/all_sites.fas" all_sites_fas = "${test_data_dir}/genomics/sarscov2/genome/alignment/all_sites.fas"
informative_sites_fas = "${test_data_dir}/genomics/sarscov2/genome/alignment/informative_sites.fas" informative_sites_fas = "${test_data_dir}/genomics/sarscov2/genome/alignment/informative_sites.fas"
contigs_genome_maf_gz = "${test_data_dir}/genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz"
lastdb_tar_gz = "${test_data_dir}/genomics/sarscov2/genome/alignment/last/lastdb.tar.gz" lastdb_tar_gz = "${test_data_dir}/genomics/sarscov2/genome/alignment/last/lastdb.tar.gz"
} }
'illumina' { 'illumina' {

View file

@ -0,0 +1,13 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { LAST_MAFSWAP } from '../../../../software/last/mafswap/main.nf' addParams( options: [:] )
// Run LAST_MAFSWAP on the sarscov2 contigs-vs-genome MAF test alignment.
workflow test_last_mafswap {
    // meta map
    def meta = [ id:'contigs.genome' ]
    // Test alignment file; checkIfExists makes a missing file an error
    def maf = file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true)

    LAST_MAFSWAP ( [ meta, maf ] )
}

View file

@ -0,0 +1,8 @@
- name: last mafswap test_last_mafswap
command: nextflow run tests/software/last/mafswap -entry test_last_mafswap -c tests/config/nextflow.config
tags:
- last
- last/mafswap
files:
- path: output/last/contigs.genome.swapped.maf.gz
md5sum: b98c5ff297878a19f1ab4f1a5e354678