New last/train module to train alignment parameters. (#492)

* New last/train module to train alignment parameters.

The last-train command creates a parameter file that
will be used by last/lastal module for sequence alignment.
It takes indexed sequences and query sequences as input
and we use the metadata of both to create an id of the
parameter output file.

Submission of the LAST modules is discussed in more
detail in issue #464. For consistency, we use LAST
version 1219 for this whole development and will upgrade later.

* Corrected files according to the nf-core v1.14 standards.

* Fixed function.nf file for the last-train module.

* Apply suggestions from code review

Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>

* Find index name.

* Correct after the input channels were changed.

* Use double underscore as a name separator.

Single underscores can occur in ids; we therefore keep two underscores as the separator.

* Remove extra spaces.

* Fixed the passing of the "score matrix" line.

* Apply suggestions from code review

Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>

* Update software/last/train/main.nf

Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>
This commit is contained in:
aleksandrabliznina 2021-05-19 16:37:08 +09:00 committed by GitHub
parent e84eaa22f3
commit b592cea30b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 206 additions and 0 deletions

View file

@ -0,0 +1,70 @@
/*
* -----------------------------------------------------
* Utility functions used in nf-core DSL2 module files
* -----------------------------------------------------
*/
/*
 * Derive the software tool name from a process name (as given by $task.process).
 *
 * The last ':'-separated component of the process name is taken, and the part
 * of it before the first '_' is returned in lower case.
 */
def getSoftwareName(task_process) {
    def process_name = task_process.tokenize(':')[-1]   // drop any enclosing workflow scopes
    def tool_name    = process_name.tokenize('_')[0]    // keep the part before the first underscore
    return tool_name.toLowerCase()
}
/*
 * Build the Groovy Map of options available to nf-core modules, filling in
 * a default for every key that is missing (or falsy) in the supplied map.
 *
 * Note that 'publish_files' is passed through untouched so that callers can
 * still tell "not set" (null) apart from an explicit value such as false.
 */
def initOptions(Map args) {
    return [
        'args'            : args.args ?: '',
        'args2'           : args.args2 ?: '',
        'args3'           : args.args3 ?: '',
        'publish_by_meta' : args.publish_by_meta ?: [],
        'publish_dir'     : args.publish_dir ?: '',
        'publish_files'   : args.publish_files,
        'suffix'          : args.suffix ?: ''
    ]
}
/*
 * Tidy up and join elements of a list to return a path string.
 *
 * Null, empty and whitespace-only entries are dropped. Each remaining entry is
 * trimmed of surrounding whitespace and of leading/trailing slashes, and the
 * results are joined with '/'.
 */
def getPathFromList(path_list) {
    // Rely on Groovy truth (null and '' are both false) to filter entries. The safe-navigation
    // operator only protects the call it is attached to, so chaining another method call after
    // it would still throw a NullPointerException for a null entry.
    def paths = path_list.findAll { item -> item?.trim() }
    paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and leading/trailing slashes
    return paths.join('/')
}
/*
 * Function to save/publish module results.
 *
 * Intended for use as a publishDir 'saveAs' closure body: returns the relative
 * path under which 'args.filename' should be published, or null when the file
 * should not be published.
 *
 * Keys read from 'args': filename, options, publish_dir, meta, publish_by_meta.
 */
def saveFiles(Map args) {
    // Files ending in '.version.txt' are never published
    if (args.filename.endsWith('.version.txt')) {
        return null
    }

    def opts     = initOptions(args.options)
    // The publish_dir from the options takes precedence over the one passed by the caller
    def segments = [ opts.publish_dir ?: args.publish_dir ]

    if (opts.publish_by_meta) {
        // Use the list given in the options if it is a List, otherwise the caller-supplied one
        def meta_keys = opts.publish_by_meta instanceof List ? opts.publish_by_meta : args.publish_by_meta
        for (meta_key in meta_keys) {
            if (!(args.meta && meta_key instanceof String)) {
                continue
            }
            // Default to the key itself; replace it by the meta value when the key is present
            def segment = meta_key
            if (args.meta.containsKey(meta_key)) {
                def meta_value = args.meta[meta_key]
                segment = meta_value instanceof Boolean ? "${meta_key}_${meta_value}".toString() : meta_value
            }
            // Anything that is not a String contributes an empty path element
            segments.add(segment instanceof String ? segment : '')
        }
    }

    if (opts.publish_files instanceof Map) {
        // Publish only files whose name ends with one of the map keys; the first match wins
        def hit = opts.publish_files.find { entry -> args.filename.endsWith(entry.key) }
        if (hit != null) {
            return "${getPathFromList(segments + [hit.value])}/$args.filename"
        }
        return null
    }

    if (opts.publish_files == null) {
        // No filter configured: publish every file
        return "${getPathFromList(segments)}/$args.filename"
    }

    // publish_files is set to something that is neither a Map nor null: publish nothing
    return null
}

View file

@ -0,0 +1,44 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName } from './functions'
// Module options start out as an empty map; the including workflow passes its own
// via addParams( options: ... )
params.options = [:]
// Expand the supplied options with the defaults for every supported key
options = initOptions(params.options)
/*
 * Run last-train on a set of query sequences against a LAST index and write the
 * trained parameters to a '.par' file.
 *
 * Inputs:
 *   - tuple of meta map and FASTA/FASTQ file with the query sequences
 *   - directory containing the files of the LAST index
 * Outputs:
 *   - param_file: tuple of meta map and '<prefix>.<index name>.par'
 *   - version:    '<software>.version.txt'
 */
process LAST_TRAIN {
    tag "$meta.id"
    label 'process_high'
    publishDir "${params.outdir}",
        mode: params.publish_dir_mode,
        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) }

    conda (params.enable_conda ? "bioconda::last=1219" : null)
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container "https://depot.galaxyproject.org/singularity/last:1219--h2e03b76_0"
    } else {
        container "quay.io/biocontainers/last:1219--h2e03b76_0"
    }

    input:
    tuple val(meta), path(fastx)
    path index

    output:
    tuple val(meta), path("*.par"), emit: param_file
    path "*.version.txt"          , emit: version

    script:
    def software = getSoftwareName(task.process)
    def prefix   = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}"
    // The index name is recovered from the '.bck' file found inside the staged index
    // directory. The directory is referenced through ${index} everywhere, so the module does
    // not depend on it being called 'lastdb'. The '.bck' suffix is removed with an escaped,
    // end-anchored pattern so that only the file extension is stripped.
    """
    INDEX_NAME=\$(find -L ${index}/ -name "*.bck" | sed 's/\\.bck\$//' | sed 's,^${index}/,,')

    last-train \\
        $options.args \\
        -P $task.cpus \\
        ${index}/\$INDEX_NAME \\
        $fastx \\
        > ${prefix}.\$INDEX_NAME.par

    lastdb --version | sed 's/lastdb //' > ${software}.version.txt
    """
}

View file

@ -0,0 +1,48 @@
name: last_train
description: Find suitable score parameters for sequence alignment
keywords:
- LAST
- train
- fastq
- fasta
tools:
- last:
description: LAST finds & aligns related regions of sequences.
homepage: https://gitlab.com/mcfrith/last
documentation: https://gitlab.com/mcfrith/last/-/blob/main/doc/last-train.rst
tool_dev_url: https://gitlab.com/mcfrith/last
doi: ""
licence: ['GPL v3-or-later']
input:
- index:
type: directory
description: Directory containing the files of the LAST index
pattern: "lastdb/"
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- fastx:
type: file
description: FASTA/FASTQ file
pattern: "*.{fasta,fastq}"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- version:
type: file
description: File containing software version
pattern: "*.{version.txt}"
- param_file:
type: file
description: Trained parameter file
pattern: "*.par"
authors:
- "@aleksandrabliznina"

View file

@ -374,6 +374,10 @@ last/lastdb:
- software/last/lastdb/** - software/last/lastdb/**
- tests/software/last/lastdb/** - tests/software/last/lastdb/**
last/train:
- software/last/train/**
- tests/software/last/train/**
mash/sketch: mash/sketch:
- software/mash/sketch/** - software/mash/sketch/**
- tests/software/mash/sketch/** - tests/software/mash/sketch/**

View file

@ -26,6 +26,8 @@ params {
all_sites_fas = "${test_data_dir}/genomics/sarscov2/genome/alignment/all_sites.fas" all_sites_fas = "${test_data_dir}/genomics/sarscov2/genome/alignment/all_sites.fas"
informative_sites_fas = "${test_data_dir}/genomics/sarscov2/genome/alignment/informative_sites.fas" informative_sites_fas = "${test_data_dir}/genomics/sarscov2/genome/alignment/informative_sites.fas"
lastdb_tar_gz = "${test_data_dir}/genomics/sarscov2/genome/alignment/last/lastdb.tar.gz"
} }
'illumina' { 'illumina' {
test_single_end_bam = "${test_data_dir}/genomics/sarscov2/illumina/bam/test.single_end.bam" test_single_end_bam = "${test_data_dir}/genomics/sarscov2/illumina/bam/test.single_end.bam"

View file

@ -0,0 +1,15 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { UNTAR } from '../../../../software/untar/main.nf' addParams( options: [:] )
include { LAST_TRAIN } from '../../../../software/last/train/main.nf' addParams( options: [:] )
/*
 * Test workflow: unpack the archived LAST index and train alignment
 * parameters for the test contigs against it.
 */
workflow test_last_train {

    // Archive holding the LAST index used as training target
    lastdb_archive = [ file(params.test_data['sarscov2']['genome']['lastdb_tar_gz'], checkIfExists: true) ]

    // Query sequences, preceded by their meta map
    query = [
        [ id:'contigs' ],
        file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true)
    ]

    UNTAR ( lastdb_archive )
    LAST_TRAIN ( query, UNTAR.out.untar )
}

View file

@ -0,0 +1,23 @@
- name: last train test_last_train
command: nextflow run tests/software/last/train -entry test_last_train -c tests/config/nextflow.config
tags:
- last/train
- last
files:
- path: output/last/contigs.genome.par
contains:
- "score matrix"
- path: output/untar/lastdb/genome.bck
md5sum: 5519879b9b6c4d1fc508da7f17f88f2e
- path: output/untar/lastdb/genome.des
md5sum: 3a9ea6d336e113a74d7fdca5e7b623fc
- path: output/untar/lastdb/genome.prj
md5sum: 489715f14b0fea6273822696e72357f9
- path: output/untar/lastdb/genome.sds
md5sum: 2cd381f4f8a9c52cfcd323a2863eccb2
- path: output/untar/lastdb/genome.ssp
md5sum: 4137fb6fe9df2b3d78d5b960390aac7b
- path: output/untar/lastdb/genome.suf
md5sum: 1895efa8653e8e9bd3605cff0408ed33
- path: output/untar/lastdb/genome.tis
md5sum: b7c40f06b1309dc6f37849eeb86dfd22