From b592cea30baaf04440e890202ee10a69ffa6026a Mon Sep 17 00:00:00 2001
From: aleksandrabliznina <34429012+aleksandrabliznina@users.noreply.github.com>
Date: Wed, 19 May 2021 16:37:08 +0900
Subject: [PATCH] New last/train module to train alignment parameters. (#492)

* New last/train module to train alignment parameters.

The last-train command creates a parameter file that will be used by the
last/lastal module for sequence alignment. It takes indexed sequences and
query sequences as input and we use the metadata of both to create an id of
the parameter output file.
Submission of the LAST modules is discussed in more detail in issue #464.

For consistency, we use LAST version 1219 for this whole development and will
upgrade later.

* Corrected files according to the nf-core v1.14 standards.

* Fixed functions.nf file for the last-train module.

* Apply suggestions from code review

Co-authored-by: Harshil Patel

* Find index name.

* Correct after the input channels were changed.

* Use double underscore as a name separator.

Single underscores can happen in ids, therefore, we would like to keep two
underscores.

* Remove extra spaces.

* Fixed the passing of the "score matrix" line.

* Apply suggestions from code review

Co-authored-by: Harshil Patel

* Update software/last/train/main.nf

Co-authored-by: Harshil Patel
---
 software/last/train/functions.nf   | 70 ++++++++++++++++++++++++++++++
 software/last/train/main.nf        | 44 +++++++++++++++++++
 software/last/train/meta.yml       | 48 ++++++++++++++++++++
 tests/config/pytest_software.yml   |  4 ++
 tests/config/test_data.config      |  2 +
 tests/software/last/train/main.nf  | 15 ++++++
 tests/software/last/train/test.yml | 23 ++++++++++
 7 files changed, 206 insertions(+)
 create mode 100644 software/last/train/functions.nf
 create mode 100644 software/last/train/main.nf
 create mode 100644 software/last/train/meta.yml
 create mode 100644 tests/software/last/train/main.nf
 create mode 100644 tests/software/last/train/test.yml

diff --git a/software/last/train/functions.nf b/software/last/train/functions.nf
new file mode 100644
index 00000000..9d0137e3
--- /dev/null
+++ b/software/last/train/functions.nf
@@ -0,0 +1,70 @@
+/*
+ * -----------------------------------------------------
+ *  Utility functions used in nf-core DSL2 module files
+ * -----------------------------------------------------
+ */
+
+/*
+ * Extract name of software tool from process name using $task.process
+ */
+def getSoftwareName(task_process) {
+    return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()
+}
+
+/*
+ * Function to initialise default values and to generate a Groovy Map of available options for nf-core modules
+ */
+def initOptions(Map args) {
+    def Map options = [:]
+    options.args            = args.args ?: ''
+    options.args2           = args.args2 ?: ''
+    options.args3           = args.args3 ?: ''
+    options.publish_by_meta = args.publish_by_meta ?: []
+    options.publish_dir     = args.publish_dir ?: ''
+    options.publish_files   = args.publish_files
+    options.suffix          = args.suffix ?: ''
+    return options
+}
+
+/*
+ * Tidy up and join elements of a list to return a path string
+ */
+def getPathFromList(path_list) {
+    def paths = path_list.findAll { item -> !item?.trim().isEmpty() }      // Remove empty entries
+    paths     = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes
+    return paths.join('/')
+}
+
+/*
+ * Function to save/publish module results
+ */
+def saveFiles(Map args) {
+    if (!args.filename.endsWith('.version.txt')) {
+        def ioptions  = initOptions(args.options)
+        def path_list = [ ioptions.publish_dir ?: args.publish_dir ]
+        if (ioptions.publish_by_meta) {
+            def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta
+            for (key in key_list) {
+                if (args.meta && key instanceof String) {
+                    def path = key
+                    if (args.meta.containsKey(key)) {
+                        path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key]
+                    }
+                    path = path instanceof String ? path : ''
+                    path_list.add(path)
+                }
+            }
+        }
+        if (ioptions.publish_files instanceof Map) {
+            for (ext in ioptions.publish_files) {
+                if (args.filename.endsWith(ext.key)) {
+                    def ext_list = path_list.collect()
+                    ext_list.add(ext.value)
+                    return "${getPathFromList(ext_list)}/$args.filename"
+                }
+            }
+        } else if (ioptions.publish_files == null) {
+            return "${getPathFromList(path_list)}/$args.filename"
+        }
+    }
+}
diff --git a/software/last/train/main.nf b/software/last/train/main.nf
new file mode 100644
index 00000000..43c0eab7
--- /dev/null
+++ b/software/last/train/main.nf
@@ -0,0 +1,44 @@
+// Import generic module functions
+include { initOptions; saveFiles; getSoftwareName } from './functions'
+
+params.options = [:]
+options        = initOptions(params.options)
+
+process LAST_TRAIN {
+    tag "$meta.id"
+    label 'process_high'
+    publishDir "${params.outdir}",
+        mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) }
+
+    conda (params.enable_conda ? "bioconda::last=1219" : null)
+    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+        container "https://depot.galaxyproject.org/singularity/last:1219--h2e03b76_0"
+    } else {
+        container "quay.io/biocontainers/last:1219--h2e03b76_0"
+    }
+
+    input:
+    tuple val(meta), path(fastx)
+    path index
+
+    output:
+    tuple val(meta), path("*.par"), emit: param_file
+    path "*.version.txt"          , emit: version
+
+    script:
+    def software = getSoftwareName(task.process)
+    def prefix   = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}"
+    """
+    # Take the index name from the staged index directory itself, not a hard-coded 'lastdb/'
+    INDEX_NAME=\$(basename \$(ls $index/*.bck) .bck)
+    last-train \\
+        $options.args \\
+        -P $task.cpus \\
+        ${index}/\$INDEX_NAME \\
+        $fastx \\
+        > ${prefix}.\$INDEX_NAME.par
+
+    lastdb --version | sed 's/lastdb //' > ${software}.version.txt
+    """
+}
diff --git a/software/last/train/meta.yml b/software/last/train/meta.yml
new file mode 100644
index 00000000..5796b764
--- /dev/null
+++ b/software/last/train/meta.yml
@@ -0,0 +1,48 @@
+name: last_train
+description: Find suitable score parameters for sequence alignment
+keywords:
+  - LAST
+  - train
+  - fastq
+  - fasta
+tools:
+  - last:
+      description: LAST finds & aligns related regions of sequences.
+      homepage: https://gitlab.com/mcfrith/last
+      documentation: https://gitlab.com/mcfrith/last/-/blob/main/doc/last-train.rst
+      tool_dev_url: https://gitlab.com/mcfrith/last
+      doi: ""
+      licence: ['GPL v3-or-later']
+
+input:
+  - index:
+      type: directory
+      description: Directory containing the files of the LAST index
+      pattern: "lastdb/"
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fastx:
+      type: file
+      description: FASTA/FASTQ file
+      pattern: "*.{fasta,fastq}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - version:
+      type: file
+      description: File containing software version
+      pattern: "*.{version.txt}"
+  - param_file:
+      type: file
+      description: Trained parameter file
+      pattern: "*.par"
+
+authors:
+  - "@aleksandrabliznina"
diff --git a/tests/config/pytest_software.yml b/tests/config/pytest_software.yml
index e5e14467..a4a931f8 100644
--- a/tests/config/pytest_software.yml
+++ b/tests/config/pytest_software.yml
@@ -374,6 +374,10 @@ last/lastdb:
   - software/last/lastdb/**
   - tests/software/last/lastdb/**
 
+last/train:
+  - software/last/train/**
+  - tests/software/last/train/**
+
 mash/sketch:
   - software/mash/sketch/**
   - tests/software/mash/sketch/**
diff --git a/tests/config/test_data.config b/tests/config/test_data.config
index 919c33ae..7f4db28a 100644
--- a/tests/config/test_data.config
+++ b/tests/config/test_data.config
@@ -26,6 +26,8 @@ params {
 
                 all_sites_fas         = "${test_data_dir}/genomics/sarscov2/genome/alignment/all_sites.fas"
                 informative_sites_fas = "${test_data_dir}/genomics/sarscov2/genome/alignment/informative_sites.fas"
+
+                lastdb_tar_gz         = "${test_data_dir}/genomics/sarscov2/genome/alignment/last/lastdb.tar.gz"
             }
             'illumina' {
                 test_single_end_bam   = "${test_data_dir}/genomics/sarscov2/illumina/bam/test.single_end.bam"
diff --git a/tests/software/last/train/main.nf b/tests/software/last/train/main.nf
new file mode 100644
index 00000000..8449f878
--- /dev/null
+++ b/tests/software/last/train/main.nf
@@ -0,0 +1,15 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { UNTAR      } from '../../../../software/untar/main.nf'      addParams( options: [:] )
+include { LAST_TRAIN } from '../../../../software/last/train/main.nf' addParams( options: [:] )
+
+workflow test_last_train {
+
+    db    = [ file(params.test_data['sarscov2']['genome']['lastdb_tar_gz'], checkIfExists: true) ]
+    input = [ [ id:'contigs' ], // meta map
+              file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true) ]
+    UNTAR ( db )
+    LAST_TRAIN ( input, UNTAR.out.untar )
+}
diff --git a/tests/software/last/train/test.yml b/tests/software/last/train/test.yml
new file mode 100644
index 00000000..2f356c3f
--- /dev/null
+++ b/tests/software/last/train/test.yml
@@ -0,0 +1,23 @@
+- name: last train test_last_train
+  command: nextflow run tests/software/last/train -entry test_last_train -c tests/config/nextflow.config
+  tags:
+    - last/train
+    - last
+  files:
+    - path: output/last/contigs.genome.par
+      contains:
+        - "score matrix"
+    - path: output/untar/lastdb/genome.bck
+      md5sum: 5519879b9b6c4d1fc508da7f17f88f2e
+    - path: output/untar/lastdb/genome.des
+      md5sum: 3a9ea6d336e113a74d7fdca5e7b623fc
+    - path: output/untar/lastdb/genome.prj
+      md5sum: 489715f14b0fea6273822696e72357f9
+    - path: output/untar/lastdb/genome.sds
+      md5sum: 2cd381f4f8a9c52cfcd323a2863eccb2
+    - path: output/untar/lastdb/genome.ssp
+      md5sum: 4137fb6fe9df2b3d78d5b960390aac7b
+    - path: output/untar/lastdb/genome.suf
+      md5sum: 1895efa8653e8e9bd3605cff0408ed33
+    - path: output/untar/lastdb/genome.tis
+      md5sum: b7c40f06b1309dc6f37849eeb86dfd22