From 481d3c811d07c35ddf7dbf2ee528575bbfb8254c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Guizard?= Date: Sat, 23 Oct 2021 18:55:28 +0100 Subject: [PATCH] New module: `gstama/collapse` (#809) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 👌 IMPROVE: Add some pacbio test files * 🐛 FIX: Add Pacbio index to test_data.config * 👌 IMPROVE: Re add 10000 data test * 👌 IMPROVE: Add some pbindex * 🐛 FIX: Add pbi extension to files * 📦 NEW: Add galgal6 chr30 test data * 📦 NEW: Add gd-tama module * 🐛 FIX (TEMP): Update singularity container address * 📦 NEW: Add bamtools module * 📦 NEW: Rewrite and rename module (gstama => gstama/collapse) * 👌 IMPROVE: ignore test data * 👌 IMPROVE: Remove junk files * 👌 IMPROVE: Update output * 👌 IMPROVE: Add channel for publishing tama's metadata outputs * 👌 IMPROVE: Update process label * 🐛 FIX: Use depot.galaxyproject.org url for singularity * 👌 IMPROVE: autoselect running mode * 🐛 FIX: correct gstama collapse bash test * 👌 IMPROVE: Update to last templates version * 👌 IMPROVE: Update tama package and label * 👌 IMPROVE: Final version of test datasets config * 👌 IMPROVE: Remove useless index + Fix Typos * 👌 IMPROVE: Update test * 👌 IMPROVE: Add some pacbio test files * 🐛 FIX: Add Pacbio index to test_data.config * 👌 IMPROVE: Re add 10000 data test * 👌 IMPROVE: Add some pbindex * 🐛 FIX: Add pbi extension to files * 📦 NEW: Add galgal6 chr30 test data * 📦 NEW: Add gd-tama module * 🐛 FIX (TEMP): Update singularity container address * 📦 NEW: Add bamtools module * 📦 NEW: Rewrite and rename module (gstama => gstama/collapse) * 👌 IMPROVE: ignore test data * 👌 IMPROVE: Update output * 👌 IMPROVE: Add channel for publishing tama's metadata outputs * 👌 IMPROVE: Update process label * 🐛 FIX: Use depot.galaxyproject.org url for singularity * 👌 IMPROVE: autoselect running mode * 🐛 FIX: correct gstama collapse bash test * 👌 IMPROVE: Update to last templates version * 👌 IMPROVE: Update tama package and 
label * 👌 IMPROVE: Final version of test datasets config * 👌 IMPROVE: Remove useless index + Fix Typos * 👌 IMPROVE: Update test * 👌 IMPROVE: delete unnecessary files * 👌 IMPROVE: Update + clean - Remove unnecessary files - Update to new versions.yml file - Better output channels * 👌 IMPROVE: Update meta.yml and output channels * 👌 IMPROVE: Remove useless files * 👌 IMPROVE: Remove automatic MODE setup * 👌 IMPROVE: Applied @jfy133 code modification suggestions * Update modules/gstama/collapse/meta.yml Co-authored-by: James A. Fellows Yates * 🐛 FIX: Add missing fasta option in meta.yml * 🐛 FIX: Fix typo * 🐛 FIX: Update package version * Update main.nf * Update meta.yml * Update modules/gstama/collapse/meta.yml * Apply suggestions from code review * Update tests/modules/gstama/collapse/main.nf * Update main.nf Co-authored-by: James A. Fellows Yates Co-authored-by: Harshil Patel --- modules/gstama/collapse/functions.nf | 78 ++++++++++++++++++++++++ modules/gstama/collapse/main.nf | 52 ++++++++++++++++ modules/gstama/collapse/meta.yml | 83 ++++++++++++++++++++++++++ tests/config/pytest_modules.yml | 4 ++ tests/modules/gstama/collapse/main.nf | 16 +++++ tests/modules/gstama/collapse/test.yml | 22 +++++++ 6 files changed, 255 insertions(+) create mode 100644 modules/gstama/collapse/functions.nf create mode 100644 modules/gstama/collapse/main.nf create mode 100644 modules/gstama/collapse/meta.yml create mode 100644 tests/modules/gstama/collapse/main.nf create mode 100644 tests/modules/gstama/collapse/test.yml diff --git a/modules/gstama/collapse/functions.nf b/modules/gstama/collapse/functions.nf new file mode 100644 index 00000000..85628ee0 --- /dev/null +++ b/modules/gstama/collapse/functions.nf @@ -0,0 +1,78 @@ +// +// Utility functions used in nf-core DSL2 module files +// + +// +// Extract name of software tool from process name using $task.process +// +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +// 
+// Extract name of module from process name using $task.process +// +def getProcessName(task_process) { + return task_process.tokenize(':')[-1] +} + +// +// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules +// +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.args3 = args.args3 ?: '' + options.publish_by_meta = args.publish_by_meta ?: [] + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +// +// Tidy up and join elements of a list to return a path string +// +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +// +// Function to save/publish module results +// +def saveFiles(Map args) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + + // Do not publish versions.yml unless running from pytest workflow + if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { + return null + } + if (ioptions.publish_by_meta) { + def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta + for (key in key_list) { + if (args.meta && key instanceof String) { + def path = key + if (args.meta.containsKey(key)) { + path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] + } + path = path instanceof String ? 
path : '' + path_list.add(path) + } + } + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } +} diff --git a/modules/gstama/collapse/main.nf b/modules/gstama/collapse/main.nf new file mode 100644 index 00000000..d4167b5e --- /dev/null +++ b/modules/gstama/collapse/main.nf @@ -0,0 +1,52 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process GSTAMA_COLLAPSE { + tag "$meta.id" + label 'process_medium' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? 
"bioconda::gs-tama=1.0.2" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/gs-tama:1.0.2--hdfd78af_0" + } else { + container "quay.io/biocontainers/gs-tama:1.0.2--hdfd78af_0" + } + + input: + tuple val(meta), path(bam) + path fasta + + output: + tuple val(meta), path("*.bed") , emit: bed + tuple val(meta), path("*_trans_read.bed") , emit: bed_trans_reads + tuple val(meta), path("*_local_density_error.txt"), emit: local_density_error + tuple val(meta), path("*_polya.txt") , emit: polya + tuple val(meta), path("*_read.txt") , emit: read + tuple val(meta), path("*_strand_check.txt") , emit: strand_check + tuple val(meta), path("*_trans_report.txt") , emit: trans_report + path "versions.yml" , emit: versions + + tuple val(meta), path("*_varcov.txt") , emit: varcov , optional: true + tuple val(meta), path("*_variants.txt") , emit: variants, optional: true + + script: + def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" + """ + tama_collapse.py \\ + -s $bam \\ + -f $fasta \\ + -p ${prefix} \\ + $options.args + + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + ${getSoftwareName(task.process)}: \$( tama_collapse.py -version | grep 'tc_version_date_'|sed 's/tc_version_date_//g' ) + END_VERSIONS + """ +} diff --git a/modules/gstama/collapse/meta.yml b/modules/gstama/collapse/meta.yml new file mode 100644 index 00000000..0b26191f --- /dev/null +++ b/modules/gstama/collapse/meta.yml @@ -0,0 +1,83 @@ +name: GSTAMA_COLLAPSE +description: Collapse redundant transcript models in Iso-Seq data. 
+keywords: + - tama_collapse.py + - isoseq + - nanopore + - long-read + - transcriptome + - gene model + - TAMA +tools: + - tama_collapse.py: + description: Collapse similar gene models + homepage: https://github.com/sguizard/gs-tama + documentation: https://github.com/GenomeRIK/tama/wiki + tool_dev_url: https://github.com/sguizard/gs-tama + doi: 10.1186/s12864-020-07123-7 + licence: GNU GPL3 + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - bam: + type: file + description: A sorted BAM or SAM file of aligned reads + pattern: "*.{bam,sam}" + - fasta: + type: file + description: A FASTA file of the genome used for the mapping + pattern: "*.{fasta,fa}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bed: + type: file + description: A BED12 format file containing the final collapsed version of your transcriptome + pattern: "*.bed" + - bed_trans_reads: + type: file + description: This file uses BED12 format to show the transcript model for each read based on the mapping prior to collapsing. This only contains the reads which were accepted according to the defined thresholds. You can use this file to see if there were any strange occurrences during collapsing. It also contains the relationships between reads and collapsed transcript models. The 1st subfield in the 4th column shows the final transcript ID and the 2nd subfield in the 4th column shows the read ID. If you used no_cap mode for collapsing there may be multiple lines for a single read. This happens when a 5' degraded read can match multiple 5' longer transcript models. 
+ pattern: "*_trans_read.bed" + - local_density_error: + type: file + description: This file contains the log of filtering for local density error around the splice junctions ("-lde"). + pattern: "*_local_density_error.txt" + - polya: + type: file + description: This file contains the reads with potential poly(A) truncation. + pattern: "*_polya.txt" + - read: + type: file + description: This file contains information for all mapped reads from the input SAM/BAM file. It shows both accepted and discarded reads and should match the number of mapped reads in your SAM/BAM file. + pattern: "*_read.txt" + - strand_check: + type: file + description: This file shows instances where the SAM flag strand information contradicts the GMAP strand information. + pattern: "*_strand_check.txt" + - trans_report: + type: file + description: This file contains collapsing information for each transcript. + pattern: "*_trans_report.txt" + - varcov: + type: file + description: This file contains the coverage information for each variant detected. + pattern: "*_varcov.txt" + - variants: + type: file + description: This file contains the variants called. Variants are only called if 5 or more reads show the variant at a specific locus. If you would like to change the threshold, please open an issue about this in the GitHub repo. 
+ pattern: "*_variants.txt" + +authors: + - "@sguizard" diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml index dfa00bd0..d1a8e7f4 100644 --- a/tests/config/pytest_modules.yml +++ b/tests/config/pytest_modules.yml @@ -490,6 +490,10 @@ graphmap2/index: - modules/graphmap2/index/** - tests/modules/graphmap2/index/** +gstama/collapse: + - modules/gstama/collapse/** + - tests/modules/gstama/collapse/** + gtdbtk/classifywf: - modules/gtdbtk/classifywf/** - tests/modules/gtdbtk/classifywf/** diff --git a/tests/modules/gstama/collapse/main.nf b/tests/modules/gstama/collapse/main.nf new file mode 100644 index 00000000..70b3c741 --- /dev/null +++ b/tests/modules/gstama/collapse/main.nf @@ -0,0 +1,16 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +include { GSTAMA_COLLAPSE } from '../../../../modules/gstama/collapse/main.nf' addParams( options: [ args:"-x capped -b BAM", suffix:'_tc' ] ) + +workflow test_gstama_collapse { + + input = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['pacbio']['aligned'], checkIfExists: true) + ] + genome = file(params.test_data['homo_sapiens']['genome']['genome2_fasta'], checkIfExists: true) + + GSTAMA_COLLAPSE ( input, genome ) +} diff --git a/tests/modules/gstama/collapse/test.yml b/tests/modules/gstama/collapse/test.yml new file mode 100644 index 00000000..98de6bb3 --- /dev/null +++ b/tests/modules/gstama/collapse/test.yml @@ -0,0 +1,22 @@ +- name: gstama collapse test_gstama_collapse + command: nextflow run tests/modules/gstama/collapse -entry test_gstama_collapse -c tests/config/nextflow.config + tags: + - gstama + - gstama/collapse + files: + - path: output/gstama/test_tc.bed + md5sum: e5105198ed970a33ae0ecaa7bff421d9 + - path: output/gstama/test_tc_local_density_error.txt + md5sum: b917ac1f14eccd590b6881a686f324d5 + - path: output/gstama/test_tc_polya.txt + md5sum: 628ea62b918fc4f31e109f724d714a66 + - path: output/gstama/test_tc_read.txt + md5sum: 
d2685d7f24cd1611e0770a5ce25422fe + - path: output/gstama/test_tc_strand_check.txt + md5sum: 42cc52b2660b1e0b84e1c9ab37a965ec + - path: output/gstama/test_tc_trans_read.bed + md5sum: 0ca1a32f33ef05242d897d913802554b + - path: output/gstama/test_tc_trans_report.txt + md5sum: 33a86c15ca2acce36b2a5962f4c1adc4 + - path: output/gstama/test_tc_variants.txt + md5sum: 5b1165e9f33faba4f7207013fc27257e