New module: gstama/merge (#813)

* 👌 IMPROVE: Add some pacbio test files

* 🐛 FIX: Add Pacbio index to test_data.config

* 👌 IMPROVE: Re add 10000 data test

* 👌 IMPROVE: Add some pbindex

* 🐛 FIX: Add pbi extension to files

* 📦 NEW: Add galgal6 chr30 test data

* 📦 NEW: Add bamtools module

* 👌 IMPROVE: ignore test data

* 👌 IMPROVE : add test bed files

* 📦 NEW: Add gstama/merge module

* 🐛 FIX: Change process label

* 👌 IMPROVE: do not merge empty bed

* 🐛 FIX: Change 0 lines files detection

* 🐛 FIX: replace spaces by tab

* 🐛 FIX: Remove tuple for report channel and add version output channel

* 👌 IMPROVE: Update to last templates version

* 👌 IMPROVE: Update module to last template version

* 👌 IMPROVE: Final version of test datasets config

* 👌 IMPROVE: Update test

* 👌 IMPROVE: Remove useless index + Fix Typos

* 👌 IMPROVE: Fix Typos

* 👌 IMPROVE: Updates + clean code

- Update to last versions.yml file
- Better output channels
- Update meta.yml

* 👌 IMPROVE: Correct typo

* 👌 IMPROVE: Remove included filelist creation and add an input channel

* 🐛 FIX: Correct typo

* 👌 IMPROVE: Add filelist file

* 🐛 FIX: tama_merge.py emit a version number

* Update modules/gstama/merge/meta.yml

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>

* 👌 IMPROVE: Update meta.yml

* Update main.nf

* Apply suggestions from code review

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>
Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>
This commit is contained in:
Sébastien Guizard 2021-10-23 19:00:39 +01:00 committed by GitHub
parent 481d3c811d
commit d3369789da
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 224 additions and 2 deletions

View file

@ -0,0 +1,78 @@
//
// Utility functions used in nf-core DSL2 module files
//
//
// Extract name of software tool from process name using $task.process
//
def getSoftwareName(task_process) {
    // The process name is the last ':'-separated segment of the fully qualified name
    def process_name = task_process.tokenize(':')[-1]
    // The tool name is the part before the first '_' (e.g. GSTAMA_MERGE -> gstama)
    def tool_name = process_name.tokenize('_')[0]
    return tool_name.toLowerCase()
}
//
// Extract name of module from process name using $task.process
//
def getProcessName(task_process) {
    // Split the fully qualified name on ':' and keep only the final segment
    def segments = task_process.tokenize(':')
    return segments[-1]
}
//
// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules
//
def initOptions(Map args) {
    // Build the options map in one literal. Every key falls back to an empty
    // default except publish_files, which is passed through untouched so that
    // callers can tell null (not set) apart from any other supplied value.
    return [
        'args'            : args.args ?: '',
        'args2'           : args.args2 ?: '',
        'args3'           : args.args3 ?: '',
        'publish_by_meta' : args.publish_by_meta ?: [],
        'publish_dir'     : args.publish_dir ?: '',
        'publish_files'   : args.publish_files,
        'suffix'          : args.suffix ?: ''
    ]
}
//
// Tidy up and join elements of a list to return a path string
//
def getPathFromList(path_list) {
    // Keep only entries that still hold text after trimming. Groovy truth treats
    // null and '' as false, so null, empty and whitespace-only entries are dropped.
    // The earlier form `!item?.trim().isEmpty()` threw a NullPointerException on a
    // null entry, because safe navigation does not short-circuit the rest of the chain.
    def paths = path_list.findAll { item -> item?.trim() }
    // Trim surrounding whitespace, then strip leading and trailing slashes
    paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") }
    // Join the cleaned segments into a single relative path
    return paths.join('/')
}
//
// Function to save/publish module results
//
def saveFiles(Map args) {
    // Resolve option defaults; the first path segment is the publish_dir from the
    // options if set, otherwise the publish_dir passed directly in args.
    def opts = initOptions(args.options)
    def segments = [ opts.publish_dir ?: args.publish_dir ]

    // Do not publish versions.yml unless running from pytest workflow
    def is_versions_file = args.filename.equals('versions.yml')
    if (is_versions_file && !System.getenv("NF_CORE_MODULES_TEST")) {
        return null
    }

    // Optionally append one path segment per requested meta key
    if (opts.publish_by_meta) {
        def meta_keys = opts.publish_by_meta instanceof List ? opts.publish_by_meta : args.publish_by_meta
        for (meta_key in meta_keys) {
            // Only String keys are considered, and only when a meta map was supplied
            if (!args.meta || !(meta_key instanceof String)) {
                continue
            }
            // Default to the key itself; use the meta value when the key is present
            def segment = meta_key
            if (args.meta.containsKey(meta_key)) {
                def meta_value = args.meta[meta_key]
                // Booleans become "<key>_<value>"; other values are used as they are
                segment = meta_value instanceof Boolean ? "${meta_key}_${meta_value}".toString() : meta_value
            }
            // Non-String segments are replaced by '' (dropped later by getPathFromList)
            segments.add(segment instanceof String ? segment : '')
        }
    }

    // No publish_files filter: publish every file under the assembled path
    if (opts.publish_files == null) {
        return "${getPathFromList(segments)}/$args.filename"
    }

    // Map filter: publish only files whose name ends with one of the map keys,
    // adding the mapped value as an extra trailing path segment
    if (opts.publish_files instanceof Map) {
        for (rule in opts.publish_files) {
            if (!args.filename.endsWith(rule.key)) {
                continue
            }
            def target_segments = segments.collect()
            target_segments.add(rule.value)
            return "${getPathFromList(target_segments)}/$args.filename"
        }
    }

    // No rule matched, or publish_files is neither null nor a Map: do not publish
    return null
}

View file

@ -0,0 +1,46 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions'
// Module-level default: an empty options map (the test workflow supplies its own via addParams)
params.options = [:]
// Resolve the options once through initOptions so that args and suffix always have a value
options = initOptions(params.options)
// Runs tama_merge.py on the files named in `filelist` and emits the merged BED,
// the gene/transcript reports, the merge mapping file and a versions.yml.
process GSTAMA_MERGE {
tag "$meta.id"
label 'process_medium'
// The publish path is computed per file by saveFiles, from the software name
// derived from the process name and from meta.id (publish_by_meta:['id']).
publishDir "${params.outdir}",
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) }
// The Conda package and both container images are pinned to gs-tama 1.0.2
conda (params.enable_conda ? "bioconda::gs-tama=1.0.2" : null)
// Use the Galaxy depot image under Singularity unless params.singularity_pull_docker_container is set
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/gs-tama:1.0.2--hdfd78af_0"
} else {
container "quay.io/biocontainers/gs-tama:1.0.2--hdfd78af_0"
}
input:
// NOTE(review): `bed` is declared as an input but never referenced in the command below;
// presumably the entries in `filelist` point at these staged files - confirm.
tuple val(meta), path(bed)
// File passed to tama_merge.py via -f
path filelist
output:
tuple val(meta), path("*.bed") , emit: bed
tuple val(meta), path("*_gene_report.txt") , emit: gene_report
tuple val(meta), path("*_merge.txt") , emit: merge
tuple val(meta), path("*_trans_report.txt"), emit: trans_report
path "versions.yml" , emit: versions
script:
// The output prefix (-p) is meta.id with options.suffix appended when a suffix is set.
// versions.yml stores the first line printed by `tama_merge.py -version`, keyed by the
// process name and the software name.
def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}"
"""
tama_merge.py \\
-f $filelist \\
-d merge_dup \\
-p ${prefix} \\
$options.args
cat <<-END_VERSIONS > versions.yml
${getProcessName(task.process)}:
${getSoftwareName(task.process)}: \$( tama_merge.py -version | head -n1 )
END_VERSIONS
"""
}

View file

@ -0,0 +1,60 @@
name: gstama_merge
description: Merge multiple transcriptomes while maintaining source information.
keywords:
- gstama
- gstama/merge
- long-read
- isoseq
- nanopore
- tama
- transcriptome
- annotation
tools:
- gstama:
description: Gene-Switch Transcriptome Annotation by Modular Algorithms
homepage: https://github.com/sguizard/gs-tama
documentation: https://github.com/GenomeRIK/tama/wiki
tool_dev_url: https://github.com/sguizard/gs-tama
doi: "https://doi.org/10.1186/s12864-020-07123-7"
licence: ['GPL v3 License']
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
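- bed:
type: file
description: bed12 file generated by TAMA collapse
pattern: "*.bed"
- filelist:
type: file
description: Text file listing the bed files to merge, passed to tama_merge.py with the -f option
pattern: "*.txt"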
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test' ]
- bed:
type: file
description: This is the main merged annotation file. Transcripts are coloured according to the source support for each model. Sources are numbered based on the order supplied in the input filelist file. For example the first file named in the filelist file would have its transcripts coloured in red. If a transcript has multiple sources the colour is shown as magenta.
pattern: "*.bed"
- gene_report:
type: file
description: This contains a report of the genes from the merged file. "num_clusters" refers to the number of source transcripts that were used to make this gene model. "num_final_trans" refers to the number of transcripts in the final gene model.
pattern: "*_gene_report.txt"
- merge:
type: file
description: This contains a bed12 format file which shows the coordinates of each input transcript matched to the merged transcript ID. The "txt" extension is used even though it is a bed file, to avoid confusion with the main bed file. You can use this file to map the final merged transcript models to their pre-merged supporting transcripts. The 1st subfield in the 4th column shows the final merged transcript ID while the 2nd subfield shows the pre-merged transcript ID with source prefix.
pattern: "*_merge.txt"
- trans_report:
type: file
description: This contains the source information for each merged transcript.
pattern: "*_trans_report.txt"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@sguizard"

View file

@ -494,6 +494,10 @@ gstama/collapse:
- modules/gstama/collapse/**
- tests/modules/gstama/collapse/**
gstama/merge:
- modules/gstama/merge/**
- tests/modules/gstama/merge/**
gtdbtk/classifywf:
- modules/gtdbtk/classifywf/**
- tests/modules/gtdbtk/classifywf/**

View file

@ -224,8 +224,9 @@ params {
singletons = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.bam"
aligned = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned.bam"
alignedbai = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned.bam.bai"
genemodel1 = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned_tc.bed"
genemodel2 = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned_tc.2.bed"
genemodel1 = "${test_data_dir}/genomics/homo_sapiens/pacbio/bed/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned_tc.bed"
genemodel2 = "${test_data_dir}/genomics/homo_sapiens/pacbio/bed/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned_tc.2.bed"
filelist = "${test_data_dir}/genomics/homo_sapiens/pacbio/txt/filelist.txt"
}
}
}

View file

@ -0,0 +1,19 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { GSTAMA_MERGE } from '../../../../modules/gstama/merge/main' addParams( options: [suffix:'_merged'] )
workflow test_gstama_merge {

    // Meta map attached to the BED files
    def sample_meta = [ id:'test', single_end:false ]

    // Resolve the two gene-model BED files from the test-data config, in order
    def gene_models = ['genemodel1', 'genemodel2'].collect { key ->
        file(params.test_data['homo_sapiens']['pacbio'][key], checkIfExists: true)
    }

    // File list handed to the module as its second input
    def merge_filelist = file(params.test_data['homo_sapiens']['pacbio']['filelist'], checkIfExists: true)

    GSTAMA_MERGE ( [ sample_meta, gene_models ], merge_filelist )
}

View file

@ -0,0 +1,14 @@
- name: gstama merge test_gstama_merge
command: nextflow run tests/modules/gstama/merge -entry test_gstama_merge -c tests/config/nextflow.config
tags:
- gstama
- gstama/merge
files:
- path: output/gstama/test_merged.bed
md5sum: 60ec34e1ff9655d4ce2e83d3f4bbf448
- path: output/gstama/test_merged_gene_report.txt
md5sum: 7029fd183dfd905a233403cfbe44722a
- path: output/gstama/test_merged_merge.txt
md5sum: 4279e59ed5739ce4f2f811568962893f
- path: output/gstama/test_merged_trans_report.txt
md5sum: 97d8346d9eb9da140941656c3a3325cd