New module: gstama/merge (#813)

* 👌 IMPROVE: Add some pacbio test files * 🐛 FIX: Add Pacbio index to test_data.config * 👌 IMPROVE: Re add 10000 data test * 👌 IMPROVE: Add some pbindex * 🐛 FIX: Add pbi extension to files * 📦 NEW: Add galgal6 chr30 test data * 📦 NEW: Add bamtools module * 👌 IMPROVE: ignore test data * 👌 IMPROVE : add test bed files * 📦 NEW: Add gstama/merge module * 🐛 FIX: Change process label * 👌 IMPROVE: do not merge empty bed * 🐛 FIX: Change 0 lines files detection * 🐛 FIX: replace spaces by tab * 🐛 FIX: Remove tuple for report channel and add version output channel * 👌 IMPROVE: Update to last templates version * 👌 IMPROVE: Update module to last template version * 👌 IMPROVE: Final version of test datasets config * 👌 IMPROVE: Update test * 👌 IMPROVE: Remove useless index + Fix Typos * 👌 IMPROVE: Fix Typos * 👌 IMPROVE: Updates + clean code - Update to last versions.yml file - Better output channels - Update meta.yml * 👌 IMPROVE: Correct typo * 👌 IMPROVE: Remove included filelist creation and add an input channel * 🐛 FIX: Correct typo * 👌 IMPROVE: Add filelist file * 🐛 FIX: tama_merge.py emit a version number * Update modules/gstama/merge/meta.yml Co-authored-by: James A. Fellows Yates <jfy133@gmail.com> * 👌 IMPROVE: Update meta.yml * Update main.nf * Apply suggestions from code review Co-authored-by: James A. Fellows Yates <jfy133@gmail.com> Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>
2024-12-22 02:58:17 +00:00 · 2021-10-23 19:00:39 +01:00 · 2021-10-23 19:00:39 +01:00 · d3369789da
commit d3369789da
parent 481d3c811d
7 changed files with 224 additions and 2 deletions
--- a/modules/gstama/merge/functions.nf
+++ b/modules/gstama/merge/functions.nf
@ -0,0 +1,78 @@
+//
+//  Utility functions used in nf-core DSL2 module files
+//
+
+// Derive the lower-cased software tool name from $task.process: the first '_'-separated token
+// of the last ':'-separated component, e.g. 'NFCORE:GSTAMA_MERGE' -> 'gstama'
+def getSoftwareName(task_process) {
+    def process_name = task_process.tokenize(':')[-1]
+    return process_name.tokenize('_')[0].toLowerCase()
+}
+
+// Return the last ':'-separated component of $task.process,
+// e.g. 'NFCORE:GSTAMA_MERGE' -> 'GSTAMA_MERGE'
+def getProcessName(task_process) {
+    def components = task_process.tokenize(':')
+    return components[-1]
+}
+
+//
+// Build the Groovy Map of nf-core module options, substituting a default for every key that is missing or falsy in args
+//
+def initOptions(Map args) {
+    return [
+        args            : args.args ?: '',
+        args2           : args.args2 ?: '',
+        args3           : args.args3 ?: '',
+        publish_by_meta : args.publish_by_meta ?: [],
+        publish_dir     : args.publish_dir ?: '',
+        publish_files   : args.publish_files,        // copied as-is (no default), so it may be null
+        suffix          : args.suffix ?: ''
+    ]
+}
+
+//
+// Tidy up and join elements of a list to return a path string (null, empty and blank entries are skipped)
+//
+def getPathFromList(path_list) {
+    def paths = path_list.findAll { item -> item?.trim() }                 // Groovy truth drops null, '' and whitespace-only entries without an NPE
+    paths     = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace plus leading and trailing slashes
+    return paths.join('/')
+}
+
+//
+// Compute the publish path of a module result file; returning null means the file is not published
+//
+def saveFiles(Map args) {
+    def opts     = initOptions(args.options)
+    def segments = [ opts.publish_dir ?: args.publish_dir ]
+
+    // versions.yml is only published when the NF_CORE_MODULES_TEST environment variable is set
+    def is_versions = args.filename.equals('versions.yml')
+    if (is_versions && !System.getenv("NF_CORE_MODULES_TEST")) {
+        return null
+    }
+
+    // Optionally append one path segment per meta key (only when options.publish_by_meta is set)
+    if (opts.publish_by_meta) {
+        def meta_keys = opts.publish_by_meta instanceof List ? opts.publish_by_meta : args.publish_by_meta
+        for (key in meta_keys) {
+            if (!(args.meta && key instanceof String)) {
+                continue
+            }
+            // Boolean meta values become "<key>_<value>"; keys absent from meta are used literally
+            def value   = args.meta.containsKey(key) ? args.meta[key] : key
+            def segment = value instanceof Boolean ? "${key}_${value}".toString() : value
+            segments << (segment instanceof String ? segment : '')
+        }
+    }
+    // publish_files == null keeps every file; a Map keeps only files whose name ends with one of its keys
+    if (opts.publish_files == null) {
+        return "${getPathFromList(segments)}/$args.filename"
+    }
+    if (!(opts.publish_files instanceof Map)) {
+        return null
+    }
+    def match = opts.publish_files.find { ext -> args.filename.endsWith(ext.key) }
+    return match != null ? "${getPathFromList(segments + [match.value])}/$args.filename" : null
+}
--- a/modules/gstama/merge/main.nf
+++ b/modules/gstama/merge/main.nf
@ -0,0 +1,46 @@
+// Import generic module functions
+include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions'
+
+params.options = [:]                          // module options start as an empty map
+options        = initOptions(params.options)  // defaults filled in; the process script reads options.suffix and options.args
+
+process GSTAMA_MERGE { // Runs tama_merge.py on the given filelist and emits the merged bed, its reports and versions.yml
+    tag "$meta.id"
+    label 'process_medium' // NOTE(review): the resources behind this label are defined outside this file
+    publishDir "${params.outdir}",
+        mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } // saveFiles returns the path below outdir, or null
+
+    conda (params.enable_conda ? "bioconda::gs-tama=1.0.2" : null)
+    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { // Singularity image, unless params.singularity_pull_docker_container is set
+        container "https://depot.galaxyproject.org/singularity/gs-tama:1.0.2--hdfd78af_0"
+    } else {
+        container "quay.io/biocontainers/gs-tama:1.0.2--hdfd78af_0"
+    }
+
+    input:
+    tuple val(meta), path(bed) // bed is not referenced in the script; presumably filelist names these files - TODO confirm
+    path filelist              // handed to tama_merge.py through -f
+
+    output:
+    tuple val(meta), path("*.bed")             , emit: bed
+    tuple val(meta), path("*_gene_report.txt") , emit: gene_report
+    tuple val(meta), path("*_merge.txt")       , emit: merge
+    tuple val(meta), path("*_trans_report.txt"), emit: trans_report
+    path "versions.yml"                        , emit: versions
+
+    script:
+    def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" // passed as -p: meta.id, plus options.suffix when set
+    """
+    tama_merge.py \\
+        -f $filelist \\
+        -d merge_dup \\
+        -p ${prefix} \\
+        $options.args
+
+    cat <<-END_VERSIONS > versions.yml
+    ${getProcessName(task.process)}:
+        ${getSoftwareName(task.process)}: \$( tama_merge.py -version | head -n1 )
+    END_VERSIONS
+    """
+}
--- a/modules/gstama/merge/meta.yml
+++ b/modules/gstama/merge/meta.yml
@ -0,0 +1,60 @@
+name: gstama_merge
+description: Merge multiple transcriptomes while maintaining source information.
+keywords:
+  - gstama
+  - gstama/merge
+  - long-read
+  - isoseq
+  - nanopore
+  - tama
+  - transcriptome
+  - annotation
+tools:
+  - gstama:
+      description: Gene-Switch Transcriptome Annotation by Modular Algorithms
+      homepage: https://github.com/sguizard/gs-tama
+      documentation: https://github.com/GenomeRIK/tama/wiki
+      tool_dev_url: https://github.com/sguizard/gs-tama
+      doi: "https://doi.org/10.1186/s12864-020-07123-7"
+      licence: ['GPL v3 License']
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bed:
+      type: file
+      description: bed12 file generated by TAMA collapse
+      pattern: "*.bed"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test' ]
+  - bed:
+      type: file
+      description: This is the main merged annotation file. Transcripts are coloured according to the source support for each model. Sources are numbered based on the order supplied in the input filelist file. For example the first file named in the filelist file would have its transcripts coloured in red. If a transcript has multiple sources the colour is shown as magenta.
+      pattern: "*.bed"
+  - gene_report:
+      type: file
+      description: This contains a report of the genes from the merged file. "num_clusters" refers to the number of source transcripts that were used to make this gene model. "num_final_trans" refers to the number of transcripts in the final gene model.
+      pattern: "*_gene_report.txt"
+  - merge:
+      type: file
+      description: A bed12 format file showing the coordinates of each input transcript matched to the merged transcript ID. The "txt" extension is used, even though it is a bed file, to avoid confusion with the main bed file. Use this file to map the final merged transcript models to their pre-merged supporting transcripts. The 1st subfield in the 4th column shows the final merged transcript ID while the 2nd subfield shows the pre-merged transcript ID with source prefix.
+      pattern: "*_merge.txt"
+  - trans_report:
+      type: file
+      description: This contains the source information for each merged transcript.
+      pattern: "*_trans_report.txt"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@sguizard"
--- a/tests/config/pytest_modules.yml
+++ b/tests/config/pytest_modules.yml
@ -494,6 +494,10 @@ gstama/collapse:
  - modules/gstama/collapse/**
  - tests/modules/gstama/collapse/**

+gstama/merge:
+  - modules/gstama/merge/**
+  - tests/modules/gstama/merge/**
+
 gtdbtk/classifywf:
  - modules/gtdbtk/classifywf/**
  - tests/modules/gtdbtk/classifywf/**
--- a/tests/config/test_data.config
+++ b/tests/config/test_data.config
@ -224,8 +224,9 @@ params {
                singletons                                    = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.bam"
                aligned                                       = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned.bam"
                alignedbai                                    = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned.bam.bai"
-                genemodel1                                    = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned_tc.bed"
-                genemodel2                                    = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned_tc.2.bed"
+                genemodel1                                    = "${test_data_dir}/genomics/homo_sapiens/pacbio/bed/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned_tc.bed"
+                genemodel2                                    = "${test_data_dir}/genomics/homo_sapiens/pacbio/bed/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned_tc.2.bed"
+                filelist                                      = "${test_data_dir}/genomics/homo_sapiens/pacbio/txt/filelist.txt"
            }
        }
    }
--- a/tests/modules/gstama/merge/main.nf
+++ b/tests/modules/gstama/merge/main.nf
@ -0,0 +1,19 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { GSTAMA_MERGE } from '../../../../modules/gstama/merge/main' addParams( options: [suffix:'_merged'] )
+
+workflow test_gstama_merge {
+
+    // PacBio test-data entries used below: the genemodel1/genemodel2 bed files and the filelist
+    pacbio = params.test_data['homo_sapiens']['pacbio']
+
+    bed_input = [
+        [ id:'test', single_end:false ], // meta map
+        [ file(pacbio['genemodel1'], checkIfExists: true), file(pacbio['genemodel2'], checkIfExists: true) ]
+    ]
+    filelist_txt = file(pacbio['filelist'], checkIfExists: true)
+
+    GSTAMA_MERGE ( bed_input, filelist_txt )
+}
--- a/tests/modules/gstama/merge/test.yml
+++ b/tests/modules/gstama/merge/test.yml
@ -0,0 +1,14 @@
+- name: gstama merge test_gstama_merge
+  command: nextflow run tests/modules/gstama/merge -entry test_gstama_merge -c tests/config/nextflow.config
+  tags:
+    - gstama
+    - gstama/merge
+  files:
+    - path: output/gstama/test_merged.bed
+      md5sum: 60ec34e1ff9655d4ce2e83d3f4bbf448
+    - path: output/gstama/test_merged_gene_report.txt
+      md5sum: 7029fd183dfd905a233403cfbe44722a
+    - path: output/gstama/test_merged_merge.txt
+      md5sum: 4279e59ed5739ce4f2f811568962893f
+    - path: output/gstama/test_merged_trans_report.txt
+      md5sum: 97d8346d9eb9da140941656c3a3325cd