Merge pull request #177 from MaxUlysse/master_gatk_createsequencedictionary

Add gatk/createsequencedictionary
2024-09-21 07:42:05 +00:00 · 2021-02-16 22:21:38 +00:00 · 2021-02-16 22:21:38 +00:00 · d6d34a4b5b
commit d6d34a4b5b
parent e558d5fb57 a6d6c0b973
14 changed files with 176 additions and 119 deletions
--- a/.github/filters.yml
+++ b/.github/filters.yml
@ -128,6 +128,10 @@ fastqc:
  - software/fastqc/**
  - tests/software/fastqc/**

+gatk_createsequencedictionary:
+  - software/gatk/createsequencedictionary/**
+  - tests/software/gatk/createsequencedictionary/**
+
 gffread:
  - software/gffread/**
  - tests/software/gffread/**
--- a/deprecated/bwa-mem2/index/main.nf
+++ b/deprecated/bwa-mem2/index/main.nf
@ -1,16 +0,0 @@
-process bwa-mem2_index {
-    tag {fasta}
-
-    container 'quay.io/biocontainers/bwa-mem2:2.0--he513fc3_0'
-
-    input:
-        path(fasta)
-
-    output:
-        path("${fasta}.*")
-
-    script:
-    """
-    bwa-mem2 index ${fasta}
-    """
-}
--- a/deprecated/bwa-mem2/index/meta.yml
+++ b/deprecated/bwa-mem2/index/meta.yml
@ -1,26 +0,0 @@
-name: bwa-mem2 index
-description: create indexes for BWA from a fasta file
-keywords:
-    - index
-tools:
-    - bwa:
-        description: |
-            Bwa-mem2 is the next version of the bwa-mem algorithm in bwa.
-            It produces alignment identical to bwa and is ~1.3-3.1x faster depending on the use-case, dataset and the running machine.
-        homepage: https://github.com/bwa-mem2/bwa-mem2
-        documentation: https://github.com/bwa-mem2/bwa-mem2
-        arxiv: arXiv:1303.3997
-input:
-    -
-        - input:
-            type: file
-            description: Input fasta file
-            pattern: "*.{fasta,fa}"
-output:
-    -
-        - index:
-            type: file
-            description: bwa indexes file
-            pattern: "*.{fasta,fa}.{amb,ann,bwt,pac,sa}"
-authors:
-    - "@maxulysse"
--- a/deprecated/bwa-mem2/index/test/main.nf
+++ b/deprecated/bwa-mem2/index/test/main.nf
@ -1,16 +0,0 @@
-#!/usr/bin/env nextflow
-nextflow.preview.dsl = 2
-include '../../../nf-core/module_testing/check_process_outputs.nf' params(params)
-include '../main.nf' params(params)
-
-// Define input channels
-input = '../../../test-datasets/tools/bwa/index/input/reference.fasta'
-Channel
-  .from(input)
-  .set { ch_input }
-
-// Run the workflow
-workflow {
-    fastqc(ch_input)
-    // .check_output()
-}
--- a/deprecated/bwa-mem2/index/test/nextflow.config
+++ b/deprecated/bwa-mem2/index/test/nextflow.config
@ -1,2 +0,0 @@
-docker.enabled = true
-params.outdir = './results'
--- a/deprecated/gatk/dict/main.nf
+++ b/deprecated/gatk/dict/main.nf
@ -1,19 +0,0 @@
-process gatk_dict {
-    tag {fasta}
-
-    container 'quay.io/biocontainers/gatk4-spark:4.1.4.1--1'
-
-    input:
-        path fasta
-
-    output:
-        path "${fasta.baseName}.dict"
-
-    script:
-    """
-    gatk --java-options "-Xmx${task.memory.giga}g" \
-        CreateSequenceDictionary \
-        --REFERENCE ${fasta} \
-        --OUTPUT ${fasta.baseName}.dict
-    """
-}
--- a/deprecated/gatk/dict/meta.yml
+++ b/deprecated/gatk/dict/meta.yml
@ -1,25 +0,0 @@
-name: gatk dict
-description: create a dictionary file from a fasta file
-keywords:
-    - dictionary
-tools:
-    - gatk:
-        description: |
-            The GATK toolkit offers a wide variety of tools with a primary focus on variant discovery and genotyping, developed in the Data Sciences Platform at the Broad Institute.
-        homepage: https://gatk.broadinstitute.org/hc/en-us
-        documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s
-        doi: 10.1158/1538-7445.AM2017-3590
-input:
-    -
-        - input:
-            type: file
-            description: Input fasta file
-            pattern: "*.{fasta,fa}"
-output:
-    -
-        - dict:
-            type: file
-            description: gatk dictionary file
-            pattern: "*.{fasta,fa}.{dict}"
-authors:
-    - "@maxulysse"
--- a/deprecated/gatk/dict/test/main.nf
+++ b/deprecated/gatk/dict/test/main.nf
@ -1,13 +0,0 @@
-#!/usr/bin/env nextflow
-nextflow.preview.dsl = 2
-include '../../../tests/functions/check_process_outputs.nf' params(params)
-include '../main.nf' params(params)
-
-// Define input channels
-input = '../../../test-datasets/tools/bwa/index/input/reference.fasta'
-
-// Run the workflow
-workflow {
-    gatk_dict(input)
-    // .check_output()
-}
--- a/deprecated/gatk/dict/test/nextflow.config
+++ b/deprecated/gatk/dict/test/nextflow.config
@ -1,2 +0,0 @@
-docker.enabled = true
-params.outdir = './results'
--- a/software/gatk/createsequencedictionary/functions.nf
+++ b/software/gatk/createsequencedictionary/functions.nf
@ -0,0 +1,59 @@
+/*
+ * -----------------------------------------------------
+ *  Utility functions used in nf-core DSL2 module files
+ * -----------------------------------------------------
+ */
+
+/*
+ * Derive the lower-case tool name from a process name as found in $task.process:
+ * take the last ':'-separated element, then the part before its first '_'
+ * (e.g. 'WF:GATK_CREATESEQUENCEDICTIONARY' gives 'gatk')
+ */
+def getSoftwareName(task_process) {
+    def process_name = task_process.tokenize(':')[-1]  // last element after any ':' separators
+    def tool_name    = process_name.tokenize('_')[0]   // text before the first underscore
+    return tool_name.toLowerCase()
+}
+
+/*
+ * Build the Groovy Map of module options, substituting a default for every
+ * entry that is missing or falsy in the supplied map (publish_files is copied
+ * through unchanged, so it stays null when not provided)
+ */
+def initOptions(Map args) {
+    return [
+        args          : args.args ?: '',
+        args2         : args.args2 ?: '',
+        publish_by_id : args.publish_by_id ?: false,
+        publish_dir   : args.publish_dir ?: '',
+        publish_files : args.publish_files,
+        suffix        : args.suffix ?: ''
+    ]
+}
+
+/*
+ * Tidy up and join elements of a list to return a path string.
+ * Null, empty and whitespace-only entries are dropped; each remaining entry is
+ * trimmed and stripped of leading and trailing slashes before joining with '/'.
+ */
+def getPathFromList(path_list) {
+    // Groovy truth: null and '' are both false, so a null entry is skipped
+    // instead of hitting `.isEmpty()` on null
+    def paths = path_list.findAll { item -> item?.trim() }
+    paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace plus leading/trailing slashes
+    return paths.join('/')
+}
+
+/*
+ * Work out where a module result file should be published.
+ * Returns the relative publish path for args.filename, or null when the file
+ * should not be published (version files, or no matching publish_files entry).
+ */
+def saveFiles(Map args) {
+    // Version files are never published
+    if (args.filename.endsWith('.version.txt')) {
+        return null
+    }
+
+    def ioptions  = initOptions(args.options)
+    def base_dirs = [ ioptions.publish_dir ?: args.publish_dir ]
+    if (ioptions.publish_by_id) {
+        base_dirs.add(args.publish_id)
+    }
+
+    // No publish_files given: publish every file directly under the base directory
+    if (ioptions.publish_files == null) {
+        return "${getPathFromList(base_dirs)}/$args.filename"
+    }
+
+    // publish_files as a Map: the first entry whose key the filename ends with
+    // supplies an extra sub-directory (its value)
+    if (ioptions.publish_files instanceof Map) {
+        def match = ioptions.publish_files.find { ext -> args.filename.endsWith(ext.key) }
+        if (match != null) {
+            return "${getPathFromList(base_dirs + [match.value])}/$args.filename"
+        }
+    }
+    return null
+}
--- a/software/gatk/createsequencedictionary/main.nf
+++ b/software/gatk/createsequencedictionary/main.nf
@ -0,0 +1,45 @@
+// Import generic module functions
+include { initOptions; saveFiles; getSoftwareName } from './functions'
+
+// Default to an empty options map; an including script can supply its own via addParams( options: ... )
+params.options = [:]
+// Normalised copy of the options with defaults filled in by initOptions()
+def options    = initOptions(params.options)
+
+// Runs `gatk CreateSequenceDictionary` on a reference FASTA and emits the resulting *.dict file
+process GATK_CREATESEQUENCEDICTIONARY {
+    tag "$fasta"
+    label 'process_medium'
+    // Publish below params.outdir; saveFiles() decides the sub-path per file
+    // and returns nothing for *.version.txt files
+    publishDir "${params.outdir}",
+        mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') }
+
+    // Software environment: Conda package when params.enable_conda is set, otherwise a container.
+    // The Singularity image is used only when running under Singularity and
+    // params.singularity_pull_docker_container is not set; all other cases use the Docker image.
+    conda (params.enable_conda ? "bioconda::gatk4=4.1.9.0" : null)
+    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+        container "https://depot.galaxyproject.org/singularity/gatk4:4.1.9.0--py39_0"
+    } else {
+        container "quay.io/biocontainers/gatk4:4.1.9.0--py39_0"
+    }
+
+    input:
+    path fasta
+
+    output:
+    path "*.dict"        , emit: dict
+    path "*.version.txt" , emit: version
+
+    script:
+    def software = getSoftwareName(task.process)
+    // Java heap size in GB passed as -Xmx: task.memory when it is set, otherwise 6
+    def avail_mem = 6
+    if (!task.memory) {
+        log.info '[GATK] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = task.memory.giga
+    }
+    // First command creates the dictionary (extra arguments come from options.args);
+    // second command writes the version string extracted by sed to <software>.version.txt
+    """
+    gatk --java-options "-Xmx${avail_mem}g" \\
+        CreateSequenceDictionary \\
+        --REFERENCE $fasta \\
+        --URI $fasta \\
+        $options.args
+
+    echo \$(gatk CreateSequenceDictionary --version 2>&1) | sed 's/^.*(GATK) v//; s/ HTSJDK.*\$//' > ${software}.version.txt
+    """
+}
--- a/software/gatk/createsequencedictionary/meta.yml
+++ b/software/gatk/createsequencedictionary/meta.yml
@ -0,0 +1,51 @@
+name: gatk_createsequencedictionary
+description: Creates a sequence dictionary for a reference sequence
+keywords:
+    - dictionary
+    - fasta
+tools:
+    - gatk:
+        description: |
+            Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
+            with a primary focus on variant discovery and genotyping. Its powerful processing engine
+            and high-performance computing features make it capable of taking on projects of any size.
+        homepage: https://gatk.broadinstitute.org/hc/en-us
+        documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s
+        doi: 10.1158/1538-7445.AM2017-3590
+params:
+    - outdir:
+        type: string
+        description: |
+            The pipeline's output directory. By default, the module will
+            output files into `$params.outdir/<SOFTWARE>`
+    - publish_dir_mode:
+        type: string
+        description: |
+            Value for the Nextflow `publishDir` mode parameter.
+            Available: symlink, rellink, link, copy, copyNoFollow, move.
+    - enable_conda:
+        type: boolean
+        description: |
+            Run the module with Conda using the software specified
+            via the `conda` directive
+    - singularity_pull_docker_container:
+        type: boolean
+        description: |
+            Instead of directly downloading Singularity images for use with Singularity,
+            force the workflow to pull and convert Docker containers instead.
+input:
+    - fasta:
+        type: file
+        description: Input fasta file
+        pattern: "*.{fasta,fa}"
+# Patterns below use the same globs as the module's `output:` declarations
+output:
+    - dict:
+        type: file
+        description: gatk dictionary file
+        pattern: "*.dict"
+    - version:
+        type: file
+        description: File containing software version
+        pattern: "*.version.txt"
+authors:
+    - "@maxulysse"
--- a/tests/software/gatk/createsequencedictionary/main.nf
+++ b/tests/software/gatk/createsequencedictionary/main.nf
@ -0,0 +1,9 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { GATK_CREATESEQUENCEDICTIONARY } from '../../../../software/gatk/createsequencedictionary/main.nf' addParams( options: [:] )
+
+workflow test_gatk_createsequencedictionary {
+    // Reference FASTA from the repository test data, resolved relative to the launch directory
+    def fasta = file("${launchDir}/tests/data/fasta/E_coli/NC_010473.fa", checkIfExists: true)
+
+    GATK_CREATESEQUENCEDICTIONARY ( fasta )
+}
--- a/tests/software/gatk/createsequencedictionary/test.yml
+++ b/tests/software/gatk/createsequencedictionary/test.yml
@ -0,0 +1,8 @@
+- name: gatk createsequencedictionary
+  command: nextflow run ./tests/software/gatk/createsequencedictionary -entry test_gatk_createsequencedictionary -c tests/config/nextflow.config
+  tags:
+    - gatk
+    - gatk_createsequencedictionary
+  files:
+    - path: output/gatk/NC_010473.dict
+      md5sum: 30b5f2501f6eb68b0270cc5626f5be4c