From e0fcbb428dc772e1600ae18d2c527796fbd27646 Mon Sep 17 00:00:00 2001
From: Rike <friederike.hanssen@qbic.uni-tuebingen.de>
Date: Tue, 17 May 2022 19:09:46 +0200
Subject: [PATCH] Add CNNScoreVariants

---
 modules/gatk4/cnnscorevariants/main.nf        | 54 ++++++++++++++
 modules/gatk4/cnnscorevariants/meta.yml       | 72 +++++++++++++++++++
 tests/modules/gatk4/cnnscorevariants/main.nf  | 18 +++++
 .../gatk4/cnnscorevariants/nextflow.config    |  5 ++
 tests/modules/gatk4/cnnscorevariants/test.yml |  8 +++
 5 files changed, 157 insertions(+)
 create mode 100644 modules/gatk4/cnnscorevariants/main.nf
 create mode 100644 modules/gatk4/cnnscorevariants/meta.yml
 create mode 100644 tests/modules/gatk4/cnnscorevariants/main.nf
 create mode 100644 tests/modules/gatk4/cnnscorevariants/nextflow.config
 create mode 100644 tests/modules/gatk4/cnnscorevariants/test.yml

diff --git a/modules/gatk4/cnnscorevariants/main.nf b/modules/gatk4/cnnscorevariants/main.nf
new file mode 100644
index 00000000..35c7853a
--- /dev/null
+++ b/modules/gatk4/cnnscorevariants/main.nf
@@ -0,0 +1,64 @@
+// Runs GATK4 CNNScoreVariants on a VCF and emits the resulting .vcf.gz together with a versions.yml.
+process GATK4_CNNSCOREVARIANTS {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda (params.enable_conda ? "bioconda::gatk4=4.2.6.1" : null)
+    container 'broadinstitute/gatk:4.2.6.1'
+
+    input:
+    // Pass [] for aligned_input or intervals to leave that option off the command line.
+    tuple val(meta), path(vcf), path(aligned_input), path(intervals)
+    path fasta
+    path fai
+    path dict
+    // Pass [] for architecture or weights to leave that option off the command line.
+    path architecture
+    path weights
+
+    output:
+    tuple val(meta), path("*.vcf.gz"), emit: vcf
+    path "versions.yml"              , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    // Fail early instead of letting GATK write its output over an identically named input VCF.
+    if ("$vcf" == "${prefix}.vcf.gz") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+
+    // Each optional input expands to its flag, or to an empty string when the input is empty.
+    // The locals carry a _command suffix so they do not redeclare the process inputs.
+    def aligned_input_command = aligned_input ? "--input $aligned_input" : ""
+    def interval_command = intervals ? "--intervals $intervals" : ""
+    def architecture_command = architecture ? "--architecture $architecture" : ""
+    def weights_command = weights ? "--weights $weights" : ""
+
+    // Java heap size in GB: task.memory when it is set, otherwise 3.
+    def avail_mem = 3
+    if (!task.memory) {
+        log.info '[GATK CnnScoreVariants] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = task.memory.giga
+    }
+    """
+    gatk --java-options "-Xmx${avail_mem}g" CNNScoreVariants \\
+        --variant $vcf \\
+        --output ${prefix}.vcf.gz \\
+        --reference $fasta \\
+        $interval_command \\
+        $aligned_input_command \\
+        $architecture_command \\
+        $weights_command \\
+        --tmp-dir . \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/gatk4/cnnscorevariants/meta.yml b/modules/gatk4/cnnscorevariants/meta.yml
new file mode 100644
index 00000000..1d47e6e7
--- /dev/null
+++ b/modules/gatk4/cnnscorevariants/meta.yml
@@ -0,0 +1,72 @@
+name: "gatk4_cnnscorevariants"
+description: Apply a Convolutional Neural Net to filter annotated variants
+keywords:
+  - gatk4_cnnscorevariants
+  - gatk4
+  - variants
+tools:
+  - gatk4:
+      description: |
+        Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
+        with a primary focus on variant discovery and genotyping. Its powerful processing engine
+        and high-performance computing features make it capable of taking on projects of any size.
+      homepage: https://gatk.broadinstitute.org/hc/en-us
+      documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672
+      doi: 10.1158/1538-7445.AM2017-3590
+      licence: ["Apache-2.0"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - vcf:
+      type: file
+      description: VCF file
+      pattern: "*.vcf.gz"
+  - aligned_input:
+      type: file
+      description: BAM/CRAM file from alignment (optional)
+      pattern: "*.{bam,cram}"
+  - intervals:
+      type: file
+      description: Bed file with the genomic regions included in the library (optional)
+  - fasta:
+      type: file
+      description: The reference fasta file
+      pattern: "*.fasta"
+  - fai:
+      type: file
+      description: Index of reference fasta file
+      pattern: "*.fasta.fai"
+  - dict:
+      type: file
+      description: GATK sequence dictionary
+      pattern: "*.dict"
+  - architecture:
+      type: file
+      description: Neural Net architecture configuration json file (optional)
+      pattern: "*.json"
+  - weights:
+      type: file
+      description: Keras model HD5 file with neural net weights. (optional)
+      pattern: "*.hd5"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - vcf:
+      type: file
+      description: Annotated VCF file
+      pattern: "*.vcf.gz"
+
+authors:
+  - "@FriederikeHanssen"
diff --git a/tests/modules/gatk4/cnnscorevariants/main.nf b/tests/modules/gatk4/cnnscorevariants/main.nf
new file mode 100644
index 00000000..d03acb78
--- /dev/null
+++ b/tests/modules/gatk4/cnnscorevariants/main.nf
@@ -0,0 +1,21 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { GATK4_CNNSCOREVARIANTS } from '../../../../modules/gatk4/cnnscorevariants/main.nf'
+
+// Runs GATK4_CNNSCOREVARIANTS on the test VCF with every optional input left empty.
+workflow test_gatk4_cnnscorevariants {
+
+    // Reference files
+    dict  = file(params.test_data['homo_sapiens']['genome']['genome_dict'], checkIfExists: true)
+    fai   = file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true)
+    fasta = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)
+
+    // [ meta, vcf, aligned_input, intervals ]
+    vcf   = file(params.test_data['homo_sapiens']['illumina']['test_genome_vcf'], checkIfExists: true)
+    input = [ [ id:'test' ], vcf, [], [] ]
+
+    // The two trailing empty lists stand in for architecture and weights
+    GATK4_CNNSCOREVARIANTS ( input, fasta, fai, dict, [], [] )
+}
diff --git a/tests/modules/gatk4/cnnscorevariants/nextflow.config b/tests/modules/gatk4/cnnscorevariants/nextflow.config
new file mode 100644
index 00000000..50f50a7a
--- /dev/null
+++ b/tests/modules/gatk4/cnnscorevariants/nextflow.config
@@ -0,0 +1,5 @@
+process {
+    // Publish under <outdir>/<first '_'-separated token of the unqualified process name, lowercased>
+    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
+
+}
diff --git a/tests/modules/gatk4/cnnscorevariants/test.yml b/tests/modules/gatk4/cnnscorevariants/test.yml
new file mode 100644
index 00000000..1c25e4d0
--- /dev/null
+++ b/tests/modules/gatk4/cnnscorevariants/test.yml
@@ -0,0 +1,10 @@
+- name: gatk4 cnnscorevariants test_gatk4_cnnscorevariants
+  command: nextflow run ./tests/modules/gatk4/cnnscorevariants -entry test_gatk4_cnnscorevariants -c ./tests/config/nextflow.config -c ./tests/modules/gatk4/cnnscorevariants/nextflow.config
+  tags:
+    - gatk4
+    - gatk4/cnnscorevariants
+  files:
+    # The module writes <prefix>.vcf.gz and the prefix defaults to meta.id ('test')
+    - path: output/gatk4/test.vcf.gz
+      contains:
+        - '##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">'