Merge pull request #1675 from FriederikeHanssen/gatk_cnnscore

Gatk cnnscore
This commit is contained in:
FriederikeHanssen 2022-05-18 15:44:52 +02:00 committed by GitHub
commit c8ccfe3710
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 165 additions and 0 deletions

View file

@ -0,0 +1,57 @@
// Runs GATK4 CNNScoreVariants on one VCF per sample and emits the tool's
// `${prefix}.vcf.gz` output together with a versions.yml record.
//
// Inputs
//   meta          : sample map; `meta.id` is the task tag and the default output prefix
//   vcf           : VCF passed as --variant
//   aligned_input : optional; passed as --input when non-empty
//   intervals     : optional; passed as --intervals when non-empty
//   fasta         : reference passed as --reference
//   fai, dict     : staged into the work dir but never named on the command line
//                   (presumably located next to the reference by GATK - TODO confirm)
//   architecture  : optional; passed as --architecture when non-empty
//   weights       : optional; passed as --weights when non-empty
// Outputs
//   vcf      : tuple of meta and the files matching *.vcf.gz
//   versions : versions.yml holding the version parsed from `gatk --version`
process GATK4_CNNSCOREVARIANTS {
    tag "$meta.id"
    label 'process_low'

    //Conda is not supported at the moment: https://github.com/broadinstitute/gatk/issues/7811
    if (params.enable_conda) {
        exit 1, "Conda environments cannot be used for GATK4/CNNScoreVariants at the moment. Please use docker or singularity containers."
    }

    container 'broadinstitute/gatk:4.2.6.1' //Biocontainers is missing a package

    input:
    tuple val(meta), path(vcf), path(aligned_input), path(intervals)
    path fasta
    path fai
    path dict
    path architecture
    path weights

    output:
    tuple val(meta), path("*.vcf.gz"), emit: vcf
    path "versions.yml"              , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args   = task.ext.args   ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"

    // Optional inputs arrive as empty values when not supplied; each one is turned
    // into either a complete CLI flag or an empty string. The rendered fragments
    // get their own `*_command` names so they never shadow the process inputs
    // they are built from.
    def aligned_input_command = aligned_input ? "--input $aligned_input"       : ""
    def interval_command      = intervals     ? "--intervals $intervals"       : ""
    def architecture_command  = architecture  ? "--architecture $architecture" : ""
    def weights_command       = weights       ? "--weights $weights"           : ""

    // JVM heap size in GB: the task's memory request when set, otherwise 3.
    def avail_mem = 3
    if (!task.memory) {
        log.info '[GATK CnnScoreVariants] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
    } else {
        avail_mem = task.memory.giga
    }
    """
    gatk --java-options "-Xmx${avail_mem}g" CNNScoreVariants \\
        --variant $vcf \\
        --output ${prefix}.vcf.gz \\
        --reference $fasta \\
        $interval_command \\
        $aligned_input_command \\
        $architecture_command \\
        $weights_command \\
        --tmp-dir . \\
        $args

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
    END_VERSIONS
    """
}

View file

@ -0,0 +1,72 @@
# Module metadata for gatk4_cnnscorevariants: describes the wrapped tool and
# every input/output channel element of the GATK4_CNNSCOREVARIANTS process.
name: "gatk4_cnnscorevariants"
description: Apply a Convolutional Neural Net to filter annotated variants
keywords:
  - gatk4_cnnscorevariants
  - gatk4
  - variants
tools:
  - gatk4:
      description: |
        Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
        with a primary focus on variant discovery and genotyping. Its powerful processing engine
        and high-performance computing features make it capable of taking on projects of any size.
      homepage: https://gatk.broadinstitute.org/hc/en-us
      documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672
      doi: 10.1158/1538-7445.AM2017-3590
      licence: ["Apache-2.0"]

input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - vcf:
      type: file
      description: VCF file
      pattern: "*.vcf.gz"
  - aligned_input:
      type: file
      description: BAM/CRAM file from alignment (optional)
      pattern: "*.{bam,cram}"
  - intervals:
      type: file
      description: Bed file with the genomic regions included in the library (optional)
  - fasta:
      type: file
      description: The reference fasta file
      pattern: "*.fasta"
  - fai:
      type: file
      description: Index of reference fasta file
      pattern: "*.fasta.fai"
  - dict:
      type: file
      description: GATK sequence dictionary
      pattern: "*.dict"
  - architecture:
      type: file
      description: Neural Net architecture configuration json file (optional)
      pattern: "*.json"
  - weights:
      type: file
      description: Keras model HD5 file with neural net weights. (optional)
      pattern: "*.hd5"

output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  # The process writes `${prefix}.vcf.gz` and collects `*.vcf.gz`, so the
  # documented pattern carries the .gz suffix as well.
  - vcf:
      type: file
      description: Annotated VCF file
      pattern: "*.vcf.gz"

authors:
  - "@FriederikeHanssen"

View file

@ -731,6 +731,10 @@ gatk4/calculatecontamination:
- modules/gatk4/calculatecontamination/**
- tests/modules/gatk4/calculatecontamination/**
# Paths associated with the gatk4/cnnscorevariants tag: the module sources and its tests.
gatk4/cnnscorevariants:
- modules/gatk4/cnnscorevariants/**
- tests/modules/gatk4/cnnscorevariants/**
gatk4/combinegvcfs:
- modules/gatk4/combinegvcfs/**
- tests/modules/gatk4/combinegvcfs/**

View file

@ -0,0 +1,18 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { GATK4_CNNSCOREVARIANTS } from '../../../../modules/gatk4/cnnscorevariants/main.nf'
// Minimal run of GATK4_CNNSCOREVARIANTS: a single test VCF scored against the
// test genome, with every optional input left empty.
workflow test_gatk4_cnnscorevariants {

    test_vcf = file(params.test_data['homo_sapiens']['illumina']['test_genome_vcf'], checkIfExists: true)

    // Tuple layout expected by the process: [ meta, vcf, aligned_input, intervals ].
    // The two trailing empty lists stand in for the unused optional files.
    sample = [
        [ id:'test' ], // meta map
        test_vcf,
        [],
        []
    ]

    reference       = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)
    reference_index = file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true)
    reference_dict  = file(params.test_data['homo_sapiens']['genome']['genome_dict'], checkIfExists: true)

    // The last two arguments (architecture, weights) are also left empty.
    GATK4_CNNSCOREVARIANTS ( sample, reference, reference_index, reference_dict, [], [] )
}

View file

@ -0,0 +1,5 @@
process {

    // Publish every process's outputs under <outdir>/<tool>, where <tool> is the
    // lower-cased first '_'-separated token of the process name after any
    // ':'-separated workflow prefix has been dropped.
    publishDir = {
        def process_name = task.process.tokenize(':')[-1]
        def tool_dir     = process_name.tokenize('_')[0].toLowerCase()
        "${params.outdir}/${tool_dir}"
    }

}

View file

@ -0,0 +1,9 @@
# Test case: runs the test_gatk4_cnnscorevariants entry workflow and checks
# that the published VCF contains the expected header text.
- name: gatk4 cnnscorevariants test_gatk4_cnnscorevariants
  command: >-
    nextflow run ./tests/modules/gatk4/cnnscorevariants
    -entry test_gatk4_cnnscorevariants
    -c ./tests/config/nextflow.config
    -c ./tests/modules/gatk4/cnnscorevariants/nextflow.config
  tags:
    - gatk4
    - gatk4/cnnscorevariants
  files:
    - path: output/gatk4/test.vcf.gz
      contains:
        - '##ALT=<ID=NON_REF,Description='