Gatk4 combinegvcfs (#1342)

* add gatk4/combinegvcfs module

* update gatk4/combinegvcfs

* loop to create a string adding -V to each vcf file

* add contains for variable md5

* rm whitespace

* meta in output

* fix indentations

* fix indentations

* move tmpdir to args and update conda version

Co-authored-by: Peri <rrx8@cdc.gov>
Co-authored-by: Maxime U. Garcia <max.u.garcia@gmail.com>
This commit is contained in:
Sateesh 2022-02-23 10:29:29 -05:00 committed by GitHub
parent 938387d10d
commit a25423dbb9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 145 additions and 0 deletions

View file

@ -0,0 +1,47 @@
// Combines the gVCF files supplied for one `meta` entry into a single
// `<prefix>.combined.g.vcf.gz` by running `gatk CombineGVCFs`.
//
// Inputs:
//   meta       - map of sample information; `meta.id` is used for the task tag
//                and as the default output prefix
//   vcf        - gVCF file(s); each one is passed to the tool with its own `-V`
//   vcf_idx    - index file(s) for `vcf`; staged but not referenced on the command line
//   fasta      - reference FASTA, passed with `-R`
//   fasta_fai  - staged but not referenced on the command line
//   fasta_dict - staged but not referenced on the command line
//
// Outputs:
//   combined_gvcf - tuple of `meta` and the combined gVCF
//   versions      - versions.yml recording the GATK version that ran
process GATK4_COMBINEGVCFS {
    tag "$meta.id"
    label 'process_low'

    conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/gatk4:4.2.5.0--hdfd78af_0' :
        'quay.io/biocontainers/gatk4:4.2.5.0--hdfd78af_0' }"

    input:
    tuple val(meta), path(vcf), path(vcf_idx)
    path (fasta)
    path (fasta_fai)
    path (fasta_dict)

    output:
    tuple val(meta), path("*.combined.g.vcf.gz"), emit: combined_gvcf
    path "versions.yml"                         , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"

    // JVM heap size in megabytes. Falls back to 3 GB (3072 MB) when the task
    // declares no memory requirement.
    def avail_mem = 3072
    if (!task.memory) {
        log.info '[GATK COMBINEGVCFS] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
    } else {
        // Give the heap only 80% of the allocation so the JVM's non-heap memory
        // still fits inside the task limit. Working in megabytes also avoids
        // truncating a sub-1 GB allocation down to "-Xmx0g".
        avail_mem = (task.memory.mega * 0.8).intValue()
    }

    def input_files = vcf.collect{"-V ${it}"}.join(' ') // add '-V' to each vcf file
    """
    gatk \\
        --java-options "-Xmx${avail_mem}M" \\
        CombineGVCFs \\
        -R ${fasta} \\
        -O ${prefix}.combined.g.vcf.gz \\
        ${args} \\
        ${input_files}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
    END_VERSIONS
    """
}

View file

@ -0,0 +1,54 @@
# Module description for gatk4/combinegvcfs.
# Input and output names below mirror the channel names declared in the process.
name: gatk4_combinegvcfs
description: Combine per-sample gVCF files produced by HaplotypeCaller into a multi-sample gVCF file
keywords:
  - gvcf
  - gatk4
  - vcf
  - combinegvcfs
  - Short_Variant_Discovery
tools:
  - gatk4:
      description: Genome Analysis Toolkit (GATK4). Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
        with a primary focus on variant discovery and genotyping. Its powerful processing engine
        and high-performance computing features make it capable of taking on projects of any size.
      homepage: https://gatk.broadinstitute.org/hc/en-us
      documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037593911-CombineGVCFs
      tool_dev_url: https://github.com/broadinstitute/gatk
      doi: 10.1158/1538-7445.AM2017-3590
      licence: ['Apache-2.0']

input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - fasta:
      type: file
      description: The reference fasta file
      pattern: "*.fasta"
  - fasta_fai:
      type: file
      description: FASTA index file
      pattern: "*.{fai}"
  - fasta_dict:
      type: file
      description: FASTA dictionary file
      pattern: "*.{dict}"
  - vcf:
      type: file
      description: Compressed VCF files
      pattern: "*.vcf.gz"
  # The index belongs to the VCF, so it is matched by VCF index extensions
  # rather than by the FASTA index extension.
  - vcf_idx:
      type: file
      description: VCF Index file
      pattern: "*.{idx,tbi}"

output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - combined_gvcf:
      type: file
      description: Compressed Combined GVCF file
      pattern: "*.combined.g.vcf.gz"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"

authors:
  - "@sateeshperi"
  - "@mjcipriano"
  - "@hseabolt"

View file

@ -576,6 +576,10 @@ gatk4/calculatecontamination:
- modules/gatk4/calculatecontamination/**
- tests/modules/gatk4/calculatecontamination/**
gatk4/combinegvcfs:
- modules/gatk4/combinegvcfs/**
- tests/modules/gatk4/combinegvcfs/**
gatk4/createsequencedictionary:
- modules/gatk4/createsequencedictionary/**
- tests/modules/gatk4/createsequencedictionary/**

View file

@ -0,0 +1,24 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { GATK4_COMBINEGVCFS } from '../../../../modules/gatk4/combinegvcfs/main.nf'
// Runs GATK4_COMBINEGVCFS on the `test` and `test2` genome VCFs from the
// homo_sapiens test data set, together with the genome FASTA, its index and
// its sequence dictionary.
workflow test_gatk4_combinegvcfs {

    def illumina_data = params.test_data['homo_sapiens']['illumina']
    def genome_data   = params.test_data['homo_sapiens']['genome']

    // VCFs to combine, and their index files in the same order
    def vcf_files = [
        file(illumina_data['test_genome_vcf'],  checkIfExists: true),
        file(illumina_data['test2_genome_vcf'], checkIfExists: true)
    ]
    def vcf_indices = [
        file(illumina_data['test_genome_vcf_idx'],  checkIfExists: true),
        file(illumina_data['test2_genome_vcf_idx'], checkIfExists: true)
    ]

    // Tuple layout expected by the process: [ meta map, vcfs, vcf indices ]
    def sample_input = [ [ id:'test', single_end:false ], vcf_files, vcf_indices ]

    def reference       = file(genome_data['genome_fasta'],     checkIfExists: true)
    def reference_index = file(genome_data['genome_fasta_fai'], checkIfExists: true)
    def reference_dict  = file(genome_data['genome_dict'],      checkIfExists: true)

    GATK4_COMBINEGVCFS ( sample_input, reference, reference_index, reference_dict )
}

View file

@ -0,0 +1,6 @@
// Process defaults for this test configuration.
process {

    // Value exposed to processes as `task.ext.args`.
    ext.args = '--tmp-dir .'

    // Publish under a directory named after the process: the last ':'-separated
    // component of the process name, cut at its first '_' and lower-cased.
    publishDir = {
        def simple_name = task.process.tokenize(':')[-1]
        def dir_name    = simple_name.tokenize('_')[0].toLowerCase()
        "${params.outdir}/${dir_name}"
    }
}

View file

@ -0,0 +1,10 @@
# Test case: runs the test_gatk4_combinegvcfs entry workflow and checks the
# files it publishes under output/gatk4/.
- name: gatk4 combinegvcfs test_gatk4_combinegvcfs
command: nextflow run tests/modules/gatk4/combinegvcfs -entry test_gatk4_combinegvcfs -c tests/config/nextflow.config
tags:
- gatk4
- gatk4/combinegvcfs
files:
# The combined gVCF is checked by content rather than checksum because its
# md5 is variable.
- path: output/gatk4/test.combined.g.vcf.gz
contains: ['VCFv4.2']
# versions.yml is checked by checksum.
- path: output/gatk4/versions.yml
md5sum: 49d9c467f84b6a99a4da3ef161af26bd