Add module: gatk4/leftalignandtrimvariants (#1808)

* Added: gatk4/leftalignandtrimvariants

Additions:
 - GATK4/LeftAlignAndTrimVariants module
 - Use SARS-CoV-2 test data as this normalises a larger INDEL correctly.

Fixes #1801

* fixup: Added index to output spec

* fixup: Pattern of tbi output corrected to 'tbi'

* gatk4/leftalignandtrimvariants: Added intervals

Changes:
 - gatk4/leftalignandtrimvariants now supports optional interval as BED
 file
 - Tests added with and without interval. Note: the test BED file excludes all
 variants, so no variants are actually normalised.

Fixes #1801

* fixup: leftalignandtrimvariants vcf->tbi fix

* fixup: gatk4/leftalignandtrimvariants Intervals added to meta.yml
This commit is contained in:
Adam Talbot 2022-06-28 09:44:08 +01:00 committed by GitHub
parent b573ff053e
commit 009f7c691c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 187 additions and 0 deletions

View file

@ -0,0 +1,48 @@
// Runs GATK4 LeftAlignAndTrimVariants on a VCF and emits the resulting bgzipped VCF and its index.
//
// Inputs:
//   meta      - map; only meta.id is read here (process tag and default output prefix)
//   vcf       - VCF passed to --variant
//   tbi       - index staged next to the VCF (not referenced on the command line)
//   intervals - optional; passed to --intervals when truthy, supply [] to omit the option
//   fasta     - reference passed to --reference
//   fai, dict - staged next to the reference (not referenced on the command line)
//
// Outputs:
//   vcf      - [ meta, files matching *.vcf.gz ]
//   tbi      - [ meta, files matching *.tbi ] (presumably the index GATK writes beside the output - confirm)
//   versions - versions.yml holding the GATK version parsed from `gatk --version`
process GATK4_LEFTALIGNANDTRIMVARIANTS {
    tag "$meta.id"
    label 'process_low'

    conda (params.enable_conda ? "bioconda::gatk4=4.2.6.1" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/gatk4:4.2.6.1--hdfd78af_0':
        'quay.io/biocontainers/gatk4:4.2.6.1--hdfd78af_0' }"

    input:
    tuple val(meta), path(vcf), path(tbi), path(intervals)
    path fasta
    path fai
    path dict

    output:
    tuple val(meta), path("*.vcf.gz"), emit: vcf
    tuple val(meta), path("*.tbi")   , emit: tbi
    path "versions.yml"              , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // An empty/absent intervals input drops the option entirely.
    def interval_command = intervals ? "--intervals $intervals" : ""

    // The output is written into the task directory next to the staged input VCF.
    // With the default prefix (meta.id) the two names can coincide, so fail early
    // with an actionable message instead of letting GATK write over its own input.
    if ("$vcf" == "${prefix}.vcf.gz") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"

    // Java heap size in GB: the task's memory directive when set, otherwise 3.
    def avail_mem = 3
    if (!task.memory) {
        log.info '[GATK LeftAlignAndTrimVariants] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
    } else {
        avail_mem = task.memory.toGiga()
    }
    """
    gatk --java-options "-Xmx${avail_mem}G" LeftAlignAndTrimVariants \\
        $interval_command \\
        --variant $vcf \\
        --output ${prefix}.vcf.gz \\
        --reference $fasta \\
        --tmp-dir . \\
        $args

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
    END_VERSIONS
    """
}

View file

@ -0,0 +1,69 @@
# Module metadata for GATK4_LEFTALIGNANDTRIMVARIANTS: describes the tool and the
# process inputs/outputs. File patterns mirror the globs and examples of the process.
name: "gatk4_leftalignandtrimvariants"
description: Left align and trim variants using GATK4 LeftAlignAndTrimVariants.
keywords:
  - normalize
  - norm
  - vcf
tools:
  - gatk4:
      description: |
        Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
        with a primary focus on variant discovery and genotyping. Its powerful processing engine
        and high-performance computing features make it capable of taking on projects of any size.
      homepage: https://gatk.broadinstitute.org/hc/en-us
      documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s
      doi: 10.1158/1538-7445.AM2017-3590
      licence: ["Apache-2.0"]
input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - vcf:
      type: file
      description: |
        The vcf file to be normalized
        e.g. 'file1.vcf.gz'
      pattern: "*.vcf.gz"
  - tbi:
      type: file
      description: |
        Index of the vcf file to be normalized
        e.g. 'file1.vcf.gz.tbi'
      pattern: "*.vcf.gz.tbi"
  # Supply an empty list ([]) in place of the file to run without intervals.
  - intervals:
      type: file
      description: BED file of genomic regions passed to --intervals to restrict processing (optional)
      pattern: "*.bed"
  - fasta:
      type: file
      description: The reference fasta file
      pattern: "*.fasta"
  - fai:
      type: file
      description: Index of reference fasta file
      pattern: "*.fasta.fai"
  - dict:
      type: file
      description: GATK sequence dictionary
      pattern: "*.dict"
output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - vcf:
      type: file
      description: VCF normalized output file
      pattern: "*.vcf.gz"
  - tbi:
      type: file
      description: Tbi index for VCF file
      pattern: "*.tbi"
authors:
  - "@adamrtalbot"

View file

@ -863,6 +863,10 @@ gatk4/learnreadorientationmodel:
- modules/gatk4/learnreadorientationmodel/** - modules/gatk4/learnreadorientationmodel/**
- tests/modules/gatk4/learnreadorientationmodel/** - tests/modules/gatk4/learnreadorientationmodel/**
# Path globs associated with the gatk4/leftalignandtrimvariants tag:
# the module sources and the module's tests.
gatk4/leftalignandtrimvariants:
  - modules/gatk4/leftalignandtrimvariants/**
  - tests/modules/gatk4/leftalignandtrimvariants/**
# Path globs associated with the gatk4/markduplicates tag:
# the module sources and the module's tests.
gatk4/markduplicates:
  - modules/gatk4/markduplicates/**
  - tests/modules/gatk4/markduplicates/**

View file

@ -0,0 +1,35 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { GATK4_LEFTALIGNANDTRIMVARIANTS } from '../../../../modules/gatk4/leftalignandtrimvariants/main.nf'
// Runs GATK4_LEFTALIGNANDTRIMVARIANTS on the sarscov2 test VCF with a BED
// interval file supplied as the fourth element of the input tuple.
workflow test_gatk4_leftalignandtrimvariants_interval {

    // Reference genome together with its index and sequence dictionary.
    reference_fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
    reference_fai   = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true)
    reference_dict  = file(params.test_data['sarscov2']['genome']['genome_dict'], checkIfExists: true)

    // Tuple layout expected by the process: [ meta, vcf, tbi, intervals ].
    variants_with_bed = [
        [ id:'test' ], // meta map
        file(params.test_data['sarscov2']['illumina']['test_vcf_gz'], checkIfExists: true),
        file(params.test_data['sarscov2']['illumina']['test_vcf_gz_tbi'], checkIfExists: true),
        file(params.test_data['sarscov2']['genome']['test_bed'], checkIfExists: true)
    ]

    GATK4_LEFTALIGNANDTRIMVARIANTS ( variants_with_bed, reference_fasta, reference_fai, reference_dict )
}
// Runs GATK4_LEFTALIGNANDTRIMVARIANTS on the sarscov2 test VCF without an
// interval file: the intervals slot of the input tuple is an empty list.
workflow test_gatk4_leftalignandtrimvariants_no_interval {

    // Reference genome together with its index and sequence dictionary.
    reference_fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
    reference_fai   = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true)
    reference_dict  = file(params.test_data['sarscov2']['genome']['genome_dict'], checkIfExists: true)

    // Tuple layout expected by the process: [ meta, vcf, tbi, intervals ].
    variants_only = [
        [ id:'test' ], // meta map
        file(params.test_data['sarscov2']['illumina']['test_vcf_gz'], checkIfExists: true),
        file(params.test_data['sarscov2']['illumina']['test_vcf_gz_tbi'], checkIfExists: true),
        [] // no intervals
    ]

    GATK4_LEFTALIGNANDTRIMVARIANTS ( variants_only, reference_fasta, reference_fai, reference_dict )
}

View file

@ -0,0 +1,9 @@
// Process-scope settings for the module test: where results are published and
// which extra options the GATK4_LEFTALIGNANDTRIMVARIANTS process receives.
process {

    // Publish under <outdir>/<tool>, where <tool> is the lower-cased text before
    // the first '_' in the last ':'-separated component of the process name.
    publishDir = {
        def simple_name = task.process.tokenize(':')[-1]
        def tool_dir    = simple_name.tokenize('_')[0].toLowerCase()
        "${params.outdir}/${tool_dir}"
    }

    withName: 'GATK4_LEFTALIGNANDTRIMVARIANTS' {
        // Extra command-line options read by the process via task.ext.args.
        ext.args   = "--split-multi-allelics --dont-trim-alleles --keep-original-ac"
        // Output prefix differs from meta.id, so outputs are named <id>.normalised.*
        ext.prefix = { "${meta.id}.normalised" }
    }
}

View file

@ -0,0 +1,22 @@
# Interval run: the normalised VCF must contain the column header line and a
# .tbi index must exist next to it.
- name: gatk4 leftalignandtrimvariants test_gatk4_leftalignandtrimvariants_interval
  command: >-
    nextflow run ./tests/modules/gatk4/leftalignandtrimvariants
    -entry test_gatk4_leftalignandtrimvariants_interval
    -c ./tests/config/nextflow.config
    -c ./tests/modules/gatk4/leftalignandtrimvariants/nextflow.config
  tags:
    - gatk4
    - gatk4/leftalignandtrimvariants
  files:
    - path: output/gatk4/test.normalised.vcf.gz
      contains:
        - "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
    - path: output/gatk4/test.normalised.vcf.gz.tbi
# No-interval run: besides the column header line, the normalised VCF must
# contain the expected record at MT192765.1 position 10502; a .tbi index must
# exist next to it.
- name: gatk4 leftalignandtrimvariants test_gatk4_leftalignandtrimvariants_no_interval
  command: >-
    nextflow run ./tests/modules/gatk4/leftalignandtrimvariants
    -entry test_gatk4_leftalignandtrimvariants_no_interval
    -c ./tests/config/nextflow.config
    -c ./tests/modules/gatk4/leftalignandtrimvariants/nextflow.config
  tags:
    - gatk4
    - gatk4/leftalignandtrimvariants
  files:
    - path: output/gatk4/test.normalised.vcf.gz
      contains:
        - "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
        - "MT192765.1\t10502\t.\tTAGATTATGACTGTGTCTCTTTTTGTTACATGCACCA\tTAGAT"
    - path: output/gatk4/test.normalised.vcf.gz.tbi