Variant recalibration (#1885)

* update tests

* update

* update

* make the manta inputs consistent for germline/somatic/tumoronly

* match chromosomes to cram file (chr21)

* undo genotypegvcfs

* undo genotypegvcfs

* update VariantRecalibrator

* lint

* add '--resource:' tag

Co-authored-by: Smith Nicholas <smith@in.tum.de>
Co-authored-by: FriederikeHanssen <Friederike.hanssen@qbic.uni-tuebingen.de>
This commit is contained in:
nickhsmith 2022-07-15 09:21:34 +02:00 committed by GitHub
parent 9deff5222e
commit edfe28a5e0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 42 additions and 52 deletions

View file

@ -8,8 +8,10 @@ process GATK4_VARIANTRECALIBRATOR {
'quay.io/biocontainers/gatk4:4.2.6.1--hdfd78af_0' }"
input:
tuple val(meta), path(vcf), path(tbi)
tuple path(vcfs), path(tbis), val(labels)
tuple val(meta), path(vcf), path(tbi) // input vcf and tbi of variants to recalibrate
path resource_vcf // resource vcf
path resource_tbi // resource tbi
val labels // string (or list of strings) containing dedicated resource labels already formatted with '--resource:' tag
path fasta
path fai
path dict
@ -28,7 +30,7 @@ process GATK4_VARIANTRECALIBRATOR {
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def reference_command = fasta ? "--reference $fasta " : ''
def resource_command = labels.collect{"--resource:$it"}.join(' ')
def labels_command = labels.join(' ')
def avail_mem = 3
if (!task.memory) {
@ -42,8 +44,8 @@ process GATK4_VARIANTRECALIBRATOR {
--output ${prefix}.recal \\
--tranches-file ${prefix}.tranches \\
$reference_command \\
$resource_command \\
--tmp-dir . \\
$labels_command \\
$args
cat <<-END_VERSIONS > versions.yml

View file

@ -33,6 +33,17 @@ input:
type: file
description: tbi file matching with -vcf
pattern: "*.vcf.gz.tbi"
- resource_vcf:
type: file
description: all resource vcf files that are used with the corresponding '--resource' label
pattern: "*.vcf.gz"
- resource_tbi:
type: file
description: all resource tbi files that are used with the corresponding '--resource' label
pattern: "*.vcf.gz.tbi"
- labels:
type: string
description: necessary arguments for GATK VariantRecalibrator. Specified to directly match the resources provided. More information can be found at https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator
- fasta:
type: file
description: The reference fasta file
@ -45,34 +56,6 @@ input:
type: file
description: GATK sequence dictionary
pattern: "*.dict"
- allelespecific:
type: boolean
description: specify whether to use allele specific annotations
pattern: "{true,false}"
- resvcfs:
type: list
description: resource files to be used as truth, training and known sites resources, this imports the files into the module, file names are specified again in the resource_labels to be called via the command.
pattern: "*/hapmap_3.3.hg38_chr21.vcf.gz"
- restbis:
type: list
description: tbis for the corresponding vcfs files to be used as truth, training and known resources.
pattern: "*/hapmap_3.3.hg38_chr21.vcf.gz.tbi"
- reslabels:
type: list
description: labels for the resource files to be used as truth, training and known sites resources, label should include an identifier,which kind of resource(s) it is, prior value and name of the file.
pattern: "hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.hg38_chr21.vcf.gz"
- annotation:
type: list
description: specify which annotations should be used for calculations.
pattern: "['QD', 'MQ', 'FS', 'SOR']"
- mode:
type: string
description: specifies which recalibration mode to employ (SNP is default, BOTH is intended for testing only)
pattern: "{SNP,INDEL,BOTH}"
- rscript:
type: boolean
description: specify whether to generate rscript.plot output file
pattern: "{true,false}"
output:
- recal:
type: file
@ -96,3 +79,4 @@ output:
pattern: "*.versions.yml"
authors:
- "@GCJMackenzie"
- "@nickhsmith"

View file

@ -12,28 +12,30 @@ workflow test_gatk4_variantrecalibrator {
file(params.test_data['homo_sapiens']['illumina']['test2_haplotc_ann_vcf_gz_tbi'], checkIfExists: true)
]
resources = [[
resources_vcf = [
file(params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_omni2_5_hg38_21_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_phase1_snps_hg38_21_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'], checkIfExists: true)
], [
]
resources_tbi = [
file(params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz_tbi'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_omni2_5_hg38_21_vcf_gz_tbi'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_phase1_snps_hg38_21_vcf_gz_tbi'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz_tbi'], checkIfExists: true)
], [
'hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.hg38.vcf.gz',
'omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.hg38.vcf.gz',
'1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.hg38.vcf.gz',
'dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp_138.hg38.vcf.gz'
]]
]
labels = [
'--resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.hg38.vcf.gz',
'--resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.hg38.vcf.gz',
'--resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.hg38.vcf.gz',
'--resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp_138.hg38.vcf.gz'
]
fasta = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true)
fai = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta_fai'], checkIfExists: true)
dict = file(params.test_data['homo_sapiens']['genome']['genome_21_dict'], checkIfExists: true)
GATK4_VARIANTRECALIBRATOR_NO_ALLELESPECIFICTY(input, resources, fasta, fai, dict)
GATK4_VARIANTRECALIBRATOR_NO_ALLELESPECIFICTY(input, resources_vcf, resources_tbi, labels, fasta, fai, dict)
}
workflow test_gatk4_variantrecalibrator_allele_specific {
@ -43,26 +45,28 @@ workflow test_gatk4_variantrecalibrator_allele_specific {
file(params.test_data['homo_sapiens']['illumina']['test2_haplotc_ann_vcf_gz_tbi'], checkIfExists: true)
]
resources = [[
resources_vcf = [
file(params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_omni2_5_hg38_21_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_phase1_snps_hg38_21_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'], checkIfExists: true)
], [
]
resources_tbi = [
file(params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz_tbi'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_omni2_5_hg38_21_vcf_gz_tbi'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_phase1_snps_hg38_21_vcf_gz_tbi'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz_tbi'], checkIfExists: true)
], [
'hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.hg38.vcf.gz',
'omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.hg38.vcf.gz',
'1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.hg38.vcf.gz',
'dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp_138.hg38.vcf.gz'
]]
]
labels = [
'--resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.hg38.vcf.gz',
'--resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.hg38.vcf.gz',
'--resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.hg38.vcf.gz',
'--resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp_138.hg38.vcf.gz'
]
fasta = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true)
fai = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta_fai'], checkIfExists: true)
dict = file(params.test_data['homo_sapiens']['genome']['genome_21_dict'], checkIfExists: true)
GATK4_VARIANTRECALIBRATOR_WITH_ALLELESPECIFICTY(input, resources, fasta, fai, dict)
GATK4_VARIANTRECALIBRATOR_WITH_ALLELESPECIFICTY(input, resources_vcf, resources_tbi, labels, fasta, fai, dict)
}

View file

@ -3,10 +3,10 @@ process {
publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
withName: GATK4_VARIANTRECALIBRATOR {
ext.args = '--mode SNP -an QD -an MQ -an FS -an SOR'
ext.args = '-mode SNP -an QD -an MQ -an FS -an SOR'
}
withName: GATK4_VARIANTRECALIBRATOR_WITH_ALLELESPECIFICTY {
ext.args = '--mode SNP -an QD -an MQ -an FS -AS'
ext.args = '-mode SNP -an QD -an MQ -an FS -AS'
}
}