add module: Variantrecalibrator (#1088)

* committing to pull updated nf-core files

* saving changes to checout other branch

* committing progress so far, difficulty with test data

* uploading to be used as draft PR

* fix linting error in meta.yml

* attempt to group reference inputs together

* updated input format for resources

* meta.yml updated with new resource names

* added output channel for recal index

* module only takes single vcf file input now

* committing to checkout

* update to new syntax, remove indel test for now

* updated to use memory options and new test data

* Update modules/gatk4/variantrecalibrator/main.nf

Co-authored-by: FriederikeHanssen <Friederike.hanssen@qbic.uni-tuebingen.de>

* Update main.nf

* Update modules/gatk4/variantrecalibrator/main.nf

Co-authored-by: FriederikeHanssen <Friederike.hanssen@qbic.uni-tuebingen.de>

* remove duplicate test keys from test_data.config

Co-authored-by: GCJMackenzie <gavin.mackenzie@nibsc.org>
Co-authored-by: FriederikeHanssen <Friederike.hanssen@qbic.uni-tuebingen.de>
This commit is contained in:
GCJMackenzie 2021-12-16 10:54:49 +00:00 committed by GitHub
parent 9f8d9fb615
commit 54e0ac4ed9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 275 additions and 0 deletions

View file

@ -0,0 +1,62 @@
process GATK4_VARIANTRECALIBRATOR {
tag "$meta.id"
label 'process_low'
conda (params.enable_conda ? "bioconda::gatk4=4.2.3.0" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/gatk4:4.2.3.0--hdfd78af_0' :
'quay.io/biocontainers/gatk4:4.2.3.0--hdfd78af_0' }"
input:
tuple val(meta), path(vcf) , path(tbi)
path fasta
path fai
path dict
val allelespecific
tuple path(resvcfs), path(restbis), val(reslabels)
val annotation
val mode
val create_rscript
output:
tuple val(meta), path("*.recal") , emit: recal
tuple val(meta), path("*.idx") , emit: idx
tuple val(meta), path("*.tranches"), emit: tranches
tuple val(meta), path("*plots.R") , emit: plots, optional:true
path "versions.yml" , emit: versions
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
refCommand = fasta ? "-R ${fasta} " : ''
alleleSpecificCommand = allelespecific ? '-AS' : ''
resourceCommand = '--resource:' + reslabels.join( ' --resource:')
annotationCommand = '-an ' + annotation.join( ' -an ')
modeCommand = mode ? "--mode ${mode} " : 'SNP'
rscriptCommand = create_rscript ? "--rscript-file ${prefix}.plots.R" : ''
def avail_mem = 3
if (!task.memory) {
log.info '[GATK VariantRecalibrator] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
} else {
avail_mem = task.memory.giga
}
"""
gatk --java-options "-Xmx${avail_mem}g" VariantRecalibrator \\
${refCommand} \\
-V ${vcf} \\
${alleleSpecificCommand} \\
${resourceCommand} \\
${annotationCommand} \\
${modeCommand} \\
-O ${prefix}.recal \\
--tranches-file ${prefix}.tranches \\
${rscriptCommand}\\
$args
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
END_VERSIONS
"""
}

View file

@ -0,0 +1,98 @@
name: gatk4_variantrecalibrator
description: |
Build a recalibration model to score variant quality for filtering purposes.
It is highly recommended to follow GATK best practices when using this module,
the gaussian mixture model requires a large number of samples to be used for the
tool to produce optimal results. For example, 30 samples for exome data. For more details see
https://gatk.broadinstitute.org/hc/en-us/articles/4402736812443-Which-training-sets-arguments-should-I-use-for-running-VQSR-
keywords:
- VariantRecalibrator
- gatk4
- recalibration_model
tools:
- gatk4:
description: |
Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
with a primary focus on variant discovery and genotyping. Its powerful processing engine
and high-performance computing features make it capable of taking on projects of any size.
homepage: https://gatk.broadinstitute.org/hc/en-us
documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s
doi: 10.1158/1538-7445.AM2017-3590
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test' ]
- vcf:
type: file
description: input vcf file containing the variants to be recalibrated
pattern: "*.vcf.gz"
- tbi:
type: file
description: tbi file matching with -vcf
pattern: "*.vcf.gz.tbi"
- fasta:
type: file
description: The reference fasta file
pattern: "*.fasta"
- fai:
type: file
description: Index of reference fasta file
pattern: "fasta.fai"
- dict:
type: file
description: GATK sequence dictionary
pattern: "*.dict"
- allelespecific:
type: boolean
description: specify whether to use allele specific annotations
pattern: "{true,false}"
- resvcfs:
type: list
description: resource files to be used as truth, training and known sites resources, this imports the files into the module, file names are specified again in the resource_labels to be called via the command.
pattern: '*/hapmap_3.3.hg38_chr21.vcf.gz'
- restbis:
type: list
description: tbis for the corresponding vcfs files to be used as truth, training and known resources.
pattern: '*/hapmap_3.3.hg38_chr21.vcf.gz.tbi'
- reslabels:
type: list
description: labels for the resource files to be used as truth, training and known sites resources, label should include an identifier,which kind of resource(s) it is, prior value and name of the file.
pattern: "hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.hg38_chr21.vcf.gz"
- annotation:
type: list
description: specify which annotations should be used for calculations.
pattern: "['QD', 'MQ', 'FS', 'SOR']"
- mode:
type: string
description: specifies which recalibration mode to employ (SNP is default, BOTH is intended for testing only)
pattern: "{SNP,INDEL,BOTH}"
- rscript:
type: boolean
description: specify whether to generate rscript.plot output file
pattern: "{true,false}"
output:
- recal:
type: file
description: Output recal file used by ApplyVQSR
pattern: "*.recal"
- idx:
type: file
description: Index file for the recal output file
pattern: "*.idx"
- tranches:
type: file
description: Output tranches file used by ApplyVQSR
pattern: "*.tranches"
- plots:
type: file
description: Optional output rscript file to aid in visualization of the input data and learned model.
pattern: "*plots.R"
- version:
type: file
description: File containing software versions
pattern: "*.versions.yml"
authors:
- "@GCJMackenzie"

View file

@ -604,6 +604,10 @@ gatk4/variantfiltration:
- modules/gatk4/variantfiltration/**
- tests/modules/gatk4/variantfiltration/**
gatk4/variantrecalibrator:
- modules/gatk4/variantrecalibrator/**
- tests/modules/gatk4/variantrecalibrator/**
genmap/index:
- modules/genmap/index/**
- tests/modules/genmap/index/**

View file

@ -0,0 +1,81 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { GATK4_VARIANTRECALIBRATOR } from '../../../../modules/gatk4/variantrecalibrator/main.nf'
workflow test_gatk4_variantrecalibrator {
input = [ [ id:'test' ], // meta map
file(params.test_data['homo_sapiens']['illumina']['test2_haplotc_ann_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['illumina']['test2_haplotc_ann_vcf_gz_tbi'], checkIfExists: true)
]
fasta = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true)
fai = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta_fai'], checkIfExists: true)
dict = file(params.test_data['homo_sapiens']['genome']['genome_21_dict'], checkIfExists: true)
allelespecific = false
resources = [
[
file(params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_omni2_5_hg38_21_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_phase1_snps_hg38_21_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'], checkIfExists: true)
],
[
file(params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz_tbi'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_omni2_5_hg38_21_vcf_gz_tbi'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_phase1_snps_hg38_21_vcf_gz_tbi'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz_tbi'], checkIfExists: true)
],
[
'hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.hg38.vcf.gz',
'omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.hg38.vcf.gz',
'1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.hg38.vcf.gz',
'dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp_138.hg38.vcf.gz'
]
]
annotation = ['QD', 'MQ', 'FS', 'SOR']
mode = 'SNP'
create_rscript = false
GATK4_VARIANTRECALIBRATOR ( input, fasta, fai, dict, allelespecific, resources, annotation, mode, create_rscript)
}
workflow test_gatk4_variantrecalibrator_allele_specific {
input = [ [ id:'test' ], // meta map
file(params.test_data['homo_sapiens']['illumina']['test2_haplotc_ann_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['illumina']['test2_haplotc_ann_vcf_gz_tbi'], checkIfExists: true)
]
fasta = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true)
fai = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta_fai'], checkIfExists: true)
dict = file(params.test_data['homo_sapiens']['genome']['genome_21_dict'], checkIfExists: true)
allelespecific = true
resources = [
[
file(params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_omni2_5_hg38_21_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_phase1_snps_hg38_21_vcf_gz'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'], checkIfExists: true)
],
[
file(params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz_tbi'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_omni2_5_hg38_21_vcf_gz_tbi'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['res_1000g_phase1_snps_hg38_21_vcf_gz_tbi'], checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz_tbi'], checkIfExists: true)
],
[
'hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.hg38.vcf.gz',
'omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.hg38.vcf.gz',
'1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.hg38.vcf.gz',
'dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp_138.hg38.vcf.gz'
]
]
annotation = ['QD', 'MQ', 'FS']
mode = 'SNP'
create_rscript = false
GATK4_VARIANTRECALIBRATOR ( input, fasta, fai, dict, allelespecific, resources, annotation, mode, create_rscript)
}

View file

@ -0,0 +1,5 @@
process {
publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
}

View file

@ -0,0 +1,25 @@
- name: gatk4 variantrecalibrator test_gatk4_variantrecalibrator
command: nextflow run tests/modules/gatk4/variantrecalibrator -entry test_gatk4_variantrecalibrator -c tests/config/nextflow.config -c ./tests/modules/gatk4/variantrecalibrator/nextflow.config
tags:
- gatk4
- gatk4/variantrecalibrator
files:
- path: output/gatk4/test.recal
contains:
- "#CHROM POS ID REF ALT QUAL FILTER INFO"
- path: output/gatk4/test.recal.idx
- path: output/gatk4/test.tranches
md5sum: d238e97bf996863969dac7751e345549
- name: gatk4 variantrecalibrator test_gatk4_variantrecalibrator_allele_specific
command: nextflow run tests/modules/gatk4/variantrecalibrator -entry test_gatk4_variantrecalibrator_allele_specific -c tests/config/nextflow.config -c ./tests/modules/gatk4/variantrecalibrator/nextflow.config
tags:
- gatk4
- gatk4/variantrecalibrator
files:
- path: output/gatk4/test.recal
contains:
- "#CHROM POS ID REF ALT QUAL FILTER INFO"
- path: output/gatk4/test.recal.idx
- path: output/gatk4/test.tranches
md5sum: 444438d46716593634a6817958099292