From 54e0ac4ed95da110caccd4f31017ee449a5ea9ba Mon Sep 17 00:00:00 2001 From: GCJMackenzie <43276267+GCJMackenzie@users.noreply.github.com> Date: Thu, 16 Dec 2021 10:54:49 +0000 Subject: [PATCH] add module: Variantrecalibrator (#1088) * committing to pull updated nf-core files * saving changes to checout other branch * committing progress so far, difficulty with test data * uploading to be used as draft PR * fix linting error in meta.yml * attempt to group reference inputs together * updated input format for resources * meta.yml updated with new resource names * added output channel for recal index * module only takes single vcf file input now * committing to checkout * update to new syntax, remove indel test for now * updated to use memory options and new test data * Update modules/gatk4/variantrecalibrator/main.nf Co-authored-by: FriederikeHanssen * Update main.nf * Update modules/gatk4/variantrecalibrator/main.nf Co-authored-by: FriederikeHanssen * remove duplicate test keys from test_data.config Co-authored-by: GCJMackenzie Co-authored-by: FriederikeHanssen --- modules/gatk4/variantrecalibrator/main.nf | 62 ++++++++++++ modules/gatk4/variantrecalibrator/meta.yml | 98 +++++++++++++++++++ tests/config/pytest_modules.yml | 4 + .../modules/gatk4/variantrecalibrator/main.nf | 81 +++++++++++++++ .../gatk4/variantrecalibrator/nextflow.config | 5 + .../gatk4/variantrecalibrator/test.yml | 25 +++++ 6 files changed, 275 insertions(+) create mode 100644 modules/gatk4/variantrecalibrator/main.nf create mode 100644 modules/gatk4/variantrecalibrator/meta.yml create mode 100644 tests/modules/gatk4/variantrecalibrator/main.nf create mode 100644 tests/modules/gatk4/variantrecalibrator/nextflow.config create mode 100644 tests/modules/gatk4/variantrecalibrator/test.yml diff --git a/modules/gatk4/variantrecalibrator/main.nf b/modules/gatk4/variantrecalibrator/main.nf new file mode 100644 index 00000000..5641d6de --- /dev/null +++ b/modules/gatk4/variantrecalibrator/main.nf @@ -0,0 +1,62 @@ +process GATK4_VARIANTRECALIBRATOR { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::gatk4=4.2.3.0" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.2.3.0--hdfd78af_0' : + 'quay.io/biocontainers/gatk4:4.2.3.0--hdfd78af_0' }" + + input: + tuple val(meta), path(vcf) , path(tbi) + path fasta + path fai + path dict + val allelespecific + tuple path(resvcfs), path(restbis), val(reslabels) + val annotation + val mode + val create_rscript + + output: + tuple val(meta), path("*.recal") , emit: recal + tuple val(meta), path("*.idx") , emit: idx + tuple val(meta), path("*.tranches"), emit: tranches + tuple val(meta), path("*plots.R") , emit: plots, optional:true + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + refCommand = fasta ? "-R ${fasta} " : '' + alleleSpecificCommand = allelespecific ? '-AS' : '' + resourceCommand = '--resource:' + reslabels.join( ' --resource:') + annotationCommand = '-an ' + annotation.join( ' -an ') + modeCommand = mode ? "--mode ${mode} " : 'SNP' + rscriptCommand = create_rscript ? "--rscript-file ${prefix}.plots.R" : '' + + def avail_mem = 3 + if (!task.memory) { + log.info '[GATK VariantRecalibrator] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = task.memory.giga + } + """ + gatk --java-options "-Xmx${avail_mem}g" VariantRecalibrator \\ + ${refCommand} \\ + -V ${vcf} \\ + ${alleleSpecificCommand} \\ + ${resourceCommand} \\ + ${annotationCommand} \\ + ${modeCommand} \\ + -O ${prefix}.recal \\ + --tranches-file ${prefix}.tranches \\ + ${rscriptCommand}\\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/gatk4/variantrecalibrator/meta.yml b/modules/gatk4/variantrecalibrator/meta.yml new file mode 100644 index 00000000..92416a58 --- /dev/null +++ b/modules/gatk4/variantrecalibrator/meta.yml @@ -0,0 +1,98 @@ +name: gatk4_variantrecalibrator +description: | + Build a recalibration model to score variant quality for filtering purposes. + It is highly recommended to follow GATK best practices when using this module, + the gaussian mixture model requires a large number of samples to be used for the + tool to produce optimal results. For example, 30 samples for exome data. For more details see + https://gatk.broadinstitute.org/hc/en-us/articles/4402736812443-Which-training-sets-arguments-should-I-use-for-running-VQSR- +keywords: + - VariantRecalibrator + - gatk4 + - recalibration_model +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - vcf: + type: file + description: input vcf file containing the variants to be recalibrated + pattern: "*.vcf.gz" + - tbi: + type: file + description: tbi file matching with -vcf + pattern: "*.vcf.gz.tbi" + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + - allelespecific: + type: boolean + description: specify whether to use allele specific annotations + pattern: "{true,false}" + - resvcfs: + type: list + description: resource files to be used as truth, training and known sites resources, this imports the files into the module, file names are specified again in the resource_labels to be called via the command. + pattern: '*/hapmap_3.3.hg38_chr21.vcf.gz' + - restbis: + type: list + description: tbis for the corresponding vcfs files to be used as truth, training and known resources. + pattern: '*/hapmap_3.3.hg38_chr21.vcf.gz.tbi' + - reslabels: + type: list + description: labels for the resource files to be used as truth, training and known sites resources, label should include an identifier,which kind of resource(s) it is, prior value and name of the file. + pattern: "hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.hg38_chr21.vcf.gz" + - annotation: + type: list + description: specify which annotations should be used for calculations. + pattern: "['QD', 'MQ', 'FS', 'SOR']" + - mode: + type: string + description: specifies which recalibration mode to employ (SNP is default, BOTH is intended for testing only) + pattern: "{SNP,INDEL,BOTH}" + - rscript: + type: boolean + description: specify whether to generate rscript.plot output file + pattern: "{true,false}" +output: + - recal: + type: file + description: Output recal file used by ApplyVQSR + pattern: "*.recal" + - idx: + type: file + description: Index file for the recal output file + pattern: "*.idx" + - tranches: + type: file + description: Output tranches file used by ApplyVQSR + pattern: "*.tranches" + - plots: + type: file + description: Optional output rscript file to aid in visualization of the input data and learned model. + pattern: "*plots.R" + - version: + type: file + description: File containing software versions + pattern: "*.versions.yml" +authors: + - "@GCJMackenzie" diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml index c261a481..7e3d8f82 100644 --- a/tests/config/pytest_modules.yml +++ b/tests/config/pytest_modules.yml @@ -604,6 +604,10 @@ gatk4/variantfiltration: - modules/gatk4/variantfiltration/** - tests/modules/gatk4/variantfiltration/** +gatk4/variantrecalibrator: + - modules/gatk4/variantrecalibrator/** + - tests/modules/gatk4/variantrecalibrator/** + genmap/index: - modules/genmap/index/** - tests/modules/genmap/index/** diff --git a/tests/modules/gatk4/variantrecalibrator/main.nf b/tests/modules/gatk4/variantrecalibrator/main.nf new file mode 100644 index 00000000..bbc1dff5 --- /dev/null +++ b/tests/modules/gatk4/variantrecalibrator/main.nf @@ -0,0 +1,81 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +include { GATK4_VARIANTRECALIBRATOR } from '../../../../modules/gatk4/variantrecalibrator/main.nf' + +workflow test_gatk4_variantrecalibrator { + + input = [ [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test2_haplotc_ann_vcf_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test2_haplotc_ann_vcf_gz_tbi'], checkIfExists: true) + ] + + fasta = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true) + fai = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta_fai'], checkIfExists: true) + dict = file(params.test_data['homo_sapiens']['genome']['genome_21_dict'], checkIfExists: true) + allelespecific = false + resources = [ + [ + file(params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['genome']['res_1000g_omni2_5_hg38_21_vcf_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['genome']['res_1000g_phase1_snps_hg38_21_vcf_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'], checkIfExists: true) + ], + [ + file(params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz_tbi'], checkIfExists: true), + file(params.test_data['homo_sapiens']['genome']['res_1000g_omni2_5_hg38_21_vcf_gz_tbi'], checkIfExists: true), + file(params.test_data['homo_sapiens']['genome']['res_1000g_phase1_snps_hg38_21_vcf_gz_tbi'], checkIfExists: true), + file(params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz_tbi'], checkIfExists: true) + ], + [ + 'hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.hg38.vcf.gz', + 'omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.hg38.vcf.gz', + '1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.hg38.vcf.gz', + 'dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp_138.hg38.vcf.gz' + ] + ] + annotation = ['QD', 'MQ', 'FS', 'SOR'] + mode = 'SNP' + create_rscript = false + + GATK4_VARIANTRECALIBRATOR ( input, fasta, fai, dict, allelespecific, resources, annotation, mode, create_rscript) +} + +workflow test_gatk4_variantrecalibrator_allele_specific { + + input = [ [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test2_haplotc_ann_vcf_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test2_haplotc_ann_vcf_gz_tbi'], checkIfExists: true) + ] + + fasta = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true) + fai = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta_fai'], checkIfExists: true) + dict = file(params.test_data['homo_sapiens']['genome']['genome_21_dict'], checkIfExists: true) + allelespecific = true + resources = [ + [ + file(params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['genome']['res_1000g_omni2_5_hg38_21_vcf_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['genome']['res_1000g_phase1_snps_hg38_21_vcf_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'], checkIfExists: true) + ], + [ + file(params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz_tbi'], checkIfExists: true), + file(params.test_data['homo_sapiens']['genome']['res_1000g_omni2_5_hg38_21_vcf_gz_tbi'], checkIfExists: true), + file(params.test_data['homo_sapiens']['genome']['res_1000g_phase1_snps_hg38_21_vcf_gz_tbi'], checkIfExists: true), + file(params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz_tbi'], checkIfExists: true) + ], + [ + 'hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.hg38.vcf.gz', + 'omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.hg38.vcf.gz', + '1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.hg38.vcf.gz', + 'dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp_138.hg38.vcf.gz' + ] + ] + annotation = ['QD', 'MQ', 'FS'] + mode = 'SNP' + create_rscript = false + + GATK4_VARIANTRECALIBRATOR ( input, fasta, fai, dict, allelespecific, resources, annotation, mode, create_rscript) +} diff --git a/tests/modules/gatk4/variantrecalibrator/nextflow.config b/tests/modules/gatk4/variantrecalibrator/nextflow.config new file mode 100644 index 00000000..19934e76 --- /dev/null +++ b/tests/modules/gatk4/variantrecalibrator/nextflow.config @@ -0,0 +1,5 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + +} \ No newline at end of file diff --git a/tests/modules/gatk4/variantrecalibrator/test.yml b/tests/modules/gatk4/variantrecalibrator/test.yml new file mode 100644 index 00000000..42b18e36 --- /dev/null +++ b/tests/modules/gatk4/variantrecalibrator/test.yml @@ -0,0 +1,25 @@ +- name: gatk4 variantrecalibrator test_gatk4_variantrecalibrator + command: nextflow run tests/modules/gatk4/variantrecalibrator -entry test_gatk4_variantrecalibrator -c tests/config/nextflow.config -c ./tests/modules/gatk4/variantrecalibrator/nextflow.config + tags: + - gatk4 + - gatk4/variantrecalibrator + files: + - path: output/gatk4/test.recal + contains: + - "#CHROM POS ID REF ALT QUAL FILTER INFO" + - path: output/gatk4/test.recal.idx + - path: output/gatk4/test.tranches + md5sum: d238e97bf996863969dac7751e345549 + +- name: gatk4 variantrecalibrator test_gatk4_variantrecalibrator_allele_specific + command: nextflow run tests/modules/gatk4/variantrecalibrator -entry test_gatk4_variantrecalibrator_allele_specific -c tests/config/nextflow.config -c ./tests/modules/gatk4/variantrecalibrator/nextflow.config + tags: + - gatk4 + - gatk4/variantrecalibrator + files: + - path: output/gatk4/test.recal + contains: + - "#CHROM POS ID REF ALT QUAL FILTER INFO" + - path: output/gatk4/test.recal.idx + - path: output/gatk4/test.tranches + md5sum: 444438d46716593634a6817958099292