Add gatk somatic paired calling subworkflow (#1067)

* initial commit to setup branch

* workflow finished

* Update nextflow.config

* tumour to tumor, getpileup passed as normal and tumor

* paired_somatic renamed to tumor_normal_somatic

* Apply suggestions from code review

Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>

* Update subworkflows/nf-core/gatk_tumor_normal_somatic_variant_calling/main.nf

Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>

* updated index names in meta.yml

* changed index file names in main script and test

* Apply suggestions from code review

Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>

* Apply suggestions from code review

* fixed bug from changes

* Apply suggestions from code review

* tests should now work after the yml update

* Update pytest_modules.yml

Co-authored-by: GCJMackenzie <gavin.mackenzie@nibsc.org>
Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>
Co-authored-by: Maxime U. Garcia <max.u.garcia@gmail.com>
This commit is contained in:
GCJMackenzie 2021-11-17 10:34:07 +00:00 committed by GitHub
parent 071b1d50a8
commit 5b975cc20d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 311 additions and 1 deletions

View file

@ -0,0 +1,109 @@
//
// Run GATK mutect2 in tumor normal mode, getpileupsummaries, calculatecontamination, learnreadorientationmodel and filtermutectcalls
//
// Default option maps, one per included module. Each map is handed to the matching
// `include ... addParams( options: ... )` statement below, so a caller can override
// any of them through params.
params.mutect2_options = [:]
params.learnorientation_options = [:]
// getpileupsummaries is included twice (tumor and normal); each alias gets its own suffix.
// NOTE(review): the suffix presumably ends up in the output file name so the two tables
// do not share a name — confirm against the gatk4/getpileupsummaries module.
params.getpileup_tumor_options = [suffix: '_tumor']
params.getpileup_normal_options = [suffix: '_normal']
params.calccontam_options = [:]
// NOTE(review): '_filtered' presumably keeps the filtered VCF from sharing a name with the
// mutect2 VCF it is derived from — confirm against the gatk4/filtermutectcalls module.
params.filtercalls_options = [suffix: '_filtered']
include { GATK4_MUTECT2 as MUTECT2 } from '../../../modules/gatk4/mutect2/main' addParams( options: params.mutect2_options )
include { GATK4_LEARNREADORIENTATIONMODEL as LEARNREADORIENTATIONMODEL } from '../../../modules/gatk4/learnreadorientationmodel/main' addParams( options: params.learnorientation_options )
include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES_TUMOR } from '../../../modules/gatk4/getpileupsummaries/main' addParams( options: params.getpileup_tumor_options )
include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES_NORMAL} from '../../../modules/gatk4/getpileupsummaries/main' addParams( options: params.getpileup_normal_options )
include { GATK4_CALCULATECONTAMINATION as CALCULATECONTAMINATION } from '../../../modules/gatk4/calculatecontamination/main' addParams( options: params.calccontam_options )
include { GATK4_FILTERMUTECTCALLS as FILTERMUTECTCALLS } from '../../../modules/gatk4/filtermutectcalls/main' addParams( options: params.filtercalls_options )
//
// Somatic variant calling on a tumor/normal pair:
//   MUTECT2 -> LEARNREADORIENTATIONMODEL
//   GETPILEUPSUMMARIES (tumor + normal) -> CALCULATECONTAMINATION
//   all of the above -> FILTERMUTECTCALLS
// Every intermediate output and the collected versions files are emitted.
//
workflow GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING {
    take:
    input                 // channel: [ val(meta), [ input ], [ input_index ], [which_norm] ]
    fasta                 // channel: /path/to/reference/fasta
    fai                   // channel: /path/to/reference/fasta/index
    dict                  // channel: /path/to/reference/fasta/dictionary
    germline_resource     // channel: /path/to/germline/resource
    germline_resource_tbi // channel: /path/to/germline/index
    panel_of_normals      // channel: /path/to/panel/of/normals
    panel_of_normals_tbi  // channel: /path/to/panel/of/normals/index
    interval_file         // channel: /path/to/interval/file

    main:
    ch_versions = Channel.empty()

    //
    // Perform variant calling using the mutect2 module in tumor normal mode.
    //
    mutect2_input = channel.from(input)
    // NOTE(review): the three `false` flags and the `[]` are presumably the module's
    // alternative-mode switches and an unused optional input — confirm against gatk4/mutect2.
    MUTECT2 ( mutect2_input, false, false, false, [], fasta, fai, dict, germline_resource, germline_resource_tbi, panel_of_normals, panel_of_normals_tbi )
    ch_versions = ch_versions.mix(MUTECT2.out.versions)

    //
    // Generate artifact priors using learnreadorientationmodel on the f1r2 output of mutect2.
    //
    ch_learnread_in = MUTECT2.out.f1r2.collect()
    LEARNREADORIENTATIONMODEL (ch_learnread_in)
    ch_versions = ch_versions.mix(LEARNREADORIENTATIONMODEL.out.versions)

    //
    // Generate pileup summary tables using getpileupsummaries. The tumor sample must always be the
    // first entry of the input and input_index lists and the normal sample the second, to ensure
    // the correct file order for calculatecontamination.
    //
    pileup_tumor_input = channel.from(input).map {
        meta, input_file, input_index, which_norm ->
        [meta, input_file[0], input_index[0]]
    }

    pileup_normal_input = channel.from(input).map {
        meta, input_file, input_index, which_norm ->
        [meta, input_file[1], input_index[1]]
    }
    GETPILEUPSUMMARIES_TUMOR ( pileup_tumor_input, germline_resource, germline_resource_tbi, interval_file )
    GETPILEUPSUMMARIES_NORMAL ( pileup_normal_input, germline_resource, germline_resource_tbi, interval_file )
    // Record versions from both invocations, as is done for every other step.
    ch_versions = ch_versions.mix(GETPILEUPSUMMARIES_TUMOR.out.versions)
    ch_versions = ch_versions.mix(GETPILEUPSUMMARIES_NORMAL.out.versions)

    //
    // Contamination and segmentation tables created using calculatecontamination on the pileup summary tables.
    //
    ch_pileup_tumor = GETPILEUPSUMMARIES_TUMOR.out.table.collect()
    ch_pileup_normal = GETPILEUPSUMMARIES_NORMAL.out.table.collect()
    // Pair the tumor and normal tables on their first element (the meta map); tumor comes first.
    ch_calccon_in = ch_pileup_tumor.combine(ch_pileup_normal, by: 0)
    // NOTE(review): `true` presumably asks the module to also write the segmentation table — confirm.
    CALCULATECONTAMINATION ( ch_calccon_in, true )
    ch_versions = ch_versions.mix(CALCULATECONTAMINATION.out.versions)

    //
    // Mutect2 calls filtered by filtermutectcalls using the artifact priors, contamination and segmentation tables.
    //
    ch_vcf = MUTECT2.out.vcf.collect()
    ch_tbi = MUTECT2.out.tbi.collect()
    ch_stats = MUTECT2.out.stats.collect()
    ch_orientation = LEARNREADORIENTATIONMODEL.out.artifactprior.collect()
    ch_segment = CALCULATECONTAMINATION.out.segmentation.collect()
    ch_contamination = CALCULATECONTAMINATION.out.contamination.collect()
    //[] is used as a placeholder for optional input to specify the contamination estimate as a value, since the contamination table is used, this is not needed.
    // NOTE(review): calling add() on a channel is unusual; verify that this really appends the
    // placeholder to the tuple that reaches FILTERMUTECTCALLS.
    ch_contamination.add([])
    // Stitch everything together on the meta map (first element of each tuple).
    ch_filtermutect_in = ch_vcf.combine(ch_tbi, by: 0).combine(ch_stats, by: 0).combine(ch_orientation, by: 0).combine(ch_segment, by: 0).combine(ch_contamination, by: 0)
    FILTERMUTECTCALLS ( ch_filtermutect_in, fasta, fai, dict )
    ch_versions = ch_versions.mix(FILTERMUTECTCALLS.out.versions)

    emit:
    mutect2_vcf         = MUTECT2.out.vcf.collect()                             // channel: [ val(meta), [ vcf ] ]
    mutect2_tbi         = MUTECT2.out.tbi.collect()                             // channel: [ val(meta), [ tbi ] ]
    mutect2_stats       = MUTECT2.out.stats.collect()                           // channel: [ val(meta), [ stats ] ]
    mutect2_f1r2        = MUTECT2.out.f1r2.collect()                            // channel: [ val(meta), [ f1r2 ] ]

    artifact_priors     = LEARNREADORIENTATIONMODEL.out.artifactprior.collect() // channel: [ val(meta), [ artifactprior ] ]

    pileup_table_tumor  = GETPILEUPSUMMARIES_TUMOR.out.table.collect()          // channel: [ val(meta), [ table_tumor ] ]
    pileup_table_normal = GETPILEUPSUMMARIES_NORMAL.out.table.collect()         // channel: [ val(meta), [ table_normal ] ]

    contamination_table = CALCULATECONTAMINATION.out.contamination.collect()    // channel: [ val(meta), [ contamination ] ]
    segmentation_table  = CALCULATECONTAMINATION.out.segmentation.collect()     // channel: [ val(meta), [ segmentation ] ]

    filtered_vcf        = FILTERMUTECTCALLS.out.vcf.collect()                   // channel: [ val(meta), [ vcf ] ]
    filtered_tbi        = FILTERMUTECTCALLS.out.tbi.collect()                   // channel: [ val(meta), [ tbi ] ]
    filtered_stats      = FILTERMUTECTCALLS.out.stats.collect()                 // channel: [ val(meta), [ stats ] ]

    versions            = ch_versions                                           // channel: [ versions.yml ]
}

View file

@ -0,0 +1,127 @@
name: gatk_tumor_normal_somatic_variant_calling
description: |
Perform variant calling on a paired tumor normal set of samples using mutect2 tumor normal mode.
f1r2 output of mutect2 is run through learnreadorientationmodel to get the artifact priors.
Run the input bam files through getpileupsummaries and then calculatecontamination to get the contamination and segmentation tables.
Filter the mutect2 output vcf using filtermutectcalls, artifact priors and the contamination & segmentation tables for additional filtering.
keywords:
- gatk4
- mutect2
- learnreadorientationmodel
- getpileupsummaries
- calculatecontamination
- filtermutectcalls
- variant_calling
- tumor_normal
- filtered_vcf
modules:
- gatk4/mutect2
- gatk4/learnreadorientationmodel
- gatk4/getpileupsummaries
- gatk4/calculatecontamination
- gatk4/filtermutectcalls
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test' ]
- input:
type: list
description: list containing the tumor and normal BAM files, in that order, also able to take CRAM as an input
pattern: "[ *.{bam,cram} ]"
- input_index:
type: list
description: list containing the tumor and normal BAM file indexes, in that order, also able to take CRAM index as an input
pattern: "[ *.{bam.bai,cram.crai} ]"
- which_norm:
type: list
description: optional list of sample headers contained in the normal sample input file.
pattern: "testN"
- fasta:
type: file
description: The reference fasta file
pattern: "*.fasta"
- fai:
type: file
description: Index of reference fasta file
pattern: "*.fasta.fai"
- dict:
type: file
description: GATK sequence dictionary
pattern: "*.dict"
- germline_resource:
type: file
description: Population vcf of germline sequencing, containing allele fractions.
pattern: "*.vcf.gz"
- germline_resource_tbi:
type: file
description: Index file for the germline resource.
pattern: "*.vcf.gz.tbi"
- panel_of_normals:
type: file
description: vcf file to be used as a panel of normals.
pattern: "*.vcf.gz"
- panel_of_normals_tbi:
type: file
description: Index for the panel of normals.
pattern: "*.vcf.gz.tbi"
- interval_file:
type: file
description: File containing intervals.
pattern: "*.interval_list"
output:
- versions:
type: file
description: File containing software versions
pattern: 'versions.yml'
- mutect2_vcf:
type: file
description: Compressed vcf file to be used for variant_calling.
pattern: "[ *.vcf.gz ]"
- mutect2_tbi:
type: file
description: Indexes of the mutect2_vcf file
pattern: "[ *vcf.gz.tbi ]"
- mutect2_stats:
type: file
description: Stats files for the mutect2 vcf
pattern: "[ *vcf.gz.stats ]"
- mutect2_f1r2:
type: file
description: file containing information to be passed to LearnReadOrientationModel.
pattern: "*.f1r2.tar.gz"
- artifact_priors:
type: file
description: file containing artifact-priors to be used by filtermutectcalls.
pattern: "*.tar.gz"
- pileup_table_tumor:
type: file
description: File containing the tumor pileup summary table, kept separate as calculatecontamination needs them individually specified.
pattern: "*_tumor.pileups.table"
- pileup_table_normal:
type: file
description: File containing the normal pileup summary table, kept separate as calculatecontamination needs them individually specified.
pattern: "*_normal.pileups.table"
- contamination_table:
type: file
description: File containing the contamination table.
pattern: "*.contamination.table"
- segmentation_table:
type: file
description: Output table containing segmentation of tumor minor allele fractions.
pattern: "*.segmentation.table"
- filtered_vcf:
type: file
description: file containing filtered mutect2 calls.
pattern: "*.vcf.gz"
- filtered_tbi:
type: file
description: tbi file that pairs with filtered vcf.
pattern: "*.vcf.gz.tbi"
- filtered_stats:
type: file
description: file containing statistics of the filtermutectcalls run.
pattern: "*.filteringStats.tsv"
authors:
- '@GCJMackenzie'

View file

@ -0,0 +1,6 @@
// Default module options for the gatk_tumor_normal_somatic_variant_calling subworkflow.
// The suffix values mirror the defaults declared in the subworkflow's main.nf: the two
// getpileupsummaries runs and the filtered VCF rely on them to get distinct output names
// (e.g. test_tumor.pileups.table, test_normal.pileups.table, test_filtered.vcf.gz).
params.mutect2_options          = [:]
params.learnorientation_options = [:]
params.getpileup_tumor_options  = [suffix: '_tumor']
params.getpileup_normal_options = [suffix: '_normal']
params.calccontam_options       = [:]
params.filtercalls_options      = [suffix: '_filtered']

View file

@ -522,7 +522,7 @@ gatk4/intervallisttools:
- modules/gatk4/intervallisttools/** - modules/gatk4/intervallisttools/**
- tests/modules/gatk4/intervallisttools/** - tests/modules/gatk4/intervallisttools/**
gatk4/learnreadorientationmodel: gatk4/learnreadorientationmodel: &gatk4/learnreadorientationmodel
- modules/gatk4/learnreadorientationmodel/** - modules/gatk4/learnreadorientationmodel/**
- tests/modules/gatk4/learnreadorientationmodel/** - tests/modules/gatk4/learnreadorientationmodel/**
@ -1427,6 +1427,15 @@ subworkflows/gatk_create_som_pon:
- *gatk4/genomicsdbimport - *gatk4/genomicsdbimport
- *gatk4/createsomaticpanelofnormals - *gatk4/createsomaticpanelofnormals
subworkflows/gatk_tumor_normal_somatic_variant_calling:
- subworkflows/nf-core/gatk_tumor_normal_somatic_variant_calling/**
- tests/subworkflows/nf-core/gatk_tumor_normal_somatic_variant_calling/**
- *gatk4/mutect2
- *gatk4/learnreadorientationmodel
- *gatk4/getpileupsummaries
- *gatk4/calculatecontamination
- *gatk4/filtermutectcalls
subworkflows/gatk_tumor_only_somatic_variant_calling: subworkflows/gatk_tumor_only_somatic_variant_calling:
- subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling/** - subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling/**
- tests/subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling/** - tests/subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling/**

View file

@ -0,0 +1,25 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING } from '../../../../subworkflows/nf-core/gatk_tumor_normal_somatic_variant_calling/main' addParams( [:] )
//
// Smoke test: run the tumor/normal subworkflow on one sample pair from the shared test data.
//
workflow test_gatk_tumor_normal_somatic_variant_calling {
    // Short handles into the test-data map so the lookups below stay readable.
    def illumina = params.test_data['homo_sapiens']['illumina']
    def genome   = params.test_data['homo_sapiens']['genome']

    // Order matters: the subworkflow sends element 0 to the tumor pileup and element 1 to the normal pileup.
    def alignments = [
        file(illumina['test2_paired_end_recalibrated_sorted_bam'], checkIfExists: true),
        file(illumina['test_paired_end_recalibrated_sorted_bam'], checkIfExists: true)
    ]
    def alignment_indexes = [
        file(illumina['test2_paired_end_recalibrated_sorted_bam_bai'], checkIfExists: true),
        file(illumina['test_paired_end_recalibrated_sorted_bam_bai'], checkIfExists: true)
    ]

    // One entry: [ meta map, [ tumor, normal ], [ tumor index, normal index ], [ which_norm ] ]
    input = [
        [ [ id:'test' ], alignments, alignment_indexes, ["testN"] ]
    ]

    // Reference files
    fasta = file(genome['genome_fasta'], checkIfExists: true)
    fai   = file(genome['genome_fasta_fai'], checkIfExists: true)
    dict  = file(genome['genome_dict'], checkIfExists: true)

    // Resource VCFs with their indexes, and the interval list
    germline_resource     = file(genome['gnomad_r2_1_1_vcf_gz'], checkIfExists: true)
    germline_resource_tbi = file(genome['gnomad_r2_1_1_vcf_gz_tbi'], checkIfExists: true)
    panel_of_normals      = file(genome['mills_and_1000g_indels_vcf_gz'], checkIfExists: true)
    panel_of_normals_tbi  = file(genome['mills_and_1000g_indels_vcf_gz_tbi'], checkIfExists: true)
    interval_file         = file(genome['genome_interval_list'], checkIfExists: true)

    GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING (
        input,
        fasta,
        fai,
        dict,
        germline_resource,
        germline_resource_tbi,
        panel_of_normals,
        panel_of_normals_tbi,
        interval_file
    )
}

View file

@ -0,0 +1,34 @@
- name: gatk_tumor_normal_somatic_variant_calling
command: nextflow run ./tests/subworkflows/nf-core/gatk_tumor_normal_somatic_variant_calling -entry test_gatk_tumor_normal_somatic_variant_calling -c tests/config/nextflow.config
tags:
- subworkflows/gatk_tumor_normal_somatic_variant_calling
# Modules
# - gatk4/mutect2
# - gatk4/learnreadorientationmodel
# - gatk4/getpileupsummaries
# - gatk4/calculatecontamination
# - gatk4/filtermutectcalls
files:
# gatk4 mutect2
- path: ./output/mutect2/test.vcf.gz
- path: ./output/mutect2/test.vcf.gz.stats
md5sum: 6ecb874e6a95aa48233587b876c2a7a9
- path: ./output/mutect2/test.vcf.gz.tbi
- path: ./output/mutect2/test.f1r2.tar.gz
# gatk4 learnreadorientationmodel
- path: ./output/learnreadorientationmodel/test.tar.gz
# gatk4 getpileupsummaries
- path: ./output/getpileupsummaries/test_tumor.pileups.table
md5sum: 8b1b4c8ab831eca50ee9e940463a741f
- path: ./output/getpileupsummaries/test_normal.pileups.table
md5sum: 0d19674bef2ff0700d5b02b3463dd210
# gatk4 calculatecontamination
- path: ./output/calculatecontamination/test.contamination.table
md5sum: 5fdcf1728cf98985ce31c038eb24e05c
- path: ./output/calculatecontamination/test.segmentation.table
md5sum: 91f28bfe4727a3256810927fc5eba92f
# gatk4 filtermutectcalls
- path: ./output/filtermutectcalls/test_filtered.vcf.gz
- path: ./output/filtermutectcalls/test_filtered.vcf.gz.filteringStats.tsv
md5sum: 98e1b87a52999eb8f429ef4a7877eb3f
- path: ./output/filtermutectcalls/test_filtered.vcf.gz.tbi