Add gatk somatic tumour calling subworkflow (#1064)

* initial commit to set up new branch

* save changes to checkout

* workflow working, still needs test.yml and meta.yml, also fix versions file

* subworkflow finished

* Update pytest_subworkflows.yml

* Update pytest_subworkflows.yml

* Update pytest_subworkflows.yml

* fix config subworkflow name

* Update main.nf

* Update pytest_subworkflows.yml

* fixed md5sum issue likely caused by gatk version update

* tumour changed to tumor

* old dir deleted

* Comments added to explain use of placeholders '[]'

* updated index names, input channel renamed to input

* Apply suggestions from code review

* updated to perform new subworkflow testing

Co-authored-by: GCJMackenzie <gavin.mackenzie@nibsc.org>
Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>
This commit is contained in:
GCJMackenzie 2021-11-17 10:07:17 +00:00 committed by GitHub
parent 2d4549122b
commit 071b1d50a8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 264 additions and 4 deletions

View file

@ -0,0 +1,88 @@
//
// Run GATK mutect2 in tumor only mode, getpileupsummaries, calculatecontamination and filtermutectcalls
//
// Per-module option maps. Each one is forwarded to the matching module include below through addParams( options: ... ).
params.mutect2_options = [:]
params.getpileup_options = [:]
params.calccontam_options = [:]
// filtermutectcalls is the only module given a non-empty default here (a '_filtered' suffix).
// NOTE(review): presumably the suffix is appended to the output prefix so the filtered vcf does not share a name with the mutect2 vcf - confirm against the module.
params.filtercalls_options = [suffix: '_filtered']
// GATK4 modules used by this subworkflow, each aliased to a shorter name.
include { GATK4_MUTECT2 as MUTECT2 } from '../../../modules/gatk4/mutect2/main' addParams( options: params.mutect2_options )
include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES } from '../../../modules/gatk4/getpileupsummaries/main' addParams( options: params.getpileup_options )
include { GATK4_CALCULATECONTAMINATION as CALCULATECONTAMINATION } from '../../../modules/gatk4/calculatecontamination/main' addParams( options: params.calccontam_options )
include { GATK4_FILTERMUTECTCALLS as FILTERMUTECTCALLS } from '../../../modules/gatk4/filtermutectcalls/main' addParams( options: params.filtercalls_options )
//
// Tumor-only somatic variant calling:
//   MUTECT2 -> GETPILEUPSUMMARIES -> CALCULATECONTAMINATION -> FILTERMUTECTCALLS
//
// The per-sample outputs of each step are gathered with collect() before being
// passed on, and the same collected form is what the workflow emits.
//
workflow GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING {
    take:
    input                 // channel: [ val(meta), [ input ], [ input_index ], [] ]
    fasta                 // channel: /path/to/reference/fasta
    fai                   // channel: /path/to/reference/fasta/index
    dict                  // channel: /path/to/reference/fasta/dictionary
    germline_resource     // channel: /path/to/germline/resource
    germline_resource_tbi // channel: /path/to/germline/index
    panel_of_normals      // channel: /path/to/panel/of/normals
    panel_of_normals_tbi  // channel: /path/to/panel/of/normals/index
    interval_file         // channel: /path/to/interval/file

    main:
    ch_versions   = Channel.empty()
    mutect2_input = channel.from(input)

    //
    // Perform variant calling using mutect2 module in tumor single mode.
    //
    MUTECT2 ( mutect2_input , true , false , false , [] , fasta , fai , dict , germline_resource , germline_resource_tbi , panel_of_normals , panel_of_normals_tbi )
    ch_versions = ch_versions.mix(MUTECT2.out.versions)

    //
    // Generate pileup summary table using getpileupsummaries.
    //
    // The input and index arrive wrapped in lists; only the first element of each is passed on,
    // and the trailing placeholder element (which_norm) is dropped.
    pileup_input = channel.from(input).map {
        meta, input_file, input_index, which_norm ->
        [meta, input_file[0], input_index[0]]
    }
    GETPILEUPSUMMARIES ( pileup_input , germline_resource , germline_resource_tbi , interval_file )
    ch_versions = ch_versions.mix(GETPILEUPSUMMARIES.out.versions)

    //
    // Contamination and segmentation tables created using calculatecontamination on the pileup summary table.
    //
    // [] is a placeholder for the optional input where the matched normal sample would be passed in for
    // tumor-normal samples, which is not necessary for this workflow.
    // The placeholder is appended with map so that it is added to the emitted item by a channel operator,
    // rather than by calling add() on the channel object itself.
    ch_pileup = GETPILEUPSUMMARIES.out.table.collect().map { it + [[]] }
    CALCULATECONTAMINATION ( ch_pileup, true )
    ch_versions = ch_versions.mix(CALCULATECONTAMINATION.out.versions)

    //
    // Mutect2 calls filtered by filtermutectcalls using the contamination and segmentation tables.
    //
    ch_vcf = MUTECT2.out.vcf.collect()
    ch_tbi = MUTECT2.out.tbi.collect()
    // [] is added as a placeholder for the optional input file artifact priors, which is only used for
    // tumor-normal samples and therefore isn't needed in this workflow.
    ch_stats = MUTECT2.out.stats.collect().map { it + [[]] }
    ch_segment = CALCULATECONTAMINATION.out.segmentation.collect()
    // [] is added as a placeholder for entering a contamination estimate value, which is not needed as
    // this workflow uses the contamination table instead.
    ch_contamination = CALCULATECONTAMINATION.out.contamination.collect().map { it + [[]] }

    // Join everything on the first element (meta) into the single tuple passed to filtermutectcalls.
    ch_filtermutect_in = ch_vcf.combine(ch_tbi, by: 0).combine(ch_stats, by: 0).combine(ch_segment, by: 0).combine(ch_contamination, by: 0)
    FILTERMUTECTCALLS ( ch_filtermutect_in, fasta, fai, dict )
    ch_versions = ch_versions.mix(FILTERMUTECTCALLS.out.versions)

    emit:
    mutect2_vcf         = MUTECT2.out.vcf.collect()                          // channel: [ val(meta), [ vcf ] ]
    mutect2_index       = MUTECT2.out.tbi.collect()                          // channel: [ val(meta), [ tbi ] ]
    mutect2_stats       = MUTECT2.out.stats.collect()                        // channel: [ val(meta), [ stats ] ]

    pileup_table        = GETPILEUPSUMMARIES.out.table.collect()             // channel: [ val(meta), [ table ] ]

    contamination_table = CALCULATECONTAMINATION.out.contamination.collect() // channel: [ val(meta), [ contamination ] ]
    segmentation_table  = CALCULATECONTAMINATION.out.segmentation.collect()  // channel: [ val(meta), [ segmentation ] ]

    filtered_vcf        = FILTERMUTECTCALLS.out.vcf.collect()                // channel: [ val(meta), [ vcf ] ]
    filtered_index      = FILTERMUTECTCALLS.out.tbi.collect()                // channel: [ val(meta), [ tbi ] ]
    filtered_stats      = FILTERMUTECTCALLS.out.stats.collect()              // channel: [ val(meta), [ stats ] ]

    versions            = ch_versions                                        // channel: [ versions.yml ]
}

View file

@ -0,0 +1,108 @@
name: gatk_tumor_only_somatic_variant_calling
description: |
Perform variant calling on a single tumor sample using mutect2 tumor only mode.
Run the input bam file through getpileupsummaries and then calculatecontamination to get the contamination and segmentation tables.
Filter the mutect2 output vcf using filtermutectcalls and the contamination & segmentation tables for additional filtering.
keywords:
- gatk4
- mutect2
- getpileupsummaries
- calculatecontamination
- filtermutectcalls
- variant_calling
- tumor_only
- filtered_vcf
modules:
- gatk4/mutect2
- gatk4/getpileupsummaries
- gatk4/calculatecontamination
- gatk4/filtermutectcalls
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test' ]
- input:
type: list
description: list containing one BAM file, also able to take CRAM as an input
pattern: "[ *.{bam,cram} ]"
- input_index:
type: list
description: list containing one BAM file index, also able to take CRAM index as an input
pattern: "[ *.{bam.bai,cram.crai} ]"
- fasta:
type: file
description: The reference fasta file
pattern: "*.fasta"
- fai:
type: file
description: Index of reference fasta file
pattern: "*.fasta.fai"
- dict:
type: file
description: GATK sequence dictionary
pattern: "*.dict"
- germline_resource:
type: file
description: Population vcf of germline sequencing, containing allele fractions.
pattern: "*.vcf.gz"
- germline_resource_tbi:
type: file
description: Index file for the germline resource.
pattern: "*.vcf.gz.tbi"
- panel_of_normals:
type: file
description: vcf file to be used as a panel of normals.
pattern: "*.vcf.gz"
- panel_of_normals_tbi:
type: file
description: Index for the panel of normals.
pattern: "*.vcf.gz.tbi"
- interval_file:
type: file
description: File containing intervals.
pattern: "*.interval_list"
output:
- versions:
type: file
description: File containing software versions
pattern: 'versions.yml'
- mutect2_vcf:
type: file
description: Compressed vcf file to be used for variant_calling.
pattern: "[ *.vcf.gz ]"
- mutect2_index:
type: file
description: Indexes of the mutect2_vcf file
pattern: "[ *vcf.gz.tbi ]"
- mutect2_stats:
type: file
description: Stats files for the mutect2 vcf
pattern: "[ *vcf.gz.stats ]"
- pileup_table:
type: file
description: File containing the pileup summary table.
pattern: "*.pileups.table"
- contamination_table:
type: file
description: File containing the contamination table.
pattern: "*.contamination.table"
- segmentation_table:
type: file
description: Output table containing segmentation of tumor minor allele fractions.
pattern: "*.segmentation.table"
- filtered_vcf:
type: file
description: file containing filtered mutect2 calls.
pattern: "*.vcf.gz"
- filtered_index:
type: file
description: tbi file that pairs with filtered vcf.
pattern: "*.vcf.gz.tbi"
- filtered_stats:
type: file
description: file containing statistics of the filtermutectcalls run.
pattern: "*.filteringStats.tsv"
authors:
- '@GCJMackenzie'

View file

@ -0,0 +1,4 @@
// Default option maps for the modules used by gatk_tumor_only_somatic_variant_calling.
// These mirror the defaults declared at the top of the subworkflow's main.nf.
params.mutect2_options = [:]
params.getpileup_options = [:]
params.calccontam_options = [:]
// Kept in step with main.nf, which defaults to a '_filtered' suffix; the subworkflow test
// expects filtermutectcalls outputs named test_filtered.*.
params.filtercalls_options = [suffix: '_filtered']

View file

@ -474,7 +474,7 @@ gatk4/bedtointervallist:
- modules/gatk4/bedtointervallist/**
- tests/modules/gatk4/bedtointervallist/**
gatk4/calculatecontamination:
gatk4/calculatecontamination: &gatk4/calculatecontamination
- modules/gatk4/calculatecontamination/**
- tests/modules/gatk4/calculatecontamination/**
@ -494,7 +494,7 @@ gatk4/fastqtosam:
- modules/gatk4/fastqtosam/**
- tests/modules/gatk4/fastqtosam/**
gatk4/filtermutectcalls:
gatk4/filtermutectcalls: &gatk4/filtermutectcalls
- modules/gatk4/filtermutectcalls/**
- tests/modules/gatk4/filtermutectcalls/**
@ -506,7 +506,7 @@ gatk4/genotypegvcfs:
- modules/gatk4/genotypegvcfs/**
- tests/modules/gatk4/genotypegvcfs/**
gatk4/getpileupsummaries:
gatk4/getpileupsummaries: &gatk4/getpileupsummaries
- modules/gatk4/getpileupsummaries/**
- tests/modules/gatk4/getpileupsummaries/**
@ -538,7 +538,7 @@ gatk4/mergevcfs:
- modules/gatk4/mergevcfs/**
- tests/modules/gatk4/mergevcfs/**
gatk4/mutect2:
gatk4/mutect2: &gatk4/mutect2
- modules/gatk4/mutect2/**
- tests/modules/gatk4/mutect2/**
@ -1426,3 +1426,11 @@ subworkflows/gatk_create_som_pon:
- tests/subworkflows/nf-core/gatk_create_som_pon/**
- *gatk4/genomicsdbimport
- *gatk4/createsomaticpanelofnormals
subworkflows/gatk_tumor_only_somatic_variant_calling:
- subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling/**
- tests/subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling/**
- *gatk4/mutect2
- *gatk4/getpileupsummaries
- *gatk4/calculatecontamination
- *gatk4/filtermutectcalls

View file

@ -0,0 +1,24 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING } from '../../../../subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling/main' addParams( [:] )
// Runs the tumor-only somatic variant calling subworkflow on a single test sample.
workflow test_gatk_tumor_only_somatic_variant_calling {
    // Named handles into the test-data map, so the lookups below stay short.
    illumina_data = params.test_data['homo_sapiens']['illumina']
    genome_data   = params.test_data['homo_sapiens']['genome']

    // Sample files: one recalibrated, sorted BAM and its index.
    tumor_bam = file(illumina_data['test2_paired_end_recalibrated_sorted_bam'], checkIfExists: true)
    tumor_bai = file(illumina_data['test2_paired_end_recalibrated_sorted_bam_bai'], checkIfExists: true)

    // One entry: meta map, list holding the BAM, list holding the index, and a trailing empty list.
    input = [
        [ [ id:'test' ], [ tumor_bam ], [ tumor_bai ], [] ]
    ]

    // Reference genome and its companion files.
    fasta = file(genome_data['genome_fasta'], checkIfExists: true)
    fai   = file(genome_data['genome_fasta_fai'], checkIfExists: true)
    dict  = file(genome_data['genome_dict'], checkIfExists: true)

    // Germline resource and panel of normals, each with its index.
    germline_resource     = file(genome_data['gnomad_r2_1_1_vcf_gz'], checkIfExists: true)
    germline_resource_tbi = file(genome_data['gnomad_r2_1_1_vcf_gz_tbi'], checkIfExists: true)
    panel_of_normals      = file(genome_data['mills_and_1000g_indels_vcf_gz'], checkIfExists: true)
    panel_of_normals_tbi  = file(genome_data['mills_and_1000g_indels_vcf_gz_tbi'], checkIfExists: true)

    // Interval list handed to the subworkflow as its last argument.
    interval_file = file(genome_data['genome_interval_list'], checkIfExists: true)

    GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING (
        input,
        fasta,
        fai,
        dict,
        germline_resource,
        germline_resource_tbi,
        panel_of_normals,
        panel_of_normals_tbi,
        interval_file
    )
}

View file

@ -0,0 +1,28 @@
- name: gatk_tumor_only_somatic_variant_calling
command: nextflow run ./tests/subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling -entry test_gatk_tumor_only_somatic_variant_calling -c tests/config/nextflow.config
tags:
- subworkflows/gatk_tumor_only_somatic_variant_calling
# Modules
# - gatk4/mutect2
# - gatk4/getpileupsummaries
# - gatk4/calculatecontamination
# - gatk4/filtermutectcalls
files:
# gatk4 mutect2
- path: ./output/mutect2/test.vcf.gz
- path: ./output/mutect2/test.vcf.gz.stats
md5sum: 106c5828b02b906c97922618b6072169
- path: ./output/mutect2/test.vcf.gz.tbi
# gatk4 getpileupsummaries
- path: ./output/getpileupsummaries/test.pileups.table
md5sum: 8b1b4c8ab831eca50ee9e940463a741f
# gatk4 calculatecontamination
- path: ./output/calculatecontamination/test.contamination.table
md5sum: 5fdcf1728cf98985ce31c038eb24e05c
- path: ./output/calculatecontamination/test.segmentation.table
md5sum: 91f28bfe4727a3256810927fc5eba92f
# gatk4 filtermutectcalls
- path: ./output/filtermutectcalls/test_filtered.vcf.gz
- path: ./output/filtermutectcalls/test_filtered.vcf.gz.filteringStats.tsv
md5sum: 8731945490960546719ce4a71a151e4f
- path: ./output/filtermutectcalls/test_filtered.vcf.gz.tbi