Add panel of normals subworkflow (#1044)

* committing changes to switch branch

* commit to setup remote branch

* first draft of the sompon workflow

* keep branch in line with gendb bugfixing

* Update test.yml

* tidy up main.nf

* fixed md5sum

Co-authored-by: GCJMackenzie <gavin.mackenzie@nibsc.org>
This commit is contained in:
GCJMackenzie 2021-11-09 10:16:43 +00:00 committed by GitHub
parent e0ada7d219
commit 9573cb1bec
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 205 additions and 0 deletions

View file

@ -0,0 +1,58 @@
//
// Run GATK mutect2, genomicsdbimport and createsomaticpanelofnormals
//
// Option maps for the three modules used by this subworkflow. Each map is
// handed to the matching module below through addParams( options: ... ).
// Mutect2 gets '--max-mnp-distance 0' by default.
// NOTE(review): presumably the setting GATK recommends for panel-of-normals calling — confirm.
params.mutect2_options = [args: '--max-mnp-distance 0']
params.gendbimport_options = [:]
params.createsompon_options = [:]
// Import the GATK4 modules, each with its own options map.
include { GATK4_MUTECT2 } from '../../../modules/gatk4/mutect2/main' addParams( options: params.mutect2_options )
include { GATK4_GENOMICSDBIMPORT } from '../../../modules/gatk4/genomicsdbimport/main' addParams( options: params.gendbimport_options )
include { GATK4_CREATESOMATICPANELOFNORMALS } from '../../../modules/gatk4/createsomaticpanelofnormals/main' addParams( options: params.createsompon_options )
//
// Create a somatic panel of normals in three steps:
//   1. call variants per normal sample with GATK4_MUTECT2,
//   2. import all per-sample VCFs into one workspace with GATK4_GENOMICSDBIMPORT,
//   3. build the panel from that workspace with GATK4_CREATESOMATICPANELOFNORMALS.
//
workflow GATK_CREATE_SOM_PON {
    take:
    ch_mutect2_in // list of tuples: [ val(meta), [ input ], [ input_index ], [] ] (turned into a channel in main:)
    fasta         // channel: /path/to/reference/fasta
    fastaidx      // channel: /path/to/reference/fasta/index
    dict          // channel: /path/to/reference/fasta/dictionary
    pon_name      // val: name for panel of normals (used as the meta id of the combined outputs)
    interval_file // channel: /path/to/interval/file

    main:
    ch_versions = Channel.empty()
    input       = Channel.from(ch_mutect2_in)

    //
    // Perform variant calling for each sample using mutect2 module in panel of normals mode.
    //
    GATK4_MUTECT2 ( input, false, true, false, [], fasta, fastaidx, dict, [], [], [], [] )
    ch_versions = ch_versions.mix(GATK4_MUTECT2.out.versions.first())

    //
    // Convert all sample vcfs into a genomicsdb workspace using genomicsdbimport.
    //
    // Element 1 of every mutect2 output tuple (the file, without its meta map) is
    // gathered into one list, so the import receives all samples at once.
    ch_vcf      = GATK4_MUTECT2.out.vcf.collect{ it[1] }.toList()
    ch_index    = GATK4_MUTECT2.out.tbi.collect{ it[1] }.toList()
    // The combined input carries pon_name as its meta id, followed by the vcf list,
    // the index list, the interval file, an empty string and the dictionary.
    gendb_input = Channel.of([[ id:pon_name ]])
        .combine(ch_vcf)
        .combine(ch_index)
        .combine([interval_file])
        .combine([''])
        .combine([dict])
    GATK4_GENOMICSDBIMPORT ( gendb_input, false, false, false )
    ch_versions = ch_versions.mix(GATK4_GENOMICSDBIMPORT.out.versions.first())

    //
    // Panel of normals made from genomicsdb workspace using createsomaticpanelofnormals.
    //
    GATK4_CREATESOMATICPANELOFNORMALS ( GATK4_GENOMICSDBIMPORT.out.genomicsdb, fasta, fastaidx, dict )
    ch_versions = ch_versions.mix(GATK4_CREATESOMATICPANELOFNORMALS.out.versions.first())

    emit:
    mutect2_vcf   = GATK4_MUTECT2.out.vcf.collect()               // channel: [ val(meta), [ vcf ] ]
    mutect2_index = GATK4_MUTECT2.out.tbi.collect()               // channel: [ val(meta), [ tbi ] ]
    mutect2_stats = GATK4_MUTECT2.out.stats.collect()             // channel: [ val(meta), [ stats ] ]
    genomicsdb    = GATK4_GENOMICSDBIMPORT.out.genomicsdb         // channel: [ val(meta), [ genomicsdb ] ]
    pon_vcf       = GATK4_CREATESOMATICPANELOFNORMALS.out.vcf     // channel: [ val(meta), [ vcf.gz ] ]
    pon_index     = GATK4_CREATESOMATICPANELOFNORMALS.out.tbi     // channel: [ val(meta), [ tbi ] ]
    versions      = ch_versions                                   // channel: [ versions.yml ]
}

View file

@ -0,0 +1,75 @@
# Identity and search metadata for the subworkflow
name: gatk_create_som_pon
description: >-
  Perform variant calling on a set of normal samples using mutect2 panel of
  normals mode. Group them into a genomicsdbworkspace using genomicsdbimport,
  then use this to create a panel of normals using createsomaticpanelofnormals.
keywords:
  - gatk4
  - mutect2
  - genomicsdbimport
  - createsomaticpanelofnormals
  - variant_calling
  - genomicsdb_workspace
  - panel_of_normals
# Modules run by the subworkflow, in execution order
modules:
  - gatk4/mutect2
  - gatk4/genomicsdbimport
  - gatk4/createsomaticpanelofnormals
# Inputs taken by the subworkflow. The first three entries describe the
# elements of each per-sample tuple; the rest are separate inputs.
input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test' ]
  - input:
      type: list
      description: list of BAM files, also able to take CRAM as an input
      pattern: "[ *.{bam/cram} ]"
  - input_index:
      type: list
      description: list of BAM file indexes, also able to take CRAM indexes as an input
      pattern: "[ *.{bam.bai/cram.crai} ]"
  - fasta:
      type: file
      description: The reference fasta file
      pattern: "*.fasta"
  - fastaidx:
      type: file
      description: Index of reference fasta file
      pattern: "*.fasta.fai"
  - dict:
      type: file
      description: GATK sequence dictionary
      pattern: "*.dict"
  - pon_name:
      type: string
      description: >-
        name to be used for the genomicsdb workspace and panel of normals as
        meta_id has the individual sample names and a name for the combined
        files is required here.
      pattern: "example_name"
  # Passed on to genomicsdbimport as part of its combined input
  - interval_file:
      type: file
      description: File containing intervals.
      pattern: "*.interval_list"
# Channels emitted by the subworkflow
output:
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  # Per-sample mutect2 results
  - mutect2_vcf:
      type: list
      description: List of compressed vcf files to be used to make the gendb workspace
      pattern: "[ *.vcf.gz ]"
  - mutect2_index:
      type: list
      description: List of indexes of mutect2_vcf files
      pattern: "[ *vcf.gz.tbi ]"
  - mutect2_stats:
      type: list
      description: List of stats files that pair with mutect2_vcf files
      pattern: "[ *vcf.gz.stats ]"
  # Combined workspace
  - genomicsdb:
      type: directory
      description: Directory containing the files that compose the genomicsdb workspace.
      pattern: "path/name_of_workspace"
  # Final panel of normals
  - pon_vcf:
      type: file
      description: Panel of normal as compressed vcf file
      pattern: "*.vcf.gz"
  - pon_index:
      type: file
      description: Index of pon_vcf file
      pattern: "*vcf.gz.tbi"
# GitHub handles of the subworkflow authors
authors:
  - "@GCJMackenzie"

View file

@ -0,0 +1,3 @@
// Empty option maps for the three module option parameters in the test config.
params {
    mutect2_options      = [:]
    gendbimport_options  = [:]
    createsompon_options = [:]
}

View file

@ -14,3 +14,8 @@ subworkflows/sra_fastq:
- subworkflows/nf-core/sra_fastq/** - subworkflows/nf-core/sra_fastq/**
- tests/subworkflows/nf-core/sra_fastq/** - tests/subworkflows/nf-core/sra_fastq/**
# Source and test paths belonging to the gatk_create_som_pon subworkflow
subworkflows/gatk_create_som_pon:
  - subworkflows/nf-core/gatk_create_som_pon/**
  - tests/subworkflows/nf-core/gatk_create_som_pon/**

View file

@ -0,0 +1,26 @@
#!/usr/bin/env nextflow
// Switch on Nextflow DSL2 syntax (this script uses include and workflow blocks).
nextflow.enable.dsl = 2
// Import the subworkflow under test; an empty map is passed to addParams.
include { GATK_CREATE_SOM_PON } from '../../../../subworkflows/nf-core/gatk_create_som_pon/main' addParams( [:] )
//
// Run GATK_CREATE_SOM_PON on two normal samples from the homo_sapiens test data.
//
workflow test_gatk_create_som_pon {
    // Shortcuts into the shared test-data map
    def illumina = params.test_data['homo_sapiens']['illumina']
    def genome   = params.test_data['homo_sapiens']['genome']

    // Sample 1: BAM plus index
    def bam1 = file(illumina['test_paired_end_recalibrated_sorted_bam'], checkIfExists: true)
    def bai1 = file(illumina['test_paired_end_recalibrated_sorted_bam_bai'], checkIfExists: true)

    // Sample 2: BAM plus index
    def bam2 = file(illumina['test2_paired_end_recalibrated_sorted_bam'], checkIfExists: true)
    def bai2 = file(illumina['test2_paired_end_recalibrated_sorted_bam_bai'], checkIfExists: true)

    // One tuple per sample: meta map, list of inputs, list of indexes, empty list
    ch_mutect2_in = [
        [ [ id:'test1' ], [ bam1 ], [ bai1 ], [] ],
        [ [ id:'test2' ], [ bam2 ], [ bai2 ], [] ]
    ]

    // Reference files
    fasta         = file(genome['genome_fasta'], checkIfExists: true)
    fastaidx      = file(genome['genome_fasta_fai'], checkIfExists: true)
    dict          = file(genome['genome_dict'], checkIfExists: true)
    interval_file = file(genome['genome_interval_list'], checkIfExists: true)

    // Name given to the combined workspace / panel of normals
    pon_name = "test_panel"

    GATK_CREATE_SOM_PON ( ch_mutect2_in, fasta, fastaidx, dict, pon_name, interval_file )
}

View file

@ -0,0 +1,38 @@
# Test for the gatk_create_som_pon subworkflow: runs the test entry workflow
# and checks the files published by each of the three modules.
- name: gatk_create_som_pon
  command: nextflow run ./tests/subworkflows/nf-core/gatk_create_som_pon -entry test_gatk_create_som_pon -c tests/config/nextflow.config
  tags:
    - subworkflows/gatk_create_som_pon
    # Modules
    - gatk4
    - gatk4/mutect2
    - gatk4/genomicsdbimport
    - gatk4/createsomaticpanelofnormals
  files:
    # gatk4 mutect2
    - path: output/gatk4/test1.vcf.gz
    - path: output/gatk4/test1.vcf.gz.stats
      md5sum: 4f77301a125913170b8e9e7828b4ca3f
    - path: output/gatk4/test1.vcf.gz.tbi
    - path: output/gatk4/test2.vcf.gz
    - path: output/gatk4/test2.vcf.gz.stats
      md5sum: 106c5828b02b906c97922618b6072169
    - path: output/gatk4/test2.vcf.gz.tbi
    # gatk4 genomicsdbimport
    - path: output/gatk4/test_panel/__tiledb_workspace.tdb
      md5sum: d41d8cd98f00b204e9800998ecf8427e
    - path: output/gatk4/test_panel/callset.json
      md5sum: 2ab411773b7267de61f8c04939de2a99
    - path: output/gatk4/test_panel/chr22$1$40001/.__consolidation_lock
      md5sum: d41d8cd98f00b204e9800998ecf8427e
    - path: output/gatk4/test_panel/chr22$1$40001/__array_schema.tdb
    - path: output/gatk4/test_panel/chr22$1$40001/genomicsdb_meta_dir/genomicsdb_column_bounds.json
      md5sum: 2502f79658bc000578ebcfddfc1194c0
    - path: output/gatk4/test_panel/vcfheader.vcf
      contains:
        - "FORMAT=<ID=AD,Number=R,Type=Integer,Description="
    - path: output/gatk4/test_panel/vidmap.json
      md5sum: ee4f6815c433caa8ab101ec45ff328a6
    # gatk4 createsomaticpanelofnormals
    - path: output/gatk4/test_panel.vcf.gz
    - path: output/gatk4/test_panel.vcf.gz.tbi
      md5sum: d7e2524ba4bf7538dbee3e225a74b0da