Add MultiVCFAnalyzer (#1845)

* Add MultiVCFAnalyzer

* Fix versions

* Fix tests due to md5sum var

* Apply suggestions from code review

* Linting

* Apply suggestions from code review

Co-authored-by: Robert A. Petit III <robbie.petit@gmail.com>

Co-authored-by: Robert A. Petit III <robbie.petit@gmail.com>
This commit is contained in:
James A. Fellows Yates 2022-07-05 09:47:10 +02:00 committed by GitHub
parent 6702d2e145
commit aed45dd766
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 275 additions and 0 deletions

View file

@ -0,0 +1,72 @@
process MULTIVCFANALYZER {
tag '$fasta'
label 'process_medium'
conda (params.enable_conda ? "bioconda::multivcfanalyzer=0.85.2" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/multivcfanalyzer:0.85.2--hdfd78af_1':
'quay.io/biocontainers/multivcfanalyzer:0.85.2--hdfd78af_1' }"
input:
path vcfs
path fasta
path snpeff_results
path gff
val allele_freqs
val genotype_quality
val coverage
val homozygous_freq
val heterozygous_freq
path gff_exclude
output:
path('fullAlignment.fasta.gz') , emit: full_alignment
path('info.txt') , emit: info_txt
path('snpAlignment.fasta.gz') , emit: snp_alignment
path('snpAlignmentIncludingRefGenome.fasta.gz') , emit: snp_genome_alignment
path('snpStatistics.tsv') , emit: snpstatistics
path('snpTable.tsv') , emit: snptable
path('snpTableForSnpEff.tsv') , emit: snptable_snpeff
path('snpTableWithUncertaintyCalls.tsv') , emit: snptable_uncertainty
path('structureGenotypes.tsv') , emit: structure_genotypes
path('structureGenotypes_noMissingData-Columns.tsv') , emit: structure_genotypes_nomissing
path('MultiVCFAnalyzer.json') , emit: json
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
// def args = task.ext.args ?: '' // MultiVCFAnalyzer has strict and input ordering and all are mandatory. Deactivating $args to prevent breakage of input
def args2 = task.ext.args2 ?: ''
def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA"
def cmd_gff = gff ? "${gff}" : "NA"
def cmd_allele_freqs = allele_freqs ? "T" : "F"
def cmd_gff_exclude = gff_exclude ? "${gff}" : "NA"
"""
multivcfanalyzer \\
${cmd_snpeff_results} \\
${fasta} \\
${cmd_gff} \\
. \
${cmd_allele_freqs} \\
${genotype_quality} \\
${coverage} \\
${homozygous_freq} \\
${heterozygous_freq} \\
${cmd_gff_exclude} \\
${vcfs.join(" ")}
gzip \\
$args2 \\
fullAlignment.fasta snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta
cat <<-END_VERSIONS > versions.yml
"${task.process}":
multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' )
END_VERSIONS
"""
}

View file

@ -0,0 +1,122 @@
name: "multivcfanalyzer"
description: SNP table generator from GATK UnifiedGenotyper with functionality geared for aDNA
keywords:
- vcf
- ancient DNA
- aDNA
- SNP
- GATK UnifiedGenotyper
- SNP table
tools:
- "multivcfanalyzer":
description: "MultiVCFAnalyzer is a VCF file post-processing tool tailored for aDNA. License on Github repository."
homepage: "https://github.com/alexherbig/MultiVCFAnalyzer"
documentation: "https://github.com/alexherbig/MultiVCFAnalyzer"
tool_dev_url: "https://github.com/alexherbig/MultiVCFAnalyzer"
doi: "10.1038/nature13591"
licence: "['GPL >=3']"
input:
- vcfs:
type: file
description: One or a list of uncompressed VCF file
pattern: "*.vcf"
- fasta:
type: file
description: Reference genome VCF was generated against
pattern: "*.{fasta,fna,fa}"
- snpeff_results:
type: file
description: Results from snpEff in txt format (Optional)
pattern: "*.txt"
- gff:
type: file
description: GFF file corresponding to reference genome fasta (Optional)
pattern: "*.gff"
- allele_freqs:
type: boolean
description: |
Whether to include the percentage of reads a given allele is
present in in the SNP table.
- genotype_quality:
type: integer
description: |
Minimum GATK genotyping threshold threshold of which a SNP call
falling under is 'discarded'
- coverage:
type: integer
description: |
Minimum number of a reads that a position must be covered by to be
reported
- homozygous_freq:
type: number
description: Fraction of reads a base must have to be called 'homozygous'
- heterozygous_freq:
type: mumber
description: |
Fraction of which whereby if a call falls above this value, and lower
than the homozygous threshold, a base will be called 'heterozygous'.
- gff_exclude:
type: file
description: |
file listing positions that will be 'filtered' (i.e. ignored)
(Optional)
pattern: "*.vcf"
output:
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- bam:
type: file
description: Sorted BAM/CRAM/SAM file
pattern: "*.{bam,cram,sam}"
- full_alignment:
type: file
description: Fasta a fasta file of all positions contained in the VCF files i.e. including ref calls
pattern: ".fasta.gz"
- info_txt:
type: file
description: Information about the run
pattern: ".txt"
- snp_alignment:
type: file
description: A fasta file of just SNP positions with samples only
pattern: ".fasta.gz"
- snp_genome_alignment:
type: file
description: A fasta file of just SNP positions with reference genome
pattern: ".fasta.gz"
- snpstatistics:
type: file
description: Some basic statistics about the SNP calls of each sample
pattern: ".tsv"
- snptable:
type: file
description: Basic SNP table of combined positions taken from each VCF file
pattern: ".tsv"
- snptable_snpeff:
type: file
description: Input file for SnpEff
pattern: ".tsv"
- snptable_uncertainty:
type: file
description: Same as above, but with lower case characters indicating uncertain calls
pattern: ".tsv"
- structure_genotypes:
type: file
description: Input file for STRUCTURE
pattern: ".tsv"
- structure_genotypes_nomissing:
type: file
description: Alternate input file for STRUCTURE
pattern: ".tsv"
- json:
type: file
description: Summary statistics in MultiQC JSON format
pattern: ".json"
authors:
- "@jfy133"

View file

@ -1499,6 +1499,10 @@ multiqc:
- modules/multiqc/**
- tests/modules/multiqc/**
multivcfanalyzer:
- modules/multivcfanalyzer/**
- tests/modules/multivcfanalyzer/**
mummer:
- modules/mummer/**
- tests/modules/mummer/**

View file

@ -0,0 +1,41 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { GATK_UNIFIEDGENOTYPER } from '../../../modules/gatk/unifiedgenotyper/main.nf'
include { GUNZIP } from '../../../modules/gunzip/main.nf'
include { MULTIVCFANALYZER } from '../../../modules/multivcfanalyzer/main.nf'
workflow test_multivcfanalyzer {
input = Channel.of([ [ id:'test' ], // meta map
file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true),
file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true),
],
[ [ id:'test2' ], // meta map
file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true),
file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true),
],
)
fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
fai = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true)
dict = file(params.test_data['sarscov2']['genome']['genome_dict'], checkIfExists: true)
GATK_UNIFIEDGENOTYPER ( input, fasta, fai, dict, [], [], [], [])
mva_vcf = GUNZIP ( GATK_UNIFIEDGENOTYPER.out.vcf ).gunzip
.map{it[1]}
.collect()
.dump()
snpeff_results = []
gff = []
allele_freqs = true
genotype_quality = 30
coverage = 5
homozygous_freq = 0.8
heterozygous_freq = 0.2
gff_exclude = []
MULTIVCFANALYZER ( mva_vcf, fasta, snpeff_results, gff, allele_freqs, genotype_quality, coverage, homozygous_freq, heterozygous_freq, gff_exclude )
}

View file

@ -0,0 +1,5 @@
process {
publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
}

View file

@ -0,0 +1,31 @@
- name: multivcfanalyzer test_multivcfanalyzer
command: nextflow run ./tests/modules/multivcfanalyzer -entry test_multivcfanalyzer -c ./tests/config/nextflow.config -c ./tests/modules/multivcfanalyzer/nextflow.config
tags:
- multivcfanalyzer
files:
- path: output/multivcfanalyzer/MultiVCFAnalyzer.json
md5sum: c841c9f04c6114911f308ea09a08980e
- path: output/multivcfanalyzer/fullAlignment.fasta.gz
contains:
- ">Reference_MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome"
- path: output/multivcfanalyzer/info.txt
contains:
- "Run finished"
- path: output/multivcfanalyzer/snpAlignment.fasta.gz
contains:
- "test.vcf"
- path: output/multivcfanalyzer/snpAlignmentIncludingRefGenome.fasta.gz
contains:
- ">Reference_MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome"
- path: output/multivcfanalyzer/snpStatistics.tsv
contains: ["statistics", "test.vcf", "test2.vcf"]
- path: output/multivcfanalyzer/snpTable.tsv
contains: ["Position", "test.vcf", "test2.vcf"]
- path: output/multivcfanalyzer/snpTableForSnpEff.tsv
md5sum: 8d7ab4ec98a89d290e301d6feae461aa
- path: output/multivcfanalyzer/snpTableWithUncertaintyCalls.tsv
contains: ["Position", "test.vcf", "test2.vcf"]
- path: output/multivcfanalyzer/structureGenotypes.tsv
contains: ["test.vcf", "test2.vcf"]
- path: output/multivcfanalyzer/structureGenotypes_noMissingData-Columns.tsv
contains: ["test.vcf", "test2.vcf"]