mirror of
https://github.com/MillironX/nf-core_modules.git
synced 2025-01-09 23:31:13 -05:00
Add MultiVCFAnalyzer (#1845)
* Add MultiVCFAnalyzer * Fix versions * Fix tests due to md5sum var * Apply suggestions from code review * Linting * Apply suggestions from code review Co-authored-by: Robert A. Petit III <robbie.petit@gmail.com> Co-authored-by: Robert A. Petit III <robbie.petit@gmail.com>
This commit is contained in:
parent
6702d2e145
commit
aed45dd766
6 changed files with 275 additions and 0 deletions
72
modules/multivcfanalyzer/main.nf
Normal file
72
modules/multivcfanalyzer/main.nf
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
// Runs MultiVCFAnalyzer over a set of VCF files and a reference FASTA, then
// gzip-compresses the three FASTA alignments it writes.
//
// Inputs:
//   vcfs              - VCF file(s) to analyse (joined with spaces on the command line)
//   fasta             - reference genome FASTA
//   snpeff_results    - optional snpEff results; "NA" is passed when empty
//   gff               - optional GFF for the reference; "NA" is passed when empty
//   allele_freqs      - truthy -> "T", falsy -> "F"
//   genotype_quality, coverage, homozygous_freq, heterozygous_freq
//                     - passed through to the tool unchanged
//   gff_exclude       - optional exclusion file; "NA" is passed when empty
//
// Outputs: the fixed-name result files listed under `output:` plus versions.yml.
process MULTIVCFANALYZER {
    // Double quotes are required here: a single-quoted Groovy string is never
    // interpolated, so '$fasta' would label every task with the literal text.
    tag "$fasta"
    label 'process_medium'

    conda (params.enable_conda ? "bioconda::multivcfanalyzer=0.85.2" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/multivcfanalyzer:0.85.2--hdfd78af_1':
        'quay.io/biocontainers/multivcfanalyzer:0.85.2--hdfd78af_1' }"

    input:
    path vcfs
    path fasta
    path snpeff_results
    path gff
    val allele_freqs
    val genotype_quality
    val coverage
    val homozygous_freq
    val heterozygous_freq
    path gff_exclude

    output:
    path('fullAlignment.fasta.gz')                       , emit: full_alignment
    path('info.txt')                                     , emit: info_txt
    path('snpAlignment.fasta.gz')                        , emit: snp_alignment
    path('snpAlignmentIncludingRefGenome.fasta.gz')      , emit: snp_genome_alignment
    path('snpStatistics.tsv')                            , emit: snpstatistics
    path('snpTable.tsv')                                 , emit: snptable
    path('snpTableForSnpEff.tsv')                        , emit: snptable_snpeff
    path('snpTableWithUncertaintyCalls.tsv')             , emit: snptable_uncertainty
    path('structureGenotypes.tsv')                       , emit: structure_genotypes
    path('structureGenotypes_noMissingData-Columns.tsv') , emit: structure_genotypes_nomissing
    path('MultiVCFAnalyzer.json')                        , emit: json
    path "versions.yml"                                  , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // def args = task.ext.args ?: '' // MultiVCFAnalyzer has strict input ordering and all are mandatory. Deactivating $args to prevent breakage of input
    // args2 is forwarded to gzip only.
    def args2 = task.ext.args2 ?: ''

    // Optional file inputs arrive as empty values when not supplied; the tool
    // still needs a positional argument, so "NA" is substituted.
    def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA"
    def cmd_gff            = gff            ? "${gff}"            : "NA"
    def cmd_allele_freqs   = allele_freqs   ? "T"                 : "F"
    // Must expand to the exclusion file itself, not to the annotation GFF.
    def cmd_gff_exclude    = gff_exclude    ? "${gff_exclude}"    : "NA"

    // The bare `.` positional argument is presumably the output directory
    // (results are collected from the task work dir) - TODO confirm against
    // the MultiVCFAnalyzer usage text.
    """
    multivcfanalyzer \\
        ${cmd_snpeff_results} \\
        ${fasta} \\
        ${cmd_gff} \\
        . \\
        ${cmd_allele_freqs} \\
        ${genotype_quality} \\
        ${coverage} \\
        ${homozygous_freq} \\
        ${heterozygous_freq} \\
        ${cmd_gff_exclude} \\
        ${vcfs.join(" ")}

    gzip \\
        $args2 \\
        fullAlignment.fasta snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' )
    END_VERSIONS
    """
}
|
122
modules/multivcfanalyzer/meta.yml
Normal file
122
modules/multivcfanalyzer/meta.yml
Normal file
|
@ -0,0 +1,122 @@
|
||||||
|
# nf-core module metadata for modules/multivcfanalyzer/main.nf.
# Input and output entries mirror the channels declared in the process.
name: "multivcfanalyzer"
description: SNP table generator from GATK UnifiedGenotyper with functionality geared for aDNA
keywords:
  - vcf
  - ancient DNA
  - aDNA
  - SNP
  - GATK UnifiedGenotyper
  - SNP table
tools:
  - "multivcfanalyzer":
      description: "MultiVCFAnalyzer is a VCF file post-processing tool tailored for aDNA. License on Github repository."
      homepage: "https://github.com/alexherbig/MultiVCFAnalyzer"
      documentation: "https://github.com/alexherbig/MultiVCFAnalyzer"
      tool_dev_url: "https://github.com/alexherbig/MultiVCFAnalyzer"
      doi: "10.1038/nature13591"
      licence: "['GPL >=3']"

input:
  - vcfs:
      type: file
      description: One or a list of uncompressed VCF files
      pattern: "*.vcf"
  - fasta:
      type: file
      description: Reference genome the VCF files were generated against
      pattern: "*.{fasta,fna,fa}"
  - snpeff_results:
      type: file
      description: Results from snpEff in txt format (Optional)
      pattern: "*.txt"
  - gff:
      type: file
      description: GFF file corresponding to reference genome fasta (Optional)
      pattern: "*.gff"
  - allele_freqs:
      type: boolean
      description: |
        Whether to include, in the SNP table, the percentage of reads a
        given allele is present in.
  - genotype_quality:
      type: integer
      description: |
        Minimum GATK genotyping quality threshold; a SNP call falling
        under it is 'discarded'
  - coverage:
      type: integer
      description: |
        Minimum number of reads that a position must be covered by to be
        reported
  - homozygous_freq:
      type: number
      description: Fraction of reads a base must have to be called 'homozygous'
  - heterozygous_freq:
      type: number
      description: |
        Fraction of reads whereby, if a call falls above this value and below
        the homozygous threshold, a base will be called 'heterozygous'.
  - gff_exclude:
      type: file
      description: |
        GFF file listing positions that will be 'filtered' (i.e. ignored)
        (Optional)
      pattern: "*.gff"

output:
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - full_alignment:
      type: file
      description: A fasta file of all positions contained in the VCF files i.e. including ref calls
      pattern: "*.fasta.gz"
  - info_txt:
      type: file
      description: Information about the run
      pattern: "*.txt"
  - snp_alignment:
      type: file
      description: A fasta file of just SNP positions with samples only
      pattern: "*.fasta.gz"
  - snp_genome_alignment:
      type: file
      description: A fasta file of just SNP positions with reference genome
      pattern: "*.fasta.gz"
  - snpstatistics:
      type: file
      description: Some basic statistics about the SNP calls of each sample
      pattern: "*.tsv"
  - snptable:
      type: file
      description: Basic SNP table of combined positions taken from each VCF file
      pattern: "*.tsv"
  - snptable_snpeff:
      type: file
      description: Input file for SnpEff
      pattern: "*.tsv"
  - snptable_uncertainty:
      type: file
      description: Same as the basic SNP table, but with lower case characters indicating uncertain calls
      pattern: "*.tsv"
  - structure_genotypes:
      type: file
      description: Input file for STRUCTURE
      pattern: "*.tsv"
  - structure_genotypes_nomissing:
      type: file
      description: Alternate input file for STRUCTURE
      pattern: "*.tsv"
  - json:
      type: file
      description: Summary statistics in MultiQC JSON format
      pattern: "*.json"

authors:
  - "@jfy133"
|
|
@ -1499,6 +1499,10 @@ multiqc:
|
||||||
- modules/multiqc/**
|
- modules/multiqc/**
|
||||||
- tests/modules/multiqc/**
|
- tests/modules/multiqc/**
|
||||||
|
|
||||||
|
multivcfanalyzer:
|
||||||
|
- modules/multivcfanalyzer/**
|
||||||
|
- tests/modules/multivcfanalyzer/**
|
||||||
|
|
||||||
mummer:
|
mummer:
|
||||||
- modules/mummer/**
|
- modules/mummer/**
|
||||||
- tests/modules/mummer/**
|
- tests/modules/mummer/**
|
||||||
|
|
41
tests/modules/multivcfanalyzer/main.nf
Normal file
41
tests/modules/multivcfanalyzer/main.nf
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
#!/usr/bin/env nextflow
|
||||||
|
|
||||||
|
nextflow.enable.dsl = 2
|
||||||
|
|
||||||
|
include { GATK_UNIFIEDGENOTYPER } from '../../../modules/gatk/unifiedgenotyper/main.nf'
|
||||||
|
include { GUNZIP } from '../../../modules/gunzip/main.nf'
|
||||||
|
include { MULTIVCFANALYZER } from '../../../modules/multivcfanalyzer/main.nf'
|
||||||
|
|
||||||
|
// End-to-end test: genotype two SARS-CoV-2 BAMs with GATK UnifiedGenotyper,
// decompress the resulting VCFs, and feed them all to MULTIVCFANALYZER in one call.
workflow test_multivcfanalyzer {

    // Each element is [ meta map, sorted BAM, BAM index ].
    bam_inputs = Channel.of(
        [
            [ id:'test' ],
            file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true),
            file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true),
        ],
        [
            [ id:'test2' ],
            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true),
            file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true),
        ],
    )

    // Reference genome and its companion index/dictionary files.
    fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
    fai   = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true)
    dict  = file(params.test_data['sarscov2']['genome']['genome_dict'], checkIfExists: true)

    // Trailing empty lists are presumably the module's optional inputs - verify
    // against gatk/unifiedgenotyper if its signature changes.
    GATK_UNIFIEDGENOTYPER ( bam_inputs, fasta, fai, dict, [], [], [], [])

    GUNZIP ( GATK_UNIFIEDGENOTYPER.out.vcf )

    // Keep only the second element of each emitted item, then gather every
    // sample into a single list so the analyzer runs once over all of them.
    all_vcfs = GUNZIP.out.gunzip
        .map { item -> item[1] }
        .collect()
        .dump()

    // Settings for the MULTIVCFANALYZER call; the empty lists are passed for
    // the three optional file inputs.
    snpeff_results    = []
    gff               = []
    allele_freqs      = true
    genotype_quality  = 30
    coverage          = 5
    homozygous_freq   = 0.8
    heterozygous_freq = 0.2
    gff_exclude       = []

    MULTIVCFANALYZER (
        all_vcfs,
        fasta,
        snpeff_results,
        gff,
        allele_freqs,
        genotype_quality,
        coverage,
        homozygous_freq,
        heterozygous_freq,
        gff_exclude
    )
}
|
5
tests/modules/multivcfanalyzer/nextflow.config
Normal file
5
tests/modules/multivcfanalyzer/nextflow.config
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
process {

    // Publish every process's outputs under <outdir>/<tool>, where <tool> is
    // the lower-cased first '_'-separated token of the process name with any
    // ':'-separated workflow prefix removed.
    publishDir = {
        def unqualified = task.process.tokenize(':')[-1]
        def tool_dir    = unqualified.tokenize('_')[0].toLowerCase()
        "${params.outdir}/${tool_dir}"
    }

}
|
31
tests/modules/multivcfanalyzer/test.yml
Normal file
31
tests/modules/multivcfanalyzer/test.yml
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
# pytest-workflow spec for the multivcfanalyzer module test.
# Two outputs are pinned by md5sum; the others are checked for expected
# substrings only.
- name: multivcfanalyzer test_multivcfanalyzer
  command: nextflow run ./tests/modules/multivcfanalyzer -entry test_multivcfanalyzer -c ./tests/config/nextflow.config -c ./tests/modules/multivcfanalyzer/nextflow.config
  tags:
    - multivcfanalyzer
  files:
    - path: output/multivcfanalyzer/MultiVCFAnalyzer.json
      md5sum: c841c9f04c6114911f308ea09a08980e
    - path: output/multivcfanalyzer/fullAlignment.fasta.gz
      contains:
        - ">Reference_MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome"
    - path: output/multivcfanalyzer/info.txt
      contains:
        - "Run finished"
    - path: output/multivcfanalyzer/snpAlignment.fasta.gz
      contains:
        - "test.vcf"
    - path: output/multivcfanalyzer/snpAlignmentIncludingRefGenome.fasta.gz
      contains:
        - ">Reference_MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome"
    - path: output/multivcfanalyzer/snpStatistics.tsv
      contains:
        - "statistics"
        - "test.vcf"
        - "test2.vcf"
    - path: output/multivcfanalyzer/snpTable.tsv
      contains:
        - "Position"
        - "test.vcf"
        - "test2.vcf"
    - path: output/multivcfanalyzer/snpTableForSnpEff.tsv
      md5sum: 8d7ab4ec98a89d290e301d6feae461aa
    - path: output/multivcfanalyzer/snpTableWithUncertaintyCalls.tsv
      contains:
        - "Position"
        - "test.vcf"
        - "test2.vcf"
    - path: output/multivcfanalyzer/structureGenotypes.tsv
      contains:
        - "test.vcf"
        - "test2.vcf"
    - path: output/multivcfanalyzer/structureGenotypes_noMissingData-Columns.tsv
      contains:
        - "test.vcf"
        - "test2.vcf"
|
Loading…
Reference in a new issue