mirror of
https://github.com/MillironX/nf-core_modules.git
synced 2024-11-13 05:13:09 +00:00
Add MultiVCFAnalyzer (#1845)
* Add MultiVCFAnalyzer * Fix versions * Fix tests due to md5sum var * Apply suggestions from code review * Linting * Apply suggestions from code review Co-authored-by: Robert A. Petit III <robbie.petit@gmail.com> Co-authored-by: Robert A. Petit III <robbie.petit@gmail.com>
This commit is contained in:
parent
6702d2e145
commit
aed45dd766
6 changed files with 275 additions and 0 deletions
72
modules/multivcfanalyzer/main.nf
Normal file
72
modules/multivcfanalyzer/main.nf
Normal file
|
@ -0,0 +1,72 @@
|
|||
process MULTIVCFANALYZER {
|
||||
tag '$fasta'
|
||||
label 'process_medium'
|
||||
|
||||
conda (params.enable_conda ? "bioconda::multivcfanalyzer=0.85.2" : null)
|
||||
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
|
||||
'https://depot.galaxyproject.org/singularity/multivcfanalyzer:0.85.2--hdfd78af_1':
|
||||
'quay.io/biocontainers/multivcfanalyzer:0.85.2--hdfd78af_1' }"
|
||||
|
||||
input:
|
||||
path vcfs
|
||||
path fasta
|
||||
path snpeff_results
|
||||
path gff
|
||||
val allele_freqs
|
||||
val genotype_quality
|
||||
val coverage
|
||||
val homozygous_freq
|
||||
val heterozygous_freq
|
||||
path gff_exclude
|
||||
|
||||
|
||||
output:
|
||||
path('fullAlignment.fasta.gz') , emit: full_alignment
|
||||
path('info.txt') , emit: info_txt
|
||||
path('snpAlignment.fasta.gz') , emit: snp_alignment
|
||||
path('snpAlignmentIncludingRefGenome.fasta.gz') , emit: snp_genome_alignment
|
||||
path('snpStatistics.tsv') , emit: snpstatistics
|
||||
path('snpTable.tsv') , emit: snptable
|
||||
path('snpTableForSnpEff.tsv') , emit: snptable_snpeff
|
||||
path('snpTableWithUncertaintyCalls.tsv') , emit: snptable_uncertainty
|
||||
path('structureGenotypes.tsv') , emit: structure_genotypes
|
||||
path('structureGenotypes_noMissingData-Columns.tsv') , emit: structure_genotypes_nomissing
|
||||
path('MultiVCFAnalyzer.json') , emit: json
|
||||
path "versions.yml" , emit: versions
|
||||
|
||||
when:
|
||||
task.ext.when == null || task.ext.when
|
||||
|
||||
script:
|
||||
// def args = task.ext.args ?: '' // MultiVCFAnalyzer has strict and input ordering and all are mandatory. Deactivating $args to prevent breakage of input
|
||||
def args2 = task.ext.args2 ?: ''
|
||||
|
||||
def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA"
|
||||
def cmd_gff = gff ? "${gff}" : "NA"
|
||||
def cmd_allele_freqs = allele_freqs ? "T" : "F"
|
||||
def cmd_gff_exclude = gff_exclude ? "${gff}" : "NA"
|
||||
|
||||
"""
|
||||
multivcfanalyzer \\
|
||||
${cmd_snpeff_results} \\
|
||||
${fasta} \\
|
||||
${cmd_gff} \\
|
||||
. \
|
||||
${cmd_allele_freqs} \\
|
||||
${genotype_quality} \\
|
||||
${coverage} \\
|
||||
${homozygous_freq} \\
|
||||
${heterozygous_freq} \\
|
||||
${cmd_gff_exclude} \\
|
||||
${vcfs.join(" ")}
|
||||
|
||||
gzip \\
|
||||
$args2 \\
|
||||
fullAlignment.fasta snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta
|
||||
|
||||
cat <<-END_VERSIONS > versions.yml
|
||||
"${task.process}":
|
||||
multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' )
|
||||
END_VERSIONS
|
||||
"""
|
||||
}
|
122
modules/multivcfanalyzer/meta.yml
Normal file
122
modules/multivcfanalyzer/meta.yml
Normal file
|
@ -0,0 +1,122 @@
|
|||
name: "multivcfanalyzer"
|
||||
description: SNP table generator from GATK UnifiedGenotyper with functionality geared for aDNA
|
||||
keywords:
|
||||
- vcf
|
||||
- ancient DNA
|
||||
- aDNA
|
||||
- SNP
|
||||
- GATK UnifiedGenotyper
|
||||
- SNP table
|
||||
tools:
|
||||
- "multivcfanalyzer":
|
||||
description: "MultiVCFAnalyzer is a VCF file post-processing tool tailored for aDNA. License on Github repository."
|
||||
homepage: "https://github.com/alexherbig/MultiVCFAnalyzer"
|
||||
documentation: "https://github.com/alexherbig/MultiVCFAnalyzer"
|
||||
tool_dev_url: "https://github.com/alexherbig/MultiVCFAnalyzer"
|
||||
doi: "10.1038/nature13591"
|
||||
licence: "['GPL >=3']"
|
||||
|
||||
input:
|
||||
- vcfs:
|
||||
type: file
|
||||
description: One or a list of uncompressed VCF file
|
||||
pattern: "*.vcf"
|
||||
- fasta:
|
||||
type: file
|
||||
description: Reference genome VCF was generated against
|
||||
pattern: "*.{fasta,fna,fa}"
|
||||
- snpeff_results:
|
||||
type: file
|
||||
description: Results from snpEff in txt format (Optional)
|
||||
pattern: "*.txt"
|
||||
- gff:
|
||||
type: file
|
||||
description: GFF file corresponding to reference genome fasta (Optional)
|
||||
pattern: "*.gff"
|
||||
- allele_freqs:
|
||||
type: boolean
|
||||
description: |
|
||||
Whether to include the percentage of reads a given allele is
|
||||
present in in the SNP table.
|
||||
- genotype_quality:
|
||||
type: integer
|
||||
description: |
|
||||
Minimum GATK genotyping threshold threshold of which a SNP call
|
||||
falling under is 'discarded'
|
||||
- coverage:
|
||||
type: integer
|
||||
description: |
|
||||
Minimum number of a reads that a position must be covered by to be
|
||||
reported
|
||||
- homozygous_freq:
|
||||
type: number
|
||||
description: Fraction of reads a base must have to be called 'homozygous'
|
||||
- heterozygous_freq:
|
||||
type: mumber
|
||||
description: |
|
||||
Fraction of which whereby if a call falls above this value, and lower
|
||||
than the homozygous threshold, a base will be called 'heterozygous'.
|
||||
- gff_exclude:
|
||||
type: file
|
||||
description: |
|
||||
file listing positions that will be 'filtered' (i.e. ignored)
|
||||
(Optional)
|
||||
pattern: "*.vcf"
|
||||
|
||||
output:
|
||||
- versions:
|
||||
type: file
|
||||
description: File containing software versions
|
||||
pattern: "versions.yml"
|
||||
- bam:
|
||||
type: file
|
||||
description: Sorted BAM/CRAM/SAM file
|
||||
pattern: "*.{bam,cram,sam}"
|
||||
|
||||
- full_alignment:
|
||||
type: file
|
||||
description: Fasta a fasta file of all positions contained in the VCF files i.e. including ref calls
|
||||
pattern: ".fasta.gz"
|
||||
- info_txt:
|
||||
type: file
|
||||
description: Information about the run
|
||||
pattern: ".txt"
|
||||
- snp_alignment:
|
||||
type: file
|
||||
description: A fasta file of just SNP positions with samples only
|
||||
pattern: ".fasta.gz"
|
||||
- snp_genome_alignment:
|
||||
type: file
|
||||
description: A fasta file of just SNP positions with reference genome
|
||||
pattern: ".fasta.gz"
|
||||
- snpstatistics:
|
||||
type: file
|
||||
description: Some basic statistics about the SNP calls of each sample
|
||||
pattern: ".tsv"
|
||||
- snptable:
|
||||
type: file
|
||||
description: Basic SNP table of combined positions taken from each VCF file
|
||||
pattern: ".tsv"
|
||||
- snptable_snpeff:
|
||||
type: file
|
||||
description: Input file for SnpEff
|
||||
pattern: ".tsv"
|
||||
- snptable_uncertainty:
|
||||
type: file
|
||||
description: Same as above, but with lower case characters indicating uncertain calls
|
||||
pattern: ".tsv"
|
||||
- structure_genotypes:
|
||||
type: file
|
||||
description: Input file for STRUCTURE
|
||||
pattern: ".tsv"
|
||||
- structure_genotypes_nomissing:
|
||||
type: file
|
||||
description: Alternate input file for STRUCTURE
|
||||
pattern: ".tsv"
|
||||
- json:
|
||||
type: file
|
||||
description: Summary statistics in MultiQC JSON format
|
||||
pattern: ".json"
|
||||
|
||||
authors:
|
||||
- "@jfy133"
|
|
@ -1499,6 +1499,10 @@ multiqc:
|
|||
- modules/multiqc/**
|
||||
- tests/modules/multiqc/**
|
||||
|
||||
multivcfanalyzer:
|
||||
- modules/multivcfanalyzer/**
|
||||
- tests/modules/multivcfanalyzer/**
|
||||
|
||||
mummer:
|
||||
- modules/mummer/**
|
||||
- tests/modules/mummer/**
|
||||
|
|
41
tests/modules/multivcfanalyzer/main.nf
Normal file
41
tests/modules/multivcfanalyzer/main.nf
Normal file
|
@ -0,0 +1,41 @@
|
|||
#!/usr/bin/env nextflow
|
||||
|
||||
nextflow.enable.dsl = 2
|
||||
|
||||
include { GATK_UNIFIEDGENOTYPER } from '../../../modules/gatk/unifiedgenotyper/main.nf'
|
||||
include { GUNZIP } from '../../../modules/gunzip/main.nf'
|
||||
include { MULTIVCFANALYZER } from '../../../modules/multivcfanalyzer/main.nf'
|
||||
|
||||
workflow test_multivcfanalyzer {
|
||||
|
||||
input = Channel.of([ [ id:'test' ], // meta map
|
||||
file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true),
|
||||
file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true),
|
||||
],
|
||||
[ [ id:'test2' ], // meta map
|
||||
file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true),
|
||||
file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true),
|
||||
],
|
||||
)
|
||||
fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
|
||||
fai = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true)
|
||||
dict = file(params.test_data['sarscov2']['genome']['genome_dict'], checkIfExists: true)
|
||||
|
||||
GATK_UNIFIEDGENOTYPER ( input, fasta, fai, dict, [], [], [], [])
|
||||
|
||||
mva_vcf = GUNZIP ( GATK_UNIFIEDGENOTYPER.out.vcf ).gunzip
|
||||
.map{it[1]}
|
||||
.collect()
|
||||
.dump()
|
||||
|
||||
snpeff_results = []
|
||||
gff = []
|
||||
allele_freqs = true
|
||||
genotype_quality = 30
|
||||
coverage = 5
|
||||
homozygous_freq = 0.8
|
||||
heterozygous_freq = 0.2
|
||||
gff_exclude = []
|
||||
|
||||
MULTIVCFANALYZER ( mva_vcf, fasta, snpeff_results, gff, allele_freqs, genotype_quality, coverage, homozygous_freq, heterozygous_freq, gff_exclude )
|
||||
}
|
5
tests/modules/multivcfanalyzer/nextflow.config
Normal file
5
tests/modules/multivcfanalyzer/nextflow.config
Normal file
|
@ -0,0 +1,5 @@
|
|||
process {
|
||||
|
||||
publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
|
||||
|
||||
}
|
31
tests/modules/multivcfanalyzer/test.yml
Normal file
31
tests/modules/multivcfanalyzer/test.yml
Normal file
|
@ -0,0 +1,31 @@
|
|||
- name: multivcfanalyzer test_multivcfanalyzer
|
||||
command: nextflow run ./tests/modules/multivcfanalyzer -entry test_multivcfanalyzer -c ./tests/config/nextflow.config -c ./tests/modules/multivcfanalyzer/nextflow.config
|
||||
tags:
|
||||
- multivcfanalyzer
|
||||
files:
|
||||
- path: output/multivcfanalyzer/MultiVCFAnalyzer.json
|
||||
md5sum: c841c9f04c6114911f308ea09a08980e
|
||||
- path: output/multivcfanalyzer/fullAlignment.fasta.gz
|
||||
contains:
|
||||
- ">Reference_MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome"
|
||||
- path: output/multivcfanalyzer/info.txt
|
||||
contains:
|
||||
- "Run finished"
|
||||
- path: output/multivcfanalyzer/snpAlignment.fasta.gz
|
||||
contains:
|
||||
- "test.vcf"
|
||||
- path: output/multivcfanalyzer/snpAlignmentIncludingRefGenome.fasta.gz
|
||||
contains:
|
||||
- ">Reference_MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome"
|
||||
- path: output/multivcfanalyzer/snpStatistics.tsv
|
||||
contains: ["statistics", "test.vcf", "test2.vcf"]
|
||||
- path: output/multivcfanalyzer/snpTable.tsv
|
||||
contains: ["Position", "test.vcf", "test2.vcf"]
|
||||
- path: output/multivcfanalyzer/snpTableForSnpEff.tsv
|
||||
md5sum: 8d7ab4ec98a89d290e301d6feae461aa
|
||||
- path: output/multivcfanalyzer/snpTableWithUncertaintyCalls.tsv
|
||||
contains: ["Position", "test.vcf", "test2.vcf"]
|
||||
- path: output/multivcfanalyzer/structureGenotypes.tsv
|
||||
contains: ["test.vcf", "test2.vcf"]
|
||||
- path: output/multivcfanalyzer/structureGenotypes_noMissingData-Columns.tsv
|
||||
contains: ["test.vcf", "test2.vcf"]
|
Loading…
Reference in a new issue