From aed45dd76675cd30067da559f38f54f087ad9dcf Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Tue, 5 Jul 2022 09:47:10 +0200 Subject: [PATCH] Add MultiVCFAnalyzer (#1845) * Add MultiVCFAnalyzer * Fix versions * Fix tests due to md5sum var * Apply suggestions from code review * Linting * Apply suggestions from code review Co-authored-by: Robert A. Petit III Co-authored-by: Robert A. Petit III --- modules/multivcfanalyzer/main.nf | 72 +++++++++++ modules/multivcfanalyzer/meta.yml | 122 ++++++++++++++++++ tests/config/pytest_modules.yml | 4 + tests/modules/multivcfanalyzer/main.nf | 41 ++++++ .../modules/multivcfanalyzer/nextflow.config | 5 + tests/modules/multivcfanalyzer/test.yml | 31 +++++ 6 files changed, 275 insertions(+) create mode 100644 modules/multivcfanalyzer/main.nf create mode 100644 modules/multivcfanalyzer/meta.yml create mode 100644 tests/modules/multivcfanalyzer/main.nf create mode 100644 tests/modules/multivcfanalyzer/nextflow.config create mode 100644 tests/modules/multivcfanalyzer/test.yml diff --git a/modules/multivcfanalyzer/main.nf b/modules/multivcfanalyzer/main.nf new file mode 100644 index 00000000..1958ce65 --- /dev/null +++ b/modules/multivcfanalyzer/main.nf @@ -0,0 +1,72 @@ +process MULTIVCFANALYZER { + tag '$fasta' + label 'process_medium' + + conda (params.enable_conda ? "bioconda::multivcfanalyzer=0.85.2" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multivcfanalyzer:0.85.2--hdfd78af_1': + 'quay.io/biocontainers/multivcfanalyzer:0.85.2--hdfd78af_1' }" + + input: + path vcfs + path fasta + path snpeff_results + path gff + val allele_freqs + val genotype_quality + val coverage + val homozygous_freq + val heterozygous_freq + path gff_exclude + + + output: + path('fullAlignment.fasta.gz') , emit: full_alignment + path('info.txt') , emit: info_txt + path('snpAlignment.fasta.gz') , emit: snp_alignment + path('snpAlignmentIncludingRefGenome.fasta.gz') , emit: snp_genome_alignment + path('snpStatistics.tsv') , emit: snpstatistics + path('snpTable.tsv') , emit: snptable + path('snpTableForSnpEff.tsv') , emit: snptable_snpeff + path('snpTableWithUncertaintyCalls.tsv') , emit: snptable_uncertainty + path('structureGenotypes.tsv') , emit: structure_genotypes + path('structureGenotypes_noMissingData-Columns.tsv') , emit: structure_genotypes_nomissing + path('MultiVCFAnalyzer.json') , emit: json + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // def args = task.ext.args ?: '' // MultiVCFAnalyzer has strict and input ordering and all are mandatory. Deactivating $args to prevent breakage of input + def args2 = task.ext.args2 ?: '' + + def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA" + def cmd_gff = gff ? "${gff}" : "NA" + def cmd_allele_freqs = allele_freqs ? "T" : "F" + def cmd_gff_exclude = gff_exclude ? "${gff}" : "NA" + + """ + multivcfanalyzer \\ + ${cmd_snpeff_results} \\ + ${fasta} \\ + ${cmd_gff} \\ + . \ + ${cmd_allele_freqs} \\ + ${genotype_quality} \\ + ${coverage} \\ + ${homozygous_freq} \\ + ${heterozygous_freq} \\ + ${cmd_gff_exclude} \\ + ${vcfs.join(" ")} + + gzip \\ + $args2 \\ + fullAlignment.fasta snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' ) + END_VERSIONS + """ +} diff --git a/modules/multivcfanalyzer/meta.yml b/modules/multivcfanalyzer/meta.yml new file mode 100644 index 00000000..76a63901 --- /dev/null +++ b/modules/multivcfanalyzer/meta.yml @@ -0,0 +1,122 @@ +name: "multivcfanalyzer" +description: SNP table generator from GATK UnifiedGenotyper with functionality geared for aDNA +keywords: + - vcf + - ancient DNA + - aDNA + - SNP + - GATK UnifiedGenotyper + - SNP table +tools: + - "multivcfanalyzer": + description: "MultiVCFAnalyzer is a VCF file post-processing tool tailored for aDNA. License on Github repository." + homepage: "https://github.com/alexherbig/MultiVCFAnalyzer" + documentation: "https://github.com/alexherbig/MultiVCFAnalyzer" + tool_dev_url: "https://github.com/alexherbig/MultiVCFAnalyzer" + doi: "10.1038/nature13591" + licence: "['GPL >=3']" + +input: + - vcfs: + type: file + description: One or a list of uncompressed VCF file + pattern: "*.vcf" + - fasta: + type: file + description: Reference genome VCF was generated against + pattern: "*.{fasta,fna,fa}" + - snpeff_results: + type: file + description: Results from snpEff in txt format (Optional) + pattern: "*.txt" + - gff: + type: file + description: GFF file corresponding to reference genome fasta (Optional) + pattern: "*.gff" + - allele_freqs: + type: boolean + description: | + Whether to include the percentage of reads a given allele is + present in in the SNP table. + - genotype_quality: + type: integer + description: | + Minimum GATK genotyping threshold threshold of which a SNP call + falling under is 'discarded' + - coverage: + type: integer + description: | + Minimum number of a reads that a position must be covered by to be + reported + - homozygous_freq: + type: number + description: Fraction of reads a base must have to be called 'homozygous' + - heterozygous_freq: + type: mumber + description: | + Fraction of which whereby if a call falls above this value, and lower + than the homozygous threshold, a base will be called 'heterozygous'. + - gff_exclude: + type: file + description: | + file listing positions that will be 'filtered' (i.e. ignored) + (Optional) + pattern: "*.vcf" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + + - full_alignment: + type: file + description: Fasta a fasta file of all positions contained in the VCF files i.e. including ref calls + pattern: ".fasta.gz" + - info_txt: + type: file + description: Information about the run + pattern: ".txt" + - snp_alignment: + type: file + description: A fasta file of just SNP positions with samples only + pattern: ".fasta.gz" + - snp_genome_alignment: + type: file + description: A fasta file of just SNP positions with reference genome + pattern: ".fasta.gz" + - snpstatistics: + type: file + description: Some basic statistics about the SNP calls of each sample + pattern: ".tsv" + - snptable: + type: file + description: Basic SNP table of combined positions taken from each VCF file + pattern: ".tsv" + - snptable_snpeff: + type: file + description: Input file for SnpEff + pattern: ".tsv" + - snptable_uncertainty: + type: file + description: Same as above, but with lower case characters indicating uncertain calls + pattern: ".tsv" + - structure_genotypes: + type: file + description: Input file for STRUCTURE + pattern: ".tsv" + - structure_genotypes_nomissing: + type: file + description: Alternate input file for STRUCTURE + pattern: ".tsv" + - json: + type: file + description: Summary statistics in MultiQC JSON format + pattern: ".json" + +authors: + - "@jfy133" diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml index d7953cd1..6a8eeb54 100644 --- a/tests/config/pytest_modules.yml +++ b/tests/config/pytest_modules.yml @@ -1499,6 +1499,10 @@ multiqc: - modules/multiqc/** - tests/modules/multiqc/** +multivcfanalyzer: + - modules/multivcfanalyzer/** + - tests/modules/multivcfanalyzer/** + mummer: - modules/mummer/** - tests/modules/mummer/** diff --git a/tests/modules/multivcfanalyzer/main.nf b/tests/modules/multivcfanalyzer/main.nf new file mode 100644 index 00000000..07e6831b --- /dev/null +++ b/tests/modules/multivcfanalyzer/main.nf @@ -0,0 +1,41 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +include { GATK_UNIFIEDGENOTYPER } from '../../../modules/gatk/unifiedgenotyper/main.nf' +include { GUNZIP } from '../../../modules/gunzip/main.nf' +include { MULTIVCFANALYZER } from '../../../modules/multivcfanalyzer/main.nf' + +workflow test_multivcfanalyzer { + + input = Channel.of([ [ id:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true), + ], + [ [ id:'test2' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true), + ], + ) + fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + fai = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + dict = file(params.test_data['sarscov2']['genome']['genome_dict'], checkIfExists: true) + + GATK_UNIFIEDGENOTYPER ( input, fasta, fai, dict, [], [], [], []) + + mva_vcf = GUNZIP ( GATK_UNIFIEDGENOTYPER.out.vcf ).gunzip + .map{it[1]} + .collect() + .dump() + + snpeff_results = [] + gff = [] + allele_freqs = true + genotype_quality = 30 + coverage = 5 + homozygous_freq = 0.8 + heterozygous_freq = 0.2 + gff_exclude = [] + + MULTIVCFANALYZER ( mva_vcf, fasta, snpeff_results, gff, allele_freqs, genotype_quality, coverage, homozygous_freq, heterozygous_freq, gff_exclude ) +} diff --git a/tests/modules/multivcfanalyzer/nextflow.config b/tests/modules/multivcfanalyzer/nextflow.config new file mode 100644 index 00000000..50f50a7a --- /dev/null +++ b/tests/modules/multivcfanalyzer/nextflow.config @@ -0,0 +1,5 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + +} \ No newline at end of file diff --git a/tests/modules/multivcfanalyzer/test.yml b/tests/modules/multivcfanalyzer/test.yml new file mode 100644 index 00000000..8db0136f --- /dev/null +++ b/tests/modules/multivcfanalyzer/test.yml @@ -0,0 +1,31 @@ +- name: multivcfanalyzer test_multivcfanalyzer + command: nextflow run ./tests/modules/multivcfanalyzer -entry test_multivcfanalyzer -c ./tests/config/nextflow.config -c ./tests/modules/multivcfanalyzer/nextflow.config + tags: + - multivcfanalyzer + files: + - path: output/multivcfanalyzer/MultiVCFAnalyzer.json + md5sum: c841c9f04c6114911f308ea09a08980e + - path: output/multivcfanalyzer/fullAlignment.fasta.gz + contains: + - ">Reference_MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome" + - path: output/multivcfanalyzer/info.txt + contains: + - "Run finished" + - path: output/multivcfanalyzer/snpAlignment.fasta.gz + contains: + - "test.vcf" + - path: output/multivcfanalyzer/snpAlignmentIncludingRefGenome.fasta.gz + contains: + - ">Reference_MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome" + - path: output/multivcfanalyzer/snpStatistics.tsv + contains: ["statistics", "test.vcf", "test2.vcf"] + - path: output/multivcfanalyzer/snpTable.tsv + contains: ["Position", "test.vcf", "test2.vcf"] + - path: output/multivcfanalyzer/snpTableForSnpEff.tsv + md5sum: 8d7ab4ec98a89d290e301d6feae461aa + - path: output/multivcfanalyzer/snpTableWithUncertaintyCalls.tsv + contains: ["Position", "test.vcf", "test2.vcf"] + - path: output/multivcfanalyzer/structureGenotypes.tsv + contains: ["test.vcf", "test2.vcf"] + - path: output/multivcfanalyzer/structureGenotypes_noMissingData-Columns.tsv + contains: ["test.vcf", "test2.vcf"]