From aed45dd76675cd30067da559f38f54f087ad9dcf Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates" <jfy133@gmail.com>
Date: Tue, 5 Jul 2022 09:47:10 +0200
Subject: [PATCH] Add MultiVCFAnalyzer (#1845)

* Add MultiVCFAnalyzer

* Fix versions

* Fix tests due to md5sum var

* Apply suggestions from code review

* Linting

* Apply suggestions from code review

Co-authored-by: Robert A. Petit III <robbie.petit@gmail.com>

Co-authored-by: Robert A. Petit III <robbie.petit@gmail.com>
---
 modules/multivcfanalyzer/main.nf              |  72 +++++++++++
 modules/multivcfanalyzer/meta.yml             | 122 ++++++++++++++++++
 tests/config/pytest_modules.yml               |   4 +
 tests/modules/multivcfanalyzer/main.nf        |  41 ++++++
 .../modules/multivcfanalyzer/nextflow.config  |   5 +
 tests/modules/multivcfanalyzer/test.yml       |  31 +++++
 6 files changed, 275 insertions(+)
 create mode 100644 modules/multivcfanalyzer/main.nf
 create mode 100644 modules/multivcfanalyzer/meta.yml
 create mode 100644 tests/modules/multivcfanalyzer/main.nf
 create mode 100644 tests/modules/multivcfanalyzer/nextflow.config
 create mode 100644 tests/modules/multivcfanalyzer/test.yml

diff --git a/modules/multivcfanalyzer/main.nf b/modules/multivcfanalyzer/main.nf
new file mode 100644
index 00000000..1958ce65
--- /dev/null
+++ b/modules/multivcfanalyzer/main.nf
@@ -0,0 +1,72 @@
+process MULTIVCFANALYZER {
+    tag '$fasta'
+    label 'process_medium'
+
+    conda (params.enable_conda ? "bioconda::multivcfanalyzer=0.85.2" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/multivcfanalyzer:0.85.2--hdfd78af_1':
+        'quay.io/biocontainers/multivcfanalyzer:0.85.2--hdfd78af_1' }"
+
+    input:
+    path vcfs
+    path fasta
+    path snpeff_results
+    path gff
+    val allele_freqs
+    val genotype_quality
+    val coverage
+    val homozygous_freq
+    val heterozygous_freq
+    path gff_exclude
+
+
+    output:
+    path('fullAlignment.fasta.gz')                       , emit: full_alignment
+    path('info.txt')                                     , emit: info_txt
+    path('snpAlignment.fasta.gz')                        , emit: snp_alignment
+    path('snpAlignmentIncludingRefGenome.fasta.gz')      , emit: snp_genome_alignment
+    path('snpStatistics.tsv')                            , emit: snpstatistics
+    path('snpTable.tsv')                                 , emit: snptable
+    path('snpTableForSnpEff.tsv')                        , emit: snptable_snpeff
+    path('snpTableWithUncertaintyCalls.tsv')             , emit: snptable_uncertainty
+    path('structureGenotypes.tsv')                       , emit: structure_genotypes
+    path('structureGenotypes_noMissingData-Columns.tsv') , emit: structure_genotypes_nomissing
+    path('MultiVCFAnalyzer.json')                        , emit: json
+    path "versions.yml"                                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // def args = task.ext.args ?: '' // MultiVCFAnalyzer has strict and input ordering and all are mandatory. Deactivating $args to prevent breakage of input
+    def args2 = task.ext.args2 ?: ''
+
+    def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA"
+    def cmd_gff            = gff ? "${gff}" : "NA"
+    def cmd_allele_freqs   = allele_freqs ? "T" : "F"
+    def cmd_gff_exclude    = gff_exclude ? "${gff}" : "NA"
+
+    """
+    multivcfanalyzer \\
+        ${cmd_snpeff_results} \\
+        ${fasta} \\
+        ${cmd_gff} \\
+        . \
+        ${cmd_allele_freqs}  \\
+        ${genotype_quality}  \\
+        ${coverage}  \\
+        ${homozygous_freq}  \\
+        ${heterozygous_freq}  \\
+        ${cmd_gff_exclude}  \\
+        ${vcfs.join(" ")}
+
+    gzip \\
+        $args2 \\
+        fullAlignment.fasta snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/multivcfanalyzer/meta.yml b/modules/multivcfanalyzer/meta.yml
new file mode 100644
index 00000000..76a63901
--- /dev/null
+++ b/modules/multivcfanalyzer/meta.yml
@@ -0,0 +1,122 @@
+name: "multivcfanalyzer"
+description: SNP table generator from GATK UnifiedGenotyper with functionality geared for aDNA
+keywords:
+  - vcf
+  - ancient DNA
+  - aDNA
+  - SNP
+  - GATK UnifiedGenotyper
+  - SNP table
+tools:
+  - "multivcfanalyzer":
+      description: "MultiVCFAnalyzer is a VCF file post-processing tool tailored for aDNA. License on Github repository."
+      homepage: "https://github.com/alexherbig/MultiVCFAnalyzer"
+      documentation: "https://github.com/alexherbig/MultiVCFAnalyzer"
+      tool_dev_url: "https://github.com/alexherbig/MultiVCFAnalyzer"
+      doi: "10.1038/nature13591"
+      licence: "['GPL >=3']"
+
+input:
+  - vcfs:
+      type: file
+      description: One or a list of uncompressed VCF file
+      pattern: "*.vcf"
+  - fasta:
+      type: file
+      description: Reference genome VCF was generated against
+      pattern: "*.{fasta,fna,fa}"
+  - snpeff_results:
+      type: file
+      description: Results from snpEff in txt format (Optional)
+      pattern: "*.txt"
+  - gff:
+      type: file
+      description: GFF file corresponding to reference genome fasta (Optional)
+      pattern: "*.gff"
+  - allele_freqs:
+      type: boolean
+      description: |
+        Whether to include the percentage of reads a given allele is
+        present in in the SNP table.
+  - genotype_quality:
+      type: integer
+      description: |
+        Minimum GATK genotyping threshold threshold of which a SNP call
+        falling under is 'discarded'
+  - coverage:
+      type: integer
+      description: |
+        Minimum number of a reads that a position must be covered by to be
+        reported
+  - homozygous_freq:
+      type: number
+      description: Fraction of reads a base must have to be called 'homozygous'
+  - heterozygous_freq:
+      type: mumber
+      description: |
+        Fraction of which whereby if a call falls above this value, and lower
+        than the homozygous threshold, a base will be called 'heterozygous'.
+  - gff_exclude:
+      type: file
+      description: |
+        file listing positions that will be 'filtered' (i.e. ignored)
+        (Optional)
+      pattern: "*.vcf"
+
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - bam:
+      type: file
+      description: Sorted BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+
+  - full_alignment:
+      type: file
+      description: Fasta a fasta file of all positions contained in the VCF files i.e. including ref calls
+      pattern: ".fasta.gz"
+  - info_txt:
+      type: file
+      description: Information about the run
+      pattern: ".txt"
+  - snp_alignment:
+      type: file
+      description: A fasta file of just SNP positions with samples only
+      pattern: ".fasta.gz"
+  - snp_genome_alignment:
+      type: file
+      description: A fasta file of just SNP positions with reference genome
+      pattern: ".fasta.gz"
+  - snpstatistics:
+      type: file
+      description: Some basic statistics about the SNP calls of each sample
+      pattern: ".tsv"
+  - snptable:
+      type: file
+      description: Basic SNP table of combined positions taken from each VCF file
+      pattern: ".tsv"
+  - snptable_snpeff:
+      type: file
+      description: Input file for SnpEff
+      pattern: ".tsv"
+  - snptable_uncertainty:
+      type: file
+      description: Same as above, but with lower case characters indicating uncertain calls
+      pattern: ".tsv"
+  - structure_genotypes:
+      type: file
+      description: Input file for STRUCTURE
+      pattern: ".tsv"
+  - structure_genotypes_nomissing:
+      type: file
+      description: Alternate input file for STRUCTURE
+      pattern: ".tsv"
+  - json:
+      type: file
+      description: Summary statistics in MultiQC JSON format
+      pattern: ".json"
+
+authors:
+  - "@jfy133"
diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml
index d7953cd1..6a8eeb54 100644
--- a/tests/config/pytest_modules.yml
+++ b/tests/config/pytest_modules.yml
@@ -1499,6 +1499,10 @@ multiqc:
   - modules/multiqc/**
   - tests/modules/multiqc/**
 
+multivcfanalyzer:
+  - modules/multivcfanalyzer/**
+  - tests/modules/multivcfanalyzer/**
+
 mummer:
   - modules/mummer/**
   - tests/modules/mummer/**
diff --git a/tests/modules/multivcfanalyzer/main.nf b/tests/modules/multivcfanalyzer/main.nf
new file mode 100644
index 00000000..07e6831b
--- /dev/null
+++ b/tests/modules/multivcfanalyzer/main.nf
@@ -0,0 +1,41 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { GATK_UNIFIEDGENOTYPER } from '../../../modules/gatk/unifiedgenotyper/main.nf'
+include { GUNZIP                } from '../../../modules/gunzip/main.nf'
+include { MULTIVCFANALYZER      } from '../../../modules/multivcfanalyzer/main.nf'
+
+workflow test_multivcfanalyzer {
+
+    input     = Channel.of([ [ id:'test' ], // meta map
+                  file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true),
+                  file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true),
+                ],
+                [ [ id:'test2' ], // meta map
+                  file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true),
+                  file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true),
+                ],
+                )
+    fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+    fai = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true)
+    dict = file(params.test_data['sarscov2']['genome']['genome_dict'], checkIfExists: true)
+
+    GATK_UNIFIEDGENOTYPER ( input, fasta, fai, dict, [], [], [], [])
+
+    mva_vcf = GUNZIP ( GATK_UNIFIEDGENOTYPER.out.vcf ).gunzip
+        .map{it[1]}
+        .collect()
+        .dump()
+
+    snpeff_results          = []
+    gff                     = []
+    allele_freqs            = true
+    genotype_quality        = 30
+    coverage                = 5
+    homozygous_freq         = 0.8
+    heterozygous_freq       = 0.2
+    gff_exclude             = []
+
+    MULTIVCFANALYZER ( mva_vcf, fasta, snpeff_results, gff, allele_freqs, genotype_quality, coverage, homozygous_freq, heterozygous_freq, gff_exclude )
+}
diff --git a/tests/modules/multivcfanalyzer/nextflow.config b/tests/modules/multivcfanalyzer/nextflow.config
new file mode 100644
index 00000000..50f50a7a
--- /dev/null
+++ b/tests/modules/multivcfanalyzer/nextflow.config
@@ -0,0 +1,5 @@
+process {
+
+    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
+    
+}
\ No newline at end of file
diff --git a/tests/modules/multivcfanalyzer/test.yml b/tests/modules/multivcfanalyzer/test.yml
new file mode 100644
index 00000000..8db0136f
--- /dev/null
+++ b/tests/modules/multivcfanalyzer/test.yml
@@ -0,0 +1,31 @@
+- name: multivcfanalyzer test_multivcfanalyzer
+  command: nextflow run ./tests/modules/multivcfanalyzer -entry test_multivcfanalyzer -c ./tests/config/nextflow.config  -c ./tests/modules/multivcfanalyzer/nextflow.config
+  tags:
+    - multivcfanalyzer
+  files:
+    - path: output/multivcfanalyzer/MultiVCFAnalyzer.json
+      md5sum: c841c9f04c6114911f308ea09a08980e
+    - path: output/multivcfanalyzer/fullAlignment.fasta.gz
+      contains:
+        - ">Reference_MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome"
+    - path: output/multivcfanalyzer/info.txt
+      contains:
+        - "Run finished"
+    - path: output/multivcfanalyzer/snpAlignment.fasta.gz
+      contains:
+        - "test.vcf"
+    - path: output/multivcfanalyzer/snpAlignmentIncludingRefGenome.fasta.gz
+      contains:
+        - ">Reference_MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome"
+    - path: output/multivcfanalyzer/snpStatistics.tsv
+      contains: ["statistics", "test.vcf", "test2.vcf"]
+    - path: output/multivcfanalyzer/snpTable.tsv
+      contains: ["Position", "test.vcf", "test2.vcf"]
+    - path: output/multivcfanalyzer/snpTableForSnpEff.tsv
+      md5sum: 8d7ab4ec98a89d290e301d6feae461aa
+    - path: output/multivcfanalyzer/snpTableWithUncertaintyCalls.tsv
+      contains: ["Position", "test.vcf", "test2.vcf"]
+    - path: output/multivcfanalyzer/structureGenotypes.tsv
+      contains: ["test.vcf", "test2.vcf"]
+    - path: output/multivcfanalyzer/structureGenotypes_noMissingData-Columns.tsv
+      contains: ["test.vcf", "test2.vcf"]