Add MultiVCFAnalyzer (#1845)

* Add MultiVCFAnalyzer * Fix versions * Fix tests due to md5sum var * Apply suggestions from code review * Linting * Apply suggestions from code review Co-authored-by: Robert A. Petit III <robbie.petit@gmail.com> Co-authored-by: Robert A. Petit III <robbie.petit@gmail.com>
2025-01-09 15:21:13 -05:00 · 2022-07-05 09:47:10 +02:00 · 2022-07-05 09:47:10 +02:00 · aed45dd766
commit aed45dd766
parent 6702d2e145
6 changed files with 275 additions and 0 deletions
--- a/modules/multivcfanalyzer/main.nf
+++ b/modules/multivcfanalyzer/main.nf
@ -0,0 +1,72 @@
 process MULTIVCFANALYZER {
    tag '$fasta'
    label 'process_medium'
    conda (params.enable_conda ? "bioconda::multivcfanalyzer=0.85.2" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/multivcfanalyzer:0.85.2--hdfd78af_1':
        'quay.io/biocontainers/multivcfanalyzer:0.85.2--hdfd78af_1' }"
    input:
    path vcfs
    path fasta
    path snpeff_results
    path gff
    val allele_freqs
    val genotype_quality
    val coverage
    val homozygous_freq
    val heterozygous_freq
    path gff_exclude
    output:
    path('fullAlignment.fasta.gz')                       , emit: full_alignment
    path('info.txt')                                     , emit: info_txt
    path('snpAlignment.fasta.gz')                        , emit: snp_alignment
    path('snpAlignmentIncludingRefGenome.fasta.gz')      , emit: snp_genome_alignment
    path('snpStatistics.tsv')                            , emit: snpstatistics
    path('snpTable.tsv')                                 , emit: snptable
    path('snpTableForSnpEff.tsv')                        , emit: snptable_snpeff
    path('snpTableWithUncertaintyCalls.tsv')             , emit: snptable_uncertainty
    path('structureGenotypes.tsv')                       , emit: structure_genotypes
    path('structureGenotypes_noMissingData-Columns.tsv') , emit: structure_genotypes_nomissing
    path('MultiVCFAnalyzer.json')                        , emit: json
    path "versions.yml"                                  , emit: versions
    when:
    task.ext.when == null || task.ext.when
    script:
    // def args = task.ext.args ?: '' // MultiVCFAnalyzer has strict and input ordering and all are mandatory. Deactivating $args to prevent breakage of input
    def args2 = task.ext.args2 ?: ''
    def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA"
    def cmd_gff            = gff ? "${gff}" : "NA"
    def cmd_allele_freqs   = allele_freqs ? "T" : "F"
    def cmd_gff_exclude    = gff_exclude ? "${gff}" : "NA"
    """
    multivcfanalyzer \\
        ${cmd_snpeff_results} \\
        ${fasta} \\
        ${cmd_gff} \\
        . \
        ${cmd_allele_freqs}  \\
        ${genotype_quality}  \\
        ${coverage}  \\
        ${homozygous_freq}  \\
        ${heterozygous_freq}  \\
        ${cmd_gff_exclude}  \\
        ${vcfs.join(" ")}
    gzip \\
        $args2 \\
        fullAlignment.fasta snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' )
    END_VERSIONS
    """
 }
--- a/modules/multivcfanalyzer/meta.yml
+++ b/modules/multivcfanalyzer/meta.yml
@ -0,0 +1,122 @@
 name: "multivcfanalyzer"
 description: SNP table generator from GATK UnifiedGenotyper with functionality geared for aDNA
 keywords:
  - vcf
  - ancient DNA
  - aDNA
  - SNP
  - GATK UnifiedGenotyper
  - SNP table
 tools:
  - "multivcfanalyzer":
      description: "MultiVCFAnalyzer is a VCF file post-processing tool tailored for aDNA. License on Github repository."
      homepage: "https://github.com/alexherbig/MultiVCFAnalyzer"
      documentation: "https://github.com/alexherbig/MultiVCFAnalyzer"
      tool_dev_url: "https://github.com/alexherbig/MultiVCFAnalyzer"
      doi: "10.1038/nature13591"
      licence: "['GPL >=3']"
 input:
  - vcfs:
      type: file
      description: One or a list of uncompressed VCF file
      pattern: "*.vcf"
  - fasta:
      type: file
      description: Reference genome VCF was generated against
      pattern: "*.{fasta,fna,fa}"
  - snpeff_results:
      type: file
      description: Results from snpEff in txt format (Optional)
      pattern: "*.txt"
  - gff:
      type: file
      description: GFF file corresponding to reference genome fasta (Optional)
      pattern: "*.gff"
  - allele_freqs:
      type: boolean
      description: |
        Whether to include the percentage of reads a given allele is
        present in in the SNP table.
  - genotype_quality:
      type: integer
      description: |
        Minimum GATK genotyping threshold threshold of which a SNP call
        falling under is 'discarded'
  - coverage:
      type: integer
      description: |
        Minimum number of a reads that a position must be covered by to be
        reported
  - homozygous_freq:
      type: number
      description: Fraction of reads a base must have to be called 'homozygous'
  - heterozygous_freq:
      type: mumber
      description: |
        Fraction of which whereby if a call falls above this value, and lower
        than the homozygous threshold, a base will be called 'heterozygous'.
  - gff_exclude:
      type: file
      description: |
        file listing positions that will be 'filtered' (i.e. ignored)
        (Optional)
      pattern: "*.vcf"
 output:
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - bam:
      type: file
      description: Sorted BAM/CRAM/SAM file
      pattern: "*.{bam,cram,sam}"
  - full_alignment:
      type: file
      description: Fasta a fasta file of all positions contained in the VCF files i.e. including ref calls
      pattern: ".fasta.gz"
  - info_txt:
      type: file
      description: Information about the run
      pattern: ".txt"
  - snp_alignment:
      type: file
      description: A fasta file of just SNP positions with samples only
      pattern: ".fasta.gz"
  - snp_genome_alignment:
      type: file
      description: A fasta file of just SNP positions with reference genome
      pattern: ".fasta.gz"
  - snpstatistics:
      type: file
      description: Some basic statistics about the SNP calls of each sample
      pattern: ".tsv"
  - snptable:
      type: file
      description: Basic SNP table of combined positions taken from each VCF file
      pattern: ".tsv"
  - snptable_snpeff:
      type: file
      description: Input file for SnpEff
      pattern: ".tsv"
  - snptable_uncertainty:
      type: file
      description: Same as above, but with lower case characters indicating uncertain calls
      pattern: ".tsv"
  - structure_genotypes:
      type: file
      description: Input file for STRUCTURE
      pattern: ".tsv"
  - structure_genotypes_nomissing:
      type: file
      description: Alternate input file for STRUCTURE
      pattern: ".tsv"
  - json:
      type: file
      description: Summary statistics in MultiQC JSON format
      pattern: ".json"
 authors:
  - "@jfy133"
--- a/tests/config/pytest_modules.yml
+++ b/tests/config/pytest_modules.yml
@ -1499,6 +1499,10 @@ multiqc:
  - modules/multiqc/**
  - tests/modules/multiqc/**
 multivcfanalyzer:
  - modules/multivcfanalyzer/**
  - tests/modules/multivcfanalyzer/**
 mummer:
  - modules/mummer/**
  - tests/modules/mummer/**
--- a/tests/modules/multivcfanalyzer/main.nf
+++ b/tests/modules/multivcfanalyzer/main.nf
@ -0,0 +1,41 @@
 #!/usr/bin/env nextflow
 nextflow.enable.dsl = 2
 include { GATK_UNIFIEDGENOTYPER } from '../../../modules/gatk/unifiedgenotyper/main.nf'
 include { GUNZIP                } from '../../../modules/gunzip/main.nf'
 include { MULTIVCFANALYZER      } from '../../../modules/multivcfanalyzer/main.nf'
 workflow test_multivcfanalyzer {
    input     = Channel.of([ [ id:'test' ], // meta map
                  file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true),
                  file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true),
                ],
                [ [ id:'test2' ], // meta map
                  file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true),
                  file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true),
                ],
                )
    fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
    fai = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true)
    dict = file(params.test_data['sarscov2']['genome']['genome_dict'], checkIfExists: true)
    GATK_UNIFIEDGENOTYPER ( input, fasta, fai, dict, [], [], [], [])
    mva_vcf = GUNZIP ( GATK_UNIFIEDGENOTYPER.out.vcf ).gunzip
        .map{it[1]}
        .collect()
        .dump()
    snpeff_results          = []
    gff                     = []
    allele_freqs            = true
    genotype_quality        = 30
    coverage                = 5
    homozygous_freq         = 0.8
    heterozygous_freq       = 0.2
    gff_exclude             = []
    MULTIVCFANALYZER ( mva_vcf, fasta, snpeff_results, gff, allele_freqs, genotype_quality, coverage, homozygous_freq, heterozygous_freq, gff_exclude )
 }
--- a/tests/modules/multivcfanalyzer/nextflow.config
+++ b/tests/modules/multivcfanalyzer/nextflow.config
@ -0,0 +1,5 @@
 process {
    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
 }
--- a/tests/modules/multivcfanalyzer/test.yml
+++ b/tests/modules/multivcfanalyzer/test.yml
@ -0,0 +1,31 @@
 - name: multivcfanalyzer test_multivcfanalyzer
  command: nextflow run ./tests/modules/multivcfanalyzer -entry test_multivcfanalyzer -c ./tests/config/nextflow.config  -c ./tests/modules/multivcfanalyzer/nextflow.config
  tags:
    - multivcfanalyzer
  files:
    - path: output/multivcfanalyzer/MultiVCFAnalyzer.json
      md5sum: c841c9f04c6114911f308ea09a08980e
    - path: output/multivcfanalyzer/fullAlignment.fasta.gz
      contains:
        - ">Reference_MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome"
    - path: output/multivcfanalyzer/info.txt
      contains:
        - "Run finished"
    - path: output/multivcfanalyzer/snpAlignment.fasta.gz
      contains:
        - "test.vcf"
    - path: output/multivcfanalyzer/snpAlignmentIncludingRefGenome.fasta.gz
      contains:
        - ">Reference_MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome"
    - path: output/multivcfanalyzer/snpStatistics.tsv
      contains: ["statistics", "test.vcf", "test2.vcf"]
    - path: output/multivcfanalyzer/snpTable.tsv
      contains: ["Position", "test.vcf", "test2.vcf"]
    - path: output/multivcfanalyzer/snpTableForSnpEff.tsv
      md5sum: 8d7ab4ec98a89d290e301d6feae461aa
    - path: output/multivcfanalyzer/snpTableWithUncertaintyCalls.tsv
      contains: ["Position", "test.vcf", "test2.vcf"]
    - path: output/multivcfanalyzer/structureGenotypes.tsv
      contains: ["test.vcf", "test2.vcf"]
    - path: output/multivcfanalyzer/structureGenotypes_noMissingData-Columns.tsv
      contains: ["test.vcf", "test2.vcf"]