add vcftools module (#334)

* add vcftools module

* fix padding issue

* fix linting errors
This commit is contained in:
Mark-S-Hill 2021-03-24 04:54:23 +00:00 committed by GitHub
parent fe8a783cef
commit 399b58043d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 569 additions and 0 deletions

View file

@ -0,0 +1,61 @@
/*
* -----------------------------------------------------
* Utility functions used in nf-core DSL2 module files
* -----------------------------------------------------
*/
/*
 * Derive the software tool name from a process name such as $task.process.
 * The last ':'-separated scope is taken, then everything before its first '_',
 * and the result is lower-cased (e.g. 'WF:SUB:VCFTOOLS_BASE' -> 'vcftools').
 */
def getSoftwareName(task_process) {
    def scopes       = task_process.tokenize(':')
    def process_name = scopes[-1]
    def name_parts   = process_name.tokenize('_')
    return name_parts[0].toLowerCase()
}
/*
 * Build the Groovy Map of options understood by nf-core modules.
 * Every key is always present in the result; keys missing (or falsy) in `args`
 * fall back to an empty string, or to false for publish_by_id.
 * publish_files is copied through untouched, so it stays null when not supplied.
 */
def initOptions(Map args) {
    return [
        args          : args.args ?: '',
        args2         : args.args2 ?: '',
        args3         : args.args3 ?: '',
        publish_by_id : args.publish_by_id ?: false,
        publish_dir   : args.publish_dir ?: '',
        publish_files : args.publish_files,
        suffix        : args.suffix ?: ''
    ]
}
/*
 * Tidy up and join elements of a list to return a path string.
 *
 * Entries that are null, empty or whitespace-only are dropped. The remaining
 * entries are trimmed of surrounding whitespace and of leading/trailing slashes,
 * then joined with '/'.
 */
def getPathFromList(path_list) {
    // Groovy truth treats both null and '' as false, so a null entry (e.g. a missing
    // publish_id) is skipped here instead of raising a NullPointerException
    def paths = path_list.findAll { item -> item?.trim() }
    // Trim whitespace, then strip leading and trailing slashes
    paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") }
    return paths.join('/')
}
/*
 * Work out where a module result file should be published.
 *
 * Expects a Map with the keys filename, options, publish_dir and publish_id.
 * Returns the relative publish path for the file, or null when the file should
 * not be published:
 *   - files ending in '.version.txt' are never published
 *   - when options.publish_files is null, every other file is published
 *   - when options.publish_files is a Map of [suffix: sub-directory], only files whose
 *     name ends with one of the suffixes are published, under the first matching sub-directory
 *   - any other publish_files value publishes nothing
 */
def saveFiles(Map args) {
    if (args.filename.endsWith('.version.txt')) {
        return null
    }

    def opts = initOptions(args.options)

    // Base directory: the module option wins over the caller-supplied default
    def base_dirs = [ opts.publish_dir ?: args.publish_dir ]
    if (opts.publish_by_id) {
        base_dirs << args.publish_id
    }

    def rules = opts.publish_files
    if (rules == null) {
        return "${getPathFromList(base_dirs)}/$args.filename"
    }
    if (!(rules instanceof Map)) {
        return null
    }

    // First suffix rule (in map order) that the filename ends with
    def matched = rules.find { rule -> args.filename.endsWith(rule.key) }
    if (matched == null) {
        return null
    }
    def target_dirs = base_dirs + [ matched.value ]
    return "${getPathFromList(target_dirs)}/$args.filename"
}

129
software/vcftools/main.nf Normal file
View file

@ -0,0 +1,129 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName } from './functions'
// Default to an empty option map; a workflow including this module can supply its own
// (e.g. include { VCFTOOLS } from '...' addParams( options: [ args: '...' ] ))
params.options = [:]
// Normalise the supplied options so that every expected key is present with a default value
options = initOptions(params.options)
/*
 * Run vcftools on a single variant file.
 *
 * The analysis to perform is selected through options.args. Flags that need a file
 * as their value (--bed, --exclude-bed, --hapcount, --diff, --gzdiff, --diff-bcf) are
 * given in options.args WITHOUT a value; the process fills in the staged `bed` or
 * `diff_variant_file` input for them. Because the set of output files depends on the
 * flags used, every output channel except `version` is optional.
 */
process VCFTOOLS {
    tag "$meta.id"
    label 'process_medium'
    publishDir "${params.outdir}",
        mode: params.publish_dir_mode,
        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

    conda (params.enable_conda ? "bioconda::vcftools=0.1.16" : null)
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container "https://depot.galaxyproject.org/singularity/vcftools:0.1.16--he513fc3_4"
    } else {
        container "quay.io/biocontainers/vcftools:0.1.16--he513fc3_4"
    }

    input:
    // Owing to the nature of vcftools we here provide solutions to working with optional bed files and optional
    // alternative variant files, for use with the 'diff' suite of tools.
    // Other optional input files can be utilised in a similar way to below but we do not exhaustively iterate through all
    // possible options. Instead we leave that to the user.
    tuple val(meta), path(variant_file)
    path(bed)
    path(diff_variant_file)

    output:
    path("*.version.txt"), emit: version
    tuple val(meta), path("*.vcf"), optional:true, emit: vcf
    tuple val(meta), path("*.bcf"), optional:true, emit: bcf
    tuple val(meta), path("*.frq"), optional:true, emit: frq
    tuple val(meta), path("*.frq.count"), optional:true, emit: frq_count
    tuple val(meta), path("*.idepth"), optional:true, emit: idepth
    tuple val(meta), path("*.ldepth"), optional:true, emit: ldepth
    tuple val(meta), path("*.ldepth.mean"), optional:true, emit: ldepth_mean
    tuple val(meta), path("*.gdepth"), optional:true, emit: gdepth
    tuple val(meta), path("*.hap.ld"), optional:true, emit: hap_ld
    tuple val(meta), path("*.geno.ld"), optional:true, emit: geno_ld
    tuple val(meta), path("*.geno.chisq"), optional:true, emit: geno_chisq
    tuple val(meta), path("*.list.hap.ld"), optional:true, emit: list_hap_ld
    tuple val(meta), path("*.list.geno.ld"), optional:true, emit: list_geno_ld
    tuple val(meta), path("*.interchrom.hap.ld"), optional:true, emit: interchrom_hap_ld
    tuple val(meta), path("*.interchrom.geno.ld"), optional:true, emit: interchrom_geno_ld
    tuple val(meta), path("*.TsTv"), optional:true, emit: tstv
    tuple val(meta), path("*.TsTv.summary"), optional:true, emit: tstv_summary
    tuple val(meta), path("*.TsTv.count"), optional:true, emit: tstv_count
    tuple val(meta), path("*.TsTv.qual"), optional:true, emit: tstv_qual
    tuple val(meta), path("*.FILTER.summary"), optional:true, emit: filter_summary
    tuple val(meta), path("*.sites.pi"), optional:true, emit: sites_pi
    tuple val(meta), path("*.windowed.pi"), optional:true, emit: windowed_pi
    tuple val(meta), path("*.weir.fst"), optional:true, emit: weir_fst
    tuple val(meta), path("*.het"), optional:true, emit: heterozygosity
    tuple val(meta), path("*.hwe"), optional:true, emit: hwe
    tuple val(meta), path("*.Tajima.D"), optional:true, emit: tajima_d
    tuple val(meta), path("*.ifreqburden"), optional:true, emit: freq_burden
    tuple val(meta), path("*.LROH"), optional:true, emit: lroh
    tuple val(meta), path("*.relatedness"), optional:true, emit: relatedness
    tuple val(meta), path("*.relatedness2"), optional:true, emit: relatedness2
    tuple val(meta), path("*.lqual"), optional:true, emit: lqual
    tuple val(meta), path("*.imiss"), optional:true, emit: missing_individual
    tuple val(meta), path("*.lmiss"), optional:true, emit: missing_site
    tuple val(meta), path("*.snpden"), optional:true, emit: snp_density
    tuple val(meta), path("*.kept.sites"), optional:true, emit: kept_sites
    tuple val(meta), path("*.removed.sites"), optional:true, emit: removed_sites
    // NOTE: the emit name 'singeltons' is misspelled but is kept as-is because callers reference it
    tuple val(meta), path("*.singletons"), optional:true, emit: singeltons
    tuple val(meta), path("*.indel.hist"), optional:true, emit: indel_hist
    tuple val(meta), path("*.hapcount"), optional:true, emit: hapcount
    tuple val(meta), path("*.mendel"), optional:true, emit: mendel
    tuple val(meta), path("*.FORMAT"), optional:true, emit: format
    tuple val(meta), path("*.INFO"), optional:true, emit: info
    tuple val(meta), path("*.012"), optional:true, emit: genotypes_matrix
    tuple val(meta), path("*.012.indv"), optional:true, emit: genotypes_matrix_individual
    tuple val(meta), path("*.012.pos"), optional:true, emit: genotypes_matrix_position
    tuple val(meta), path("*.impute.hap"), optional:true, emit: impute_hap
    tuple val(meta), path("*.impute.hap.legend"), optional:true, emit: impute_hap_legend
    tuple val(meta), path("*.impute.hap.indv"), optional:true, emit: impute_hap_indv
    tuple val(meta), path("*.ldhat.sites"), optional:true, emit: ldhat_sites
    tuple val(meta), path("*.ldhat.locs"), optional:true, emit: ldhat_locs
    tuple val(meta), path("*.BEAGLE.GL"), optional:true, emit: beagle_gl
    tuple val(meta), path("*.BEAGLE.PL"), optional:true, emit: beagle_pl
    tuple val(meta), path("*.ped"), optional:true, emit: ped
    tuple val(meta), path("*.map"), optional:true, emit: map_
    tuple val(meta), path("*.tped"), optional:true, emit: tped
    tuple val(meta), path("*.tfam"), optional:true, emit: tfam
    tuple val(meta), path("*.diff.sites_in_files"), optional:true, emit: diff_sites_in_files
    tuple val(meta), path("*.diff.indv_in_files"), optional:true, emit: diff_indv_in_files
    tuple val(meta), path("*.diff.sites"), optional:true, emit: diff_sites
    tuple val(meta), path("*.diff.indv"), optional:true, emit: diff_indv
    tuple val(meta), path("*.diff.discordance.matrix"), optional:true, emit: diff_discd_matrix
    tuple val(meta), path("*.diff.switch"), optional:true, emit: diff_switch_error

    script:
    def software = getSoftwareName(task.process)
    def prefix   = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}"
    def args     = options.args.tokenize()

    // Flags whose value is the staged BED file. Tokens are compared for exact equality;
    // when several are present the first one in this list wins.
    def bed_flags = ['--bed', '--exclude-bed', '--hapcount']
    def bed_flag  = bed_flags.find { it in args }
    def bed_arg   = bed_flag ? "${bed_flag} ${bed}" : ''
    args.removeAll(bed_flags)

    // Flags whose value is the secondary variant file. Exact matching is essential here:
    // a substring test on '--diff' would also match '--diff-bcf' (selecting the wrong flag)
    // and would strip analysis flags such as '--diff-site' or '--diff-indv' from the command.
    def diff_flags       = ['--diff', '--gzdiff', '--diff-bcf']
    def diff_flag        = diff_flags.find { it in args }
    def diff_variant_arg = diff_flag ? "${diff_flag} ${diff_variant_file}" : ''
    args.removeAll(diff_flags)

    // Choose the vcftools input flag from the file extension of the primary variant file
    def input_file = ("$variant_file".endsWith(".vcf")) ? "--vcf ${variant_file}" :
        ("$variant_file".endsWith(".vcf.gz")) ? "--gzvcf ${variant_file}" :
        ("$variant_file".endsWith(".bcf")) ? "--bcf ${variant_file}" : ''

    // The last vcftools argument line must not end with a line continuation, otherwise the
    // version command below would be folded into the vcftools invocation.
    """
    vcftools \\
        $input_file \\
        --out $prefix \\
        ${args.join(' ')} \\
        $bed_arg \\
        $diff_variant_arg

    echo \$(vcftools --version 2>&1) | sed 's/^.*vcftools //; s/Using.*\$//' > ${software}.version.txt
    """
}

294
software/vcftools/meta.yml Normal file
View file

@ -0,0 +1,294 @@
name: vcftools
description: A set of tools written in Perl and C++ for working with VCF files
keywords:
- VCF
- sort
tools:
- vcftools:
description: A set of tools written in Perl and C++ for working with VCF files. This package only contains the C++ libraries whereas the package perl-vcftools-vcf contains the perl libraries
homepage: http://vcftools.sourceforge.net/
documentation: http://vcftools.sourceforge.net/man_latest.html
tool_dev_url: None
doi:
licence: ['LGPL']
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- variant_file:
type: file
description: variant input file which can be vcf, vcf.gz, or bcf format.
- bed:
type: file
description: bed file which can be used with different arguments in vcftools (optional)
- diff_variant_file:
type: file
description: secondary variant file which can be used with the 'diff' suite of tools (optional)
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- version:
type: file
description: File containing software version
pattern: "*.{version.txt}"
- vcf:
type: file
description: vcf file (optional)
pattern: "*.vcf"
- bcf:
type: file
description: bcf file (optional)
pattern: "*.bcf"
- frq:
type: file
description: Allele frequency for each site (optional)
pattern: "*.frq"
- frq_count:
type: file
description: Allele counts for each site (optional)
pattern: "*.frq.count"
- idepth:
type: file
description: mean depth per individual (optional)
pattern: "*.idepth"
- ldepth:
type: file
description: depth per site summed across individuals (optional)
pattern: "*.ldepth"
- ldepth_mean:
type: file
description: mean depth per site calculated across individuals (optional)
pattern: "*.ldepth.mean"
- gdepth:
type: file
description: depth for each genotype in vcf file (optional)
pattern: "*.gdepth"
- hap_ld:
type: file
description: r2, D, and D' statistics using phased haplotypes (optional)
pattern: "*.hap.ld"
- geno_ld:
type: file
description: squared correlation coefficient between genotypes encoded as 0, 1 and 2 to represent the number of non-reference alleles in each individual (optional)
pattern: "*.geno.ld"
- geno_chisq:
type: file
description: test for genotype independence via the chi-squared statistic (optional)
pattern: "*.geno.chisq"
- list_hap_ld:
type: file
description: r2 statistics of the sites contained in the provided input file versus all other sites (optional)
pattern: "*.list.hap.ld"
- list_geno_ld:
type: file
description: r2 statistics of the sites contained in the provided input file versus all other sites (optional)
pattern: "*.list.geno.ld"
- interchrom_hap_ld:
type: file
description: r2 statistics for sites (haplotypes) on different chromosomes (optional)
pattern: "*.interchrom.hap.ld"
- interchrom_geno_ld:
type: file
description: r2 statistics for sites (genotypes) on different chromosomes (optional)
pattern: "*.interchrom.geno.ld"
- tstv:
type: file
description: Transition / Transversion ratio in bins of size defined in options (optional)
pattern: "*.TsTv"
- tstv_summary:
type: file
description: Summary of all Transitions and Transversions (optional)
pattern: "*.TsTv.summary"
- tstv_count:
type: file
description: Transition / Transversion ratio as a function of alternative allele count (optional)
pattern: "*.TsTv.count"
- tstv_qual:
type: file
description: Transition / Transversion ratio as a function of SNP quality threshold (optional)
pattern: "*.TsTv.qual"
- filter_summary:
type: file
description: Summary of the number of SNPs and Ts/Tv ratio for each FILTER category (optional)
pattern: "*.FILTER.summary"
- sites_pi:
type: file
description: Nucleotide diversity on a per-site basis (optional)
pattern: "*.sites.pi"
- windowed_pi:
type: file
description: Nucleotide diversity in windows, with window size determined by options (optional)
pattern: "*.windowed.pi"
- weir_fst:
type: file
description: Fst estimate from Weir and Cockerham's 1984 paper (optional)
pattern: "*.weir.fst"
- heterozygosity:
type: file
description: Heterozygosity on a per-individual basis (optional)
pattern: "*.het"
- hwe:
type: file
description: Contains the Observed numbers of Homozygotes and Heterozygotes and the corresponding Expected numbers under HWE (optional)
pattern: "*.hwe"
- tajima_d:
type: file
description: Tajima's D statistic in bins with size of the specified number in options (optional)
pattern: "*.Tajima.D"
- freq_burden:
type: file
description: Number of variants within each individual of a specific frequency in options (optional)
pattern: "*.ifreqburden"
- lroh:
type: file
description: Long Runs of Homozygosity (optional)
pattern: "*.LROH"
- relatedness:
type: file
description: Relatedness statistic based on the method of Yang et al, Nature Genetics 2010 (doi:10.1038/ng.608) (optional)
pattern: "*.relatedness"
- relatedness2:
type: file
description: Relatedness statistic based on the method of Manichaikul et al., BIOINFORMATICS 2010 (doi:10.1093/bioinformatics/btq559) (optional)
pattern: "*.relatedness2"
- lqual:
type: file
description: per-site SNP quality (optional)
pattern: "*.lqual"
- missing_individual:
type: file
description: Missingness on a per-individual basis (optional)
pattern: "*.imiss"
- missing_site:
type: file
description: Missingness on a per-site basis (optional)
pattern: "*.lmiss"
- snp_density:
type: file
description: Number and density of SNPs in bins of size defined by option (optional)
pattern: "*.snpden"
- kept_sites:
type: file
description: All sites that have been kept after filtering (optional)
pattern: "*.kept.sites"
- removed_sites:
type: file
description: All sites that have been removed after filtering (optional)
pattern: "*.removed.sites"
- singeltons:
type: file
description: Location of singletons, and the individual they occur in (optional)
pattern: "*.singletons"
- indel_hist:
type: file
description: Histogram file of the length of all indels (including SNPs) (optional)
pattern: "*.indel.hist"
- hapcount:
type: file
description: Unique haplotypes within user specified bins (optional)
pattern: "*.hapcount"
- mendel:
type: file
description: Mendel errors identified in trios (optional)
pattern: "*.mendel"
- format:
type: file
description: Extracted information from the genotype fields in the VCF file relating to a specified FORMAT identifier (optional)
pattern: "*.FORMAT"
- info:
type: file
description: Extracted information from the INFO field in the VCF file (optional)
pattern: "*.INFO"
- genotypes_matrix:
type: file
description: |
Genotypes output as large matrix.
Genotypes of each individual on a separate line.
Genotypes are represented as 0, 1 and 2, where the number represents the number of non-reference alleles.
Missing genotypes are represented by -1 (optional)
pattern: "*.012"
- genotypes_matrix_individual:
type: file
description: Details the individuals included in the main genotypes_matrix file (optional)
pattern: "*.012.indv"
- genotypes_matrix_position:
type: file
description: Details the site locations included in the main genotypes_matrix file (optional)
pattern: "*.012.pos"
- impute_hap:
type: file
description: Phased haplotypes in IMPUTE reference-panel format (optional)
pattern: "*.impute.hap"
- impute_hap_legend:
type: file
description: Impute haplotype legend file (optional)
pattern: "*.impute.hap.legend"
- impute_hap_indv:
type: file
description: Impute haplotype individuals file (optional)
pattern: "*.impute.hap.indv"
- ldhat_sites:
type: file
description: Output data in LDhat format, sites (optional)
pattern: "*.ldhat.sites"
- ldhat_locs:
type: file
description: output data in LDhat format, locations (optional)
pattern: "*.ldhat.locs"
- beagle_gl:
type: file
description: Genotype likelihoods for biallelic sites (optional)
pattern: "*.BEAGLE.GL"
- beagle_pl:
type: file
description: Genotype likelihoods for biallelic sites (optional)
pattern: "*.BEAGLE.PL"
- ped:
type: file
description: output the genotype data in PLINK PED format (optional)
pattern: "*.ped"
- map_:
type: file
description: variant map file accompanying the PLINK PED output (optional)
pattern: "*.map"
- tped:
type: file
description: output the genotype data in PLINK transposed (TPED) format (optional)
pattern: "*.tped"
- tfam:
type: file
description: family/sample information file accompanying the PLINK TPED output (optional)
pattern: "*.tfam"
- diff_sites_in_files:
type: file
description: Sites that are common / unique to each file specified in optional inputs (optional)
pattern: "*.diff.sites_in_files"
- diff_indv_in_files:
type: file
description: Individuals that are common / unique to each file specified in optional inputs (optional)
pattern: "*.diff.indv_in_files"
- diff_sites:
type: file
description: Discordance on a site by site basis, specified in optional inputs (optional)
pattern: "*.diff.sites"
- diff_indv:
type: file
description: Discordance on an individual by individual basis, specified in optional inputs (optional)
pattern: "*.diff.indv"
- diff_discd_matrix:
type: file
description: Discordance matrix between files specified in optional inputs (optional)
pattern: "*.diff.discordance.matrix"
- diff_switch_error:
type: file
description: Switch errors found between sites (optional)
pattern: "*.diff.switch"
authors:
- "@Mark-S-Hill"

View file

@ -426,3 +426,7 @@ unicycler:
untar:
- software/untar/**
- tests/software/untar/**
vcftools:
- software/vcftools/**
- tests/software/vcftools/**

View file

@ -0,0 +1,46 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { VCFTOOLS as VCFTOOLS_BASE} from '../../../software/vcftools/main.nf' addParams( options: ['args': '--freq'] )
include { VCFTOOLS as VCFTOOLS_OPTIONAL} from '../../../software/vcftools/main.nf' addParams( options: ['args': '--freq --exclude-bed'] )
// Base test: '--freq' on an uncompressed VCF, with no BED file and no secondary variant file
workflow test_vcftools_vcf_base {
    def meta = [ id:'test' ] // meta map
    def vcf  = file("${launchDir}/tests/data/genomics/sarscov2/vcf/test.vcf", checkIfExists: true)

    VCFTOOLS_BASE ( [ meta, vcf ], [], [] )
}
// Base test: '--freq' on a gzip-compressed VCF, with no BED file and no secondary variant file
workflow test_vcftools_vcfgz_base {
    def meta  = [ id:'test' ] // meta map
    def vcfgz = file("${launchDir}/tests/data/genomics/sarscov2/vcf/test.vcf.gz", checkIfExists: true)

    VCFTOOLS_BASE ( [ meta, vcfgz ], [], [] )
}
// Optional-input test: '--freq --exclude-bed' on an uncompressed VCF, supplying a BED file
workflow test_vcftools_vcf_optional {
    def bed  = file("${launchDir}/tests/data/genomics/sarscov2/bed/test.bed", checkIfExists: true)
    def meta = [ id:'test' ] // meta map
    def vcf  = file("${launchDir}/tests/data/genomics/sarscov2/vcf/test.vcf", checkIfExists: true)

    VCFTOOLS_OPTIONAL ( [ meta, vcf ], bed, [] )
}
// Optional-input test: '--freq --exclude-bed' on a gzip-compressed VCF, supplying a BED file
workflow test_vcftools_vcfgz_optional {
    def bed   = file("${launchDir}/tests/data/genomics/sarscov2/bed/test.bed", checkIfExists: true)
    def meta  = [ id:'test' ] // meta map
    def vcfgz = file("${launchDir}/tests/data/genomics/sarscov2/vcf/test.vcf.gz", checkIfExists: true)

    VCFTOOLS_OPTIONAL ( [ meta, vcfgz ], bed, [] )
}

View file

@ -0,0 +1,35 @@
- name: vcftools test_vcftools_vcf_base
command: nextflow run tests/software/vcftools -entry test_vcftools_vcf_base -c tests/config/nextflow.config
tags:
- vcftools
- vcftools_vcf_base
files:
- path: output/vcftools/test.frq
md5sum: 7f126655f17268fd1a338734f62868e9
- name: vcftools test_vcftools_vcfgz_base
command: nextflow run tests/software/vcftools -entry test_vcftools_vcfgz_base -c tests/config/nextflow.config
tags:
- vcftools_vcfgz_base
- vcftools
files:
- path: output/vcftools/test.frq
md5sum: 7f126655f17268fd1a338734f62868e9
- name: vcftools test_vcftools_vcf_optional
command: nextflow run tests/software/vcftools -entry test_vcftools_vcf_optional -c tests/config/nextflow.config
tags:
- vcftools
- vcftools_vcf_optional
files:
- path: output/vcftools/test.frq
md5sum: 7f126655f17268fd1a338734f62868e9
- name: vcftools test_vcftools_vcfgz_optional
command: nextflow run tests/software/vcftools -entry test_vcftools_vcfgz_optional -c tests/config/nextflow.config
tags:
- vcftools
- vcftools_vcfgz_optional
files:
- path: output/vcftools/test.frq
md5sum: 7f126655f17268fd1a338734f62868e9