New module: plink2/extract (#1228)

* add plink2_extract

* fix test yml path

* Update modules/plink2/extract/main.nf

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>

* Update modules/plink2/extract/main.nf

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>

* compress output

* add DOI

* make outputs less ambiguous

* update test for compressed output

* brain is dumb

* Update modules/plink2/extract/main.nf

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>
This commit is contained in:
Benjamin Wingfield 2022-02-03 10:42:56 +00:00 committed by GitHub
parent f112e4d701
commit e687c7025a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 167 additions and 0 deletions

View file

@ -0,0 +1,37 @@
// Subset a PLINK 2 fileset (pgen/psam/pvar) to the variants listed in a text file.
//
// Input : tuple of meta map, .pgen, .psam, .pvar (optionally zstd-compressed) and the
//         file handed to `--extract`.
// Output: the extracted .pgen / .psam / .pvar.zst, each paired with the meta map, plus
//         versions.yml.
// Fails early when the output prefix would produce a .pgen with the same name as the input.
process PLINK2_EXTRACT {
    tag "$meta.id"
    label 'process_low'

    conda (params.enable_conda ? "bioconda::plink2=2.00a2.3" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/plink2:2.00a2.3--h712d239_1' :
        'quay.io/biocontainers/plink2:2.00a2.3--h712d239_1' }"

    input:
    tuple val(meta), path(pgen), path(psam), path(pvar), path(variants)

    output:
    tuple val(meta), path("*.pgen")    , emit: extract_pgen
    tuple val(meta), path("*.psam")    , emit: extract_psam
    tuple val(meta), path("*.pvar.zst"), emit: extract_pvar
    path "versions.yml"                , emit: versions

    script:
    def args   = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // This module writes a zstd-compressed .pvar.zst; --pfile only looks for that
    // extension when given the 'vzs' modifier, so add it for compressed inputs.
    // Uncompressed .pvar inputs get an empty modifier and behave as before.
    def pfile_mode = pvar.toString().endsWith('.zst') ? 'vzs' : ''
    // plink2 derives every output name from --out, so an identical prefix would clash with the staged input
    if( "$pgen" == "${prefix}.pgen" ) error "Input and output names are the same, use \"task.ext.prefix\" in modules.config to disambiguate!"
    """
    plink2 \\
        --pfile ${pgen.baseName} ${pfile_mode} \\
        $args \\
        --threads $task.cpus \\
        --extract $variants \\
        --make-pgen vzs \\
        --out ${prefix}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        plink2: \$(plink2 --version 2>&1 | sed 's/^PLINK v//; s/ 64.*\$//' )
    END_VERSIONS
    """
}

View file

@ -0,0 +1,64 @@
# Module metadata for plink2_extract: describes the wrapped tool, the input tuple
# and the files the module emits.
name: plink2_extract
description: Subset plink pfiles with a text file of variant identifiers
keywords:
- plink2
- extract
# Upstream tool information (homepage, documentation, DOI, licence).
tools:
- plink2:
description: |
Whole genome association analysis toolset, designed to perform a range
of basic, large-scale analyses in a computationally efficient manner
homepage: http://www.cog-genomics.org/plink/2.0/
documentation: http://www.cog-genomics.org/plink/2.0/general_usage
tool_dev_url: None
doi: "10.1186/s13742-015-0047-8"
licence: ['GPL v3']
# Inputs, listed in the same order as the process input tuple:
# meta, pgen, psam, pvar, variants.
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- pgen:
type: file
description: PLINK 2 binary genotype table
pattern: "*.{pgen}"
- psam:
type: file
description: PLINK 2 sample information file
pattern: "*.{psam}"
- pvar:
type: file
description: PLINK 2 variant information file
pattern: "*.{pvar}"
- variants:
type: file
description: A text file containing variant identifiers to keep (one per line)
pattern: "*.{keep}"
# Outputs; the extract_* entry names match the `emit:` labels of the process.
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- extract_pgen:
type: file
description: PLINK 2 binary genotype table, containing extracted variants
pattern: "*.{pgen}"
- extract_psam:
type: file
description: PLINK 2 sample information file associated with the extracted data
pattern: "*.{psam}"
# The variant file is written with a .zst suffix, hence the different pattern from the input pvar.
- extract_pvar:
type: file
description: PLINK 2 variant information file, containing extracted variants
pattern: "*.{pvar.zst}"
authors:
- "@nebfield"

View file

@ -1149,6 +1149,10 @@ plink/vcf:
- modules/plink/vcf/**
- tests/modules/plink/vcf/**
plink2/extract:
- modules/plink2/extract/**
- tests/modules/plink2/extract/**
plink2/vcf:
- modules/plink2/vcf/**
- tests/modules/plink2/vcf/**

View file

@ -0,0 +1,30 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { PLINK2_VCF } from '../../../../modules/plink2/vcf/main.nf'
include { PLINK2_EXTRACT } from '../../../../modules/plink2/extract/main.nf'
// Test workflow: convert a VCF to a PLINK 2 fileset with PLINK2_VCF, derive a
// variant list from the resulting pvar, then run PLINK2_EXTRACT on the fileset.
workflow test_plink2_extract {

    input = [
        [ id:'test', single_end:false ], // meta map
        file(params.test_data['homo_sapiens']['genome']['syntheticvcf_short_vcf_gz'], checkIfExists: true)
    ]

    PLINK2_VCF ( input )

    // Split the pvar into 10-line chunk files and keep only the final chunk;
    // that chunk is what gets passed to the module as `variants`.
    ch_variants = PLINK2_VCF.out.pvar
        .splitText(file: 'variants.keep', keepHeader: false, by: 10)
        .last()

    ch_variants.view()

    // Emit pgen, psam, pvar and the variant list in that fixed order, collect them
    // under their shared meta key, and unpack into the tuple shape the module expects.
    ch_extract = PLINK2_VCF.out.pgen
        .concat(PLINK2_VCF.out.psam, PLINK2_VCF.out.pvar, ch_variants)
        .groupTuple()
        .map { meta, paths ->
            def (pgen, psam, pvar, variants) = paths
            [meta, pgen, psam, pvar, variants]
        }

    PLINK2_EXTRACT ( ch_extract )
}

View file

@ -0,0 +1,12 @@
// Process-level settings for the plink2/extract module test.
process {

    // Publish under <outdir>/<tool>, where <tool> is the lower-cased text before the
    // first underscore of the last ':'-separated part of the process name.
    publishDir = {
        "${params.outdir}/${task.process.tokenize(':').last().tokenize('_').first().toLowerCase()}"
    }

    // Extra plink2 arguments for the VCF conversion step; the second flag supplies
    // an ID template for variants without one (see the plink2 docs for the template syntax).
    withName: 'PLINK2_VCF' {
        ext.args = '--make-pgen --set-missing-var-ids @:#:\\$1:\\$2'
    }

    // Give the extracted fileset its own prefix so it does not clash with the input names.
    withName: 'PLINK2_EXTRACT' {
        ext.prefix = { "${meta.id}.extract" }
    }

}

View file

@ -0,0 +1,20 @@
# Test specification for the plink2/extract module: runs the test_plink2_extract
# entry workflow and lists the expected output files with their md5 checksums.
- name: plink2 extract test_plink2_extract
command: nextflow run tests/modules/plink2/extract -entry test_plink2_extract -c tests/config/nextflow.config
tags:
- plink2/extract
- plink2
files:
# Files carrying the "test.extract" prefix configured for PLINK2_EXTRACT.
- path: output/plink2/test.extract.pgen
md5sum: 785e729a293ecabb0d39394865316bda
# Same checksum as test.psam below.
- path: output/plink2/test.extract.psam
md5sum: e6c714488754cb8448c3dfda08c4c0ea
- path: output/plink2/test.extract.pvar.zst
md5sum: 076767e6695e681115eabb924a447ee9
# Un-prefixed fileset; presumably the PLINK2_VCF outputs that feed the extract step.
- path: output/plink2/test.pgen
md5sum: fac12ca9041d6950f6b7d60ac2120721
- path: output/plink2/test.psam
md5sum: e6c714488754cb8448c3dfda08c4c0ea
- path: output/plink2/test.pvar
md5sum: ff9e44f8e5f4035d8cf2bfe7be6755b3
- path: output/plink2/versions.yml
md5sum: c477b7c9f6e39b89710fe1a0bceee50d