From e687c7025a4b164e3109b392634c46688baa68a2 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Thu, 3 Feb 2022 10:42:56 +0000
Subject: [PATCH] New module: `plink2/extract` (#1228)

* add plink2_extract

* fix test yml path

* Update modules/plink2/extract/main.nf

Co-authored-by: James A. Fellows Yates

* Update modules/plink2/extract/main.nf

Co-authored-by: James A. Fellows Yates

* compress output

* add DOI

* make outputs less ambiguous

* update test for compressed output

* brain is dumb

* Update modules/plink2/extract/main.nf

Co-authored-by: James A. Fellows Yates

Co-authored-by: James A. Fellows Yates
---
 modules/plink2/extract/main.nf               | 37 +++++++++++
 modules/plink2/extract/meta.yml              | 64 ++++++++++++++++++++
 tests/config/pytest_modules.yml              |  4 ++
 tests/modules/plink2/extract/main.nf         | 30 +++++++++
 tests/modules/plink2/extract/nextflow.config | 12 ++++
 tests/modules/plink2/extract/test.yml        | 20 ++++++
 6 files changed, 167 insertions(+)
 create mode 100644 modules/plink2/extract/main.nf
 create mode 100644 modules/plink2/extract/meta.yml
 create mode 100644 tests/modules/plink2/extract/main.nf
 create mode 100644 tests/modules/plink2/extract/nextflow.config
 create mode 100644 tests/modules/plink2/extract/test.yml

diff --git a/modules/plink2/extract/main.nf b/modules/plink2/extract/main.nf
new file mode 100644
index 00000000..7eb24abc
--- /dev/null
+++ b/modules/plink2/extract/main.nf
@@ -0,0 +1,37 @@
+process PLINK2_EXTRACT {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda (params.enable_conda ? "bioconda::plink2=2.00a2.3" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/plink2:2.00a2.3--h712d239_1' :
+        'quay.io/biocontainers/plink2:2.00a2.3--h712d239_1' }"
+
+    input:
+    tuple val(meta), path(pgen), path(psam), path(pvar), path(variants)
+
+    output:
+    tuple val(meta), path("*.pgen")    , emit: extract_pgen
+    tuple val(meta), path("*.psam")    , emit: extract_psam
+    tuple val(meta), path("*.pvar.zst"), emit: extract_pvar
+    path "versions.yml"                , emit: versions
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    if( "$pgen" == "${prefix}.pgen" ) error "Input and output names are the same, use \"task.ext.prefix\" in modules.config to disambiguate!"
+    """
+    plink2 \\
+        --pfile ${pgen.baseName} \\
+        $args \\
+        --threads $task.cpus \\
+        --extract $variants \\
+        --make-pgen vzs \\
+        --out ${prefix}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        plink2: \$(plink2 --version 2>&1 | sed 's/^PLINK v//; s/ 64.*\$//' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/plink2/extract/meta.yml b/modules/plink2/extract/meta.yml
new file mode 100644
index 00000000..2323dbc7
--- /dev/null
+++ b/modules/plink2/extract/meta.yml
@@ -0,0 +1,64 @@
+name: plink2_extract
+description: Subset plink pfiles with a text file of variant identifiers
+keywords:
+  - plink2
+  - extract
+tools:
+  - plink2:
+      description: |
+        Whole genome association analysis toolset, designed to perform a range
+        of basic, large-scale analyses in a computationally efficient manner
+      homepage: http://www.cog-genomics.org/plink/2.0/
+      documentation: http://www.cog-genomics.org/plink/2.0/general_usage
+      tool_dev_url: None
+      doi: "10.1186/s13742-015-0047-8"
+      licence: ['GPL v3']
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - pgen:
+      type: file
+      description: PLINK 2 binary genotype table
+      pattern: "*.{pgen}"
+  - psam:
+      type: file
+      description: PLINK 2 sample information file
+      pattern: "*.{psam}"
+  - pvar:
+      type: file
+      description: PLINK 2 variant information file
+      pattern: "*.{pvar}"
+  - variants:
+      type: file
+      description: A text file containing variant identifiers to keep (one per line)
+      pattern: "*.{keep}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - extract_pgen:
+      type: file
+      description: PLINK 2 binary genotype table, containing extracted variants
+      pattern: "*.{pgen}"
+  - extract_psam:
+      type: file
+      description: PLINK 2 sample information file associated with the extracted data
+      pattern: "*.{psam}"
+  - extract_pvar:
+      type: file
+      description: PLINK 2 variant information file, containing extracted variants
+      pattern: "*.{pvar.zst}"
+
+authors:
+  - "@nebfield"
diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml
index 8b3ff3e0..b6b906f4 100644
--- a/tests/config/pytest_modules.yml
+++ b/tests/config/pytest_modules.yml
@@ -1149,6 +1149,10 @@ plink/vcf:
   - modules/plink/vcf/**
   - tests/modules/plink/vcf/**
 
+plink2/extract:
+  - modules/plink2/extract/**
+  - tests/modules/plink2/extract/**
+
 plink2/vcf:
   - modules/plink2/vcf/**
   - tests/modules/plink2/vcf/**
diff --git a/tests/modules/plink2/extract/main.nf b/tests/modules/plink2/extract/main.nf
new file mode 100644
index 00000000..a72e153c
--- /dev/null
+++ b/tests/modules/plink2/extract/main.nf
@@ -0,0 +1,30 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { PLINK2_VCF } from '../../../../modules/plink2/vcf/main.nf'
+include { PLINK2_EXTRACT } from '../../../../modules/plink2/extract/main.nf'
+
+workflow test_plink2_extract {
+
+    input = [
+        [ id:'test', single_end:false ], // meta map
+        file(params.test_data['homo_sapiens']['genome']['syntheticvcf_short_vcf_gz'], checkIfExists: true)
+    ]
+    PLINK2_VCF ( input )
+
+    PLINK2_VCF.out.pvar
+        .splitText(file: 'variants.keep', keepHeader: false, by: 10)
+        .last()
+        .set { ch_variants }
+
+    ch_variants.view()
+
+    PLINK2_VCF.out.pgen
+        .concat(PLINK2_VCF.out.psam, PLINK2_VCF.out.pvar.concat(ch_variants))
+        .groupTuple()
+        .map{ meta, paths -> [meta, paths[0], paths[1], paths[2], paths[3]] }
+        .set { ch_extract }
+
+    PLINK2_EXTRACT ( ch_extract )
+}
diff --git a/tests/modules/plink2/extract/nextflow.config b/tests/modules/plink2/extract/nextflow.config
new file mode 100644
index 00000000..2a541c99
--- /dev/null
+++ b/tests/modules/plink2/extract/nextflow.config
@@ -0,0 +1,12 @@
+process {
+
+    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
+
+    withName: PLINK2_VCF {
+        ext.args = '--make-pgen --set-missing-var-ids @:#:\\$1:\\$2'
+    }
+
+    withName: PLINK2_EXTRACT {
+        ext.prefix = { "${meta.id}.extract" }
+    }
+}
diff --git a/tests/modules/plink2/extract/test.yml b/tests/modules/plink2/extract/test.yml
new file mode 100644
index 00000000..737ca215
--- /dev/null
+++ b/tests/modules/plink2/extract/test.yml
@@ -0,0 +1,20 @@
+- name: plink2 extract test_plink2_extract
+  command: nextflow run tests/modules/plink2/extract -entry test_plink2_extract -c tests/config/nextflow.config
+  tags:
+    - plink2/extract
+    - plink2
+  files:
+    - path: output/plink2/test.extract.pgen
+      md5sum: 785e729a293ecabb0d39394865316bda
+    - path: output/plink2/test.extract.psam
+      md5sum: e6c714488754cb8448c3dfda08c4c0ea
+    - path: output/plink2/test.extract.pvar.zst
+      md5sum: 076767e6695e681115eabb924a447ee9
+    - path: output/plink2/test.pgen
+      md5sum: fac12ca9041d6950f6b7d60ac2120721
+    - path: output/plink2/test.psam
+      md5sum: e6c714488754cb8448c3dfda08c4c0ea
+    - path: output/plink2/test.pvar
+      md5sum: ff9e44f8e5f4035d8cf2bfe7be6755b3
+    - path: output/plink2/versions.yml
+      md5sum: c477b7c9f6e39b89710fe1a0bceee50d