implement plink2/score module (#1259)

* implement plink2/score module * fix test yml * fix typo :( * set cpu * set mem * fix input process input block * fix tests Co-authored-by: Sateesh <33637490+sateeshperi@users.noreply.github.com>
2024-12-21 18:58:16 +00:00 · 2022-03-07 18:02:40 +00:00 · 2022-03-07 18:02:40 +00:00 · de0d57a562
commit de0d57a562
parent 251015c8ba
7 changed files with 156 additions and 1 deletions
--- a/modules/plink2/score/main.nf
+++ b/modules/plink2/score/main.nf
@ -0,0 +1,39 @@
+// Applies a scoring file to every sample of a PLINK 2 fileset via `plink2 --score`.
+//
+// Inputs:
+//   meta, pgen, psam, pvar - sample meta map plus the three fileset members; the fileset
+//                            prefix given to --pfile is the base name of the .pgen file,
+//                            and the `vzs` modifier makes plink2 look for a
+//                            zstd-compressed variant file (<prefix>.pvar.zst)
+//   scorefile              - file handed to --score
+// Outputs:
+//   score    - [ meta, *.sscore ]
+//   versions - versions.yml recording the plink2 version
+process PLINK2_SCORE {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda (params.enable_conda ? "bioconda::plink2=2.00a2.3" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/plink2:2.00a2.3--h712d239_1' :
+        'quay.io/biocontainers/plink2:2.00a2.3--h712d239_1' }"
+
+    input:
+    tuple val(meta), path(pgen), path(psam), path(pvar)
+    path(scorefile)
+
+    output:
+    tuple val(meta), path("*.sscore"), emit: score
+    path("versions.yml")             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // ext.args is inserted directly after `--score <file>`, so it may start with
+    // --score modifiers before any additional flags.
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // plink is greedy, so hand it the task's memory allocation (in MB). task.memory is
+    // null when no memory directive is configured; in that case omit --memory instead
+    // of failing on a null object.
+    def mem_arg = task.memory ? "--memory ${task.memory.toMega()}" : ''
+    """
+    plink2 \\
+        --threads $task.cpus \\
+        $mem_arg \\
+        --pfile ${pgen.baseName} vzs \\
+        --score ${scorefile} \\
+        $args \\
+        --out ${prefix}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        plink2: \$(plink2 --version 2>&1 | sed 's/^PLINK v//; s/ 64.*\$//' )
+    END_VERSIONS
+    """
+}
--- a/modules/plink2/score/meta.yml
+++ b/modules/plink2/score/meta.yml
@ -0,0 +1,56 @@
+name: plink2_score
+description: Apply a scoring system to each sample in a plink 2 fileset
+keywords:
+  - plink2
+  - score
+tools:
+  - plink2:
+      description: |
+          Whole genome association analysis toolset, designed to perform a range
+          of basic, large-scale analyses in a computationally efficient manner
+      homepage: http://www.cog-genomics.org/plink/2.0/
+      documentation: http://www.cog-genomics.org/plink/2.0/general_usage
+      tool_dev_url: None
+      doi: "10.1186/s13742-015-0047-8"
+      licence: ['GPL v3']
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - pgen:
+      type: file
+      description: PLINK 2 binary genotype table
+      pattern: "*.{pgen}"
+  - psam:
+      type: file
+      description: PLINK 2 sample information file
+      pattern: "*.{psam}"
+  - pvar:
+      type: file
+      description: PLINK 2 variant information file, zstd-compressed (the module passes `vzs` to --pfile)
+      pattern: "*.{pvar.zst}"
+  - scorefile:
+      type: file
+      description: A text file containing variant identifiers and weights
+      pattern: "*.{scores,txt,scorefile}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - score:
+      type: file
+      description: A text file containing sample scores, in plink 2 .sscore format
+      pattern: "*.{sscore}"
+
+authors:
+  - "@nebfield"
--- a/tests/config/pytest_modules.yml
+++ b/tests/config/pytest_modules.yml
@ -1273,6 +1273,10 @@ plink2/extract:
  - modules/plink2/extract/**
  - tests/modules/plink2/extract/**

+plink2/score:
+  - modules/plink2/score/**
+  - tests/modules/plink2/score/**
+
 plink2/vcf:
  - modules/plink2/vcf/**
  - tests/modules/plink2/vcf/**
--- a/tests/config/test_data.config
+++ b/tests/config/test_data.config
@ -119,7 +119,7 @@ params {
                genome_bed_gz_tbi                              = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed.gz.tbi"
                transcriptome_fasta                            = "${test_data_dir}/genomics/homo_sapiens/genome/transcriptome.fasta"
                genome2_fasta                                  = "${test_data_dir}/genomics/homo_sapiens/genome/genome2.fasta"
-		        genome_chain_gz                                = "${test_data_dir}/genomics/homo_sapiens/genome/genome.chain.gz"
+                genome_chain_gz                                = "${test_data_dir}/genomics/homo_sapiens/genome/genome.chain.gz"
                genome_21_fasta                                = "${test_data_dir}/genomics/homo_sapiens/genome/chr21/sequence/genome.fasta"
                genome_21_fasta_fai                            = "${test_data_dir}/genomics/homo_sapiens/genome/chr21/sequence/genome.fasta.fai"
                genome_21_dict                                 = "${test_data_dir}/genomics/homo_sapiens/genome/chr21/sequence/genome.dict"
@ -138,6 +138,7 @@ params {
                mills_and_1000g_indels_vcf_gz_tbi              = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz.tbi"
                syntheticvcf_short_vcf_gz                      = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/syntheticvcf_short.vcf.gz"
                syntheticvcf_short_vcf_gz_tbi                  = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/syntheticvcf_short.vcf.gz.tbi"
+                syntheticvcf_short_score                       = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/syntheticvcf_short.score"
                gnomad_r2_1_1_sv_vcf_gz                        = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1-sv.vcf.gz"

                hapmap_3_3_hg38_21_vcf_gz                      = "${test_data_dir}/genomics/homo_sapiens/genome/chr21/germlineresources/hapmap_3.3.hg38.vcf.gz"
--- a/tests/modules/plink2/score/main.nf
+++ b/tests/modules/plink2/score/main.nf
@ -0,0 +1,24 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { PLINK2_VCF } from '../../../../modules/plink2/vcf/main.nf'
+include { PLINK2_SCORE } from '../../../../modules/plink2/score/main.nf'
+
+// Test entry point: converts the synthetic VCF into a PLINK 2 fileset with PLINK2_VCF,
+// then scores that fileset with PLINK2_SCORE.
+workflow test_plink2_score {
+    input = [
+        [ id:'test', single_end:false ], // meta map
+        file(params.test_data['homo_sapiens']['genome']['syntheticvcf_short_vcf_gz'], checkIfExists: true)
+    ]
+    PLINK2_VCF ( input )
+
+    scorefile = file(params.test_data['homo_sapiens']['genome']['syntheticvcf_short_score'], checkIfExists: true)
+
+    // Assumes each PLINK2_VCF output is a [ meta, file ] tuple. Joining on the shared
+    // meta key builds [ meta, pgen, psam, pvar ] in the order PLINK2_SCORE expects,
+    // without relying on concatenation order or waiting for the channels to complete.
+    PLINK2_VCF.out.pgen
+        .join(PLINK2_VCF.out.psam)
+        .join(PLINK2_VCF.out.pvar)
+        .set { ch_target_genome }
+
+    PLINK2_SCORE ( ch_target_genome, scorefile )
+}
--- a/tests/modules/plink2/score/nextflow.config
+++ b/tests/modules/plink2/score/nextflow.config
@ -0,0 +1,15 @@
+process {
+
+    // Publish into <outdir>/<tool>, where <tool> is the last ':'-separated segment of
+    // the process name, cut at its first '_' and lowercased (PLINK2_SCORE -> plink2).
+    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
+
+    // Give variants whose ID is missing an ID built from the template @:#:$1:$2, so
+    // they follow a common scheme (chr:pos:alt:ref per the original note).
+    // NOTE(review): confirm against the PLINK 2 docs which alleles $1/$2 denote.
+    // The backslashes are kept in the option string, presumably so the shell does not
+    // expand $1/$2 when the command runs.
+    withName: PLINK2_VCF {
+        ext.args = '--set-missing-var-ids @:#:\\$1:\\$2'
+    }
+
+    // `no-mean-imputation` is a --score modifier rather than a standalone flag; it works
+    // because PLINK2_SCORE places ext.args directly after `--score <file>`. It is needed
+    // here because the test dataset is small (scoring normally expects n > 50 samples).
+    withName: PLINK2_SCORE {
+        ext.args = 'no-mean-imputation'
+    }
+}
--- a/tests/modules/plink2/score/test.yml
+++ b/tests/modules/plink2/score/test.yml
@ -0,0 +1,16 @@
+- name: plink2 score test_plink2_score
+  command: nextflow run tests/modules/plink2/score -entry test_plink2_score -c tests/config/nextflow.config
+  tags:
+    - plink2
+    - plink2/score
+  files:
+    - path: output/plink2/test.pgen
+      md5sum: fac12ca9041d6950f6b7d60ac2120721
+    - path: output/plink2/test.psam
+      md5sum: e6c714488754cb8448c3dfda08c4c0ea
+    - path: output/plink2/test.pvar.zst
+      md5sum: 98d59e9779a8b62d5032cd98b642a63b
+    - path: output/plink2/test.sscore
+      md5sum: 97bde840f69febd65f2c00e9243126e9
+    - path: output/plink2/versions.yml
+      md5sum: 71499ab14e1583c88ced3a7a4f05bfa7