feat: add VEP module (#547)

* feat: add VEP module * fix: name * fix: EC lint] * feat: add info about params * fix: params as params, not input * fix: improve script * Update software/ensemblvep/environment.yml Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com> * Apply suggestions from code review * Apply suggestions from code review Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com> Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>
2024-12-22 11:08:17 +00:00 · 2021-07-06 14:11:11 +02:00 · 2021-07-06 14:11:11 +02:00 · 7b4a28b6e8
commit 7b4a28b6e8
parent 3f4eccbf0f
9 changed files with 284 additions and 0 deletions
--- a/software/ensemblvep/Dockerfile
+++ b/software/ensemblvep/Dockerfile
@ -0,0 +1,30 @@
+FROM nfcore/base:1.14
+LABEL \
+    author="Maxime Garcia" \
+    description="VEP image for nf-core pipelines" \
+    maintainer="maxime.garcia@scilifelab.se"
+
+# This image bundles ensembl-vep together with one pre-downloaded annotation cache.
+# The cache is selected at build time through the GENOME, SPECIES and VEP_VERSION build args.
+
+# Install the conda environment
+COPY environment.yml /
+RUN conda env create -f /environment.yml && conda clean -a
+
+# Add conda installation dir to PATH (instead of doing 'conda activate')
+# The environment name must match the 'name:' field of environment.yml
+ENV PATH=/opt/conda/envs/nf-core-vep-104.3/bin:$PATH
+
+# Setup default ARG variables
+# VEP_VERSION is the cache release handed to --CACHE_VERSION below; its default
+# follows the major release of the ensembl-vep package pinned in environment.yml
+ARG GENOME=GRCh38
+ARG SPECIES=homo_sapiens
+ARG VEP_VERSION=104
+
+# Download Genome
+#   -a c      : automated install of the cache only
+#   -c .vep   : cache directory, relative to the current working directory of the image
+#   --CONVERT : convert the downloaded cache for tabix-based lookups
+RUN vep_install \
+    -a c \
+    -c .vep \
+    -s ${SPECIES} \
+    -y ${GENOME} \
+    --CACHE_VERSION ${VEP_VERSION} \
+    --CONVERT \
+    --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE
+
+# Dump the details of the installed packages to a file for posterity
+RUN conda env export --name nf-core-vep-104.3 > nf-core-vep-104.3.yml
--- a/software/ensemblvep/build.sh
+++ b/software/ensemblvep/build.sh
@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Build and push all containers
+
+build_push() {
+    GENOME=$1
+    SPECIES=$2
+    VEP_VERSION=$3
+    VEP_TAG=$4
+
+    docker build \
+        -t nfcore/vep:${VEP_TAG}.${GENOME} \
+        software/vep/. \
+        --build-arg GENOME=${GENOME} \
+        --build-arg SPECIES=${SPECIES} \
+        --build-arg VEP_VERSION=${VEP_VERSION}
+
+    docker push nfcore/vep:${VEP_TAG}.${GENOME}
+}
+
+build_push "GRCh37"    "homo_sapiens"           "104" "104.3"
+build_push "GRCh38"    "homo_sapiens"           "104" "104.3"
+build_push "GRCm38"    "mus_musculus"           "102" "104.3"
+build_push "GRCm39"    "mus_musculus"           "104" "104.3"
+build_push "CanFam3.1" "canis_lupus_familiaris" "104" "104.3"
+build_push "WBcel235"  "caenorhabditis_elegans" "104" "104.3"
--- a/software/ensemblvep/environment.yml
+++ b/software/ensemblvep/environment.yml
@ -0,0 +1,10 @@
+# You can use this file to create a conda environment for this module:
+#   conda env create -f environment.yml
+# The environment name below is hard-coded in the module's Dockerfile (PATH entry
+# and 'conda env export'), so rename it there too when bumping the version.
+name: nf-core-vep-104.3
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+
+dependencies:
+  # Same package pin as the 'conda' directive of the module's main.nf
+  - bioconda::ensembl-vep=104.3
--- a/software/ensemblvep/functions.nf
+++ b/software/ensemblvep/functions.nf
@ -0,0 +1,68 @@
+//
+//  Utility functions used in nf-core DSL2 module files
+//
+
+//
+// Derive the lower-case tool name from a process name ($task.process):
+// take the part after the last ':' and keep what precedes its first '_'
+//
+def getSoftwareName(task_process) {
+    def scope_parts  = task_process.tokenize(':')
+    def process_name = scope_parts[-1]
+    def name_parts   = process_name.tokenize('_')
+    return name_parts[0].toLowerCase()
+}
+
+//
+// Build the Groovy Map of module options, filling in a default for every key the
+// caller left unset. 'publish_files' has no default and is passed through as given.
+//
+def initOptions(Map args) {
+    return [
+        args            : args.args ?: '',
+        args2           : args.args2 ?: '',
+        args3           : args.args3 ?: '',
+        publish_by_meta : args.publish_by_meta ?: [],
+        publish_dir     : args.publish_dir ?: '',
+        publish_files   : args.publish_files,
+        suffix          : args.suffix ?: ''
+    ]
+}
+
+//
+// Tidy up and join elements of a list to return a path string.
+// Null, empty and whitespace-only entries are dropped; every remaining entry is
+// trimmed and stripped of leading and trailing slashes before being joined with '/'
+//
+def getPathFromList(path_list) {
+    return path_list
+        .findAll { item -> item?.trim() }                                  // Groovy truth: null and blank entries are skipped
+        .collect { it.trim().replaceAll("^[/]+|[/]+\$", "") }              // Trim whitespace, leading and trailing slashes
+        .join('/')
+}
+
+//
+// 'saveAs' helper for publishDir: turn a produced file name into its publish path.
+// Returns null for '*.version.txt' files, when 'publish_files' is a Map without a
+// matching suffix, and when 'publish_files' is set to anything that is not a Map
+//
+def saveFiles(Map args) {
+    // Version files never get a publish path
+    if (args.filename.endsWith('.version.txt')) {
+        return null
+    }
+
+    def opts     = initOptions(args.options)
+    def segments = [ opts.publish_dir ?: args.publish_dir ]
+
+    // Optionally add one path segment per requested meta key
+    if (opts.publish_by_meta) {
+        def meta_keys = opts.publish_by_meta instanceof List ? opts.publish_by_meta : args.publish_by_meta
+        for (meta_key in meta_keys) {
+            if (!(args.meta && meta_key instanceof String)) {
+                continue
+            }
+            // Default to the key itself; use the meta value when the key is present
+            def segment = meta_key
+            if (args.meta.containsKey(meta_key)) {
+                def meta_value = args.meta[meta_key]
+                segment = meta_value instanceof Boolean ? "${meta_key}_${meta_value}".toString() : meta_value
+            }
+            // Anything that is not a String collapses to an empty segment
+            segments.add(segment instanceof String ? segment : '')
+        }
+    }
+
+    // No file filter configured: publish everything under the assembled path
+    if (opts.publish_files == null) {
+        return "${getPathFromList(segments)}/$args.filename"
+    }
+
+    // Suffix -> sub-directory mapping: the first matching suffix wins
+    if (opts.publish_files instanceof Map) {
+        for (rule in opts.publish_files) {
+            if (args.filename.endsWith(rule.key)) {
+                def target = segments.collect()
+                target.add(rule.value)
+                return "${getPathFromList(target)}/$args.filename"
+            }
+        }
+    }
+    return null
+}
--- a/software/ensemblvep/main.nf
+++ b/software/ensemblvep/main.nf
@ -0,0 +1,62 @@
+// Import generic module functions
+include { initOptions; saveFiles; getSoftwareName } from './functions'
+
+// Module options; an empty map unless overridden by the including workflow.
+// initOptions fills in a default for every option that is not set.
+params.options = [:]
+options = initOptions(params.options)
+// false (default): the process runs in the nfcore/vep container tagged by params.vep_tag
+// true           : the process runs in the biocontainers image and reads the staged 'cache' input
+params.use_cache = false
+// Tag of the nfcore/vep image; only read when params.use_cache is false
+params.vep_tag = ""
+
+//
+// Annotate a VCF with Ensembl VEP.
+// Inputs : [ meta, vcf ], assembly name, species name, cache version and a cache path
+//          (the cache path is only read when params.use_cache is true)
+// Outputs: annotated VCF, VEP HTML summary and a file holding the ensembl-vep version
+//
+process ENSEMBLVEP {
+    label 'process_medium'
+    publishDir "${params.outdir}",
+        mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) }
+
+    conda (params.enable_conda ? "bioconda::ensembl-vep=104.3" : null)
+    // use_cache = true : generic biocontainers image, cache taken from the staged 'cache' input
+    // use_cache = false: nfcore/vep image selected by params.vep_tag, cache read from /.vep
+    if (params.use_cache) {
+        if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+            container "https://depot.galaxyproject.org/singularity/ensembl-vep:104.3--pl5262h4a94de4_0"
+        } else {
+            container "quay.io/biocontainers/ensembl-vep:104.3--pl5262h4a94de4_0"
+        }
+    } else {
+        container "nfcore/vep:${params.vep_tag}"
+    }
+
+    input:
+    tuple val(meta), path(vcf)
+    val   genome
+    val   species
+    val   cache_version
+    path  cache
+
+    output:
+    tuple val(meta), path("*.ann.vcf"), emit: vcf
+    path "*.summary.html"             , emit: report
+    path "*.version.txt"              , emit: version
+
+    script:
+    def software  = getSoftwareName(task.process)
+    def prefix    = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}"
+    // Staged cache inside the task work dir, or the fixed /.vep location of the nfcore/vep image
+    def dir_cache = params.use_cache ? "\${PWD}/${cache}" : "/.vep"
+    """
+    vep \\
+        -i $vcf \\
+        -o ${prefix}.ann.vcf \\
+        $options.args \\
+        --assembly $genome \\
+        --species $species \\
+        --cache \\
+        --cache_version $cache_version \\
+        --dir_cache $dir_cache \\
+        --fork $task.cpus \\
+        --format vcf \\
+        --stats_file ${prefix}.summary.html
+
+    # Keep only the ensembl-vep version from the 'Versions:' section of the help text
+    echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//' > ${software}.version.txt
+    """
+}
--- a/software/ensemblvep/meta.yml
+++ b/software/ensemblvep/meta.yml
@ -0,0 +1,64 @@
+name: ENSEMBLVEP
+description: Ensembl Variant Effect Predictor (VEP)
+keywords:
+    - annotation
+tools:
+    - ensemblvep:
+        description: |
+            VEP determines the effect of your variants (SNPs, insertions, deletions, CNVs
+            or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions.
+        homepage: https://www.ensembl.org/info/docs/tools/vep/index.html
+        documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html
+params:
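+    - use_cache:
+        type: boolean
+        description: |
+          Set to true to annotate with the cache staged through the `cache` input
+          (generic biocontainers image). When false, the module runs in the
+          nfcore/vep container selected by `vep_tag` and reads the cache from
+          /.vep inside that image, which does not work with conda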
+    - vep_tag:
+        type: value
+        description: |
+          Specify the tag for the container
+          https://hub.docker.com/r/nfcore/vep/tags
+input:
+    - meta:
+        type: map
+        description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false ]
+    - vcf:
+        type: file
+        description: |
+            vcf to annotate
+    - genome:
+        type: value
+        description: |
+            which genome to annotate with
+    - species:
+        type: value
+        description: |
+            which species to annotate with
+    - cache_version:
+        type: value
+        description: |
+            which version of the cache to annotate with
+    - cache:
+        type: file
+        description: |
+            path to VEP cache (optional)
+output:
+    - vcf:
+        type: file
+        description: |
+            annotated vcf
+        pattern: "*.ann.vcf"
+    - report:
+        type: file
+        description: VEP report file
+        pattern: "*.summary.html"
+    - version:
+        type: file
+        description: File containing software version
+        pattern: "*.{version.txt}"
+authors:
+    - "@maxulysse"
--- a/tests/config/pytest_software.yml
+++ b/tests/config/pytest_software.yml
@ -230,6 +230,10 @@ dshbio/splitgff3:
  - software/dshbio/splitgff3/**
  - tests/software/dshbio/splitgff3/**

+ensemblvep:
+  - software/ensemblvep/**
+  - tests/software/ensemblvep/**
+
 fastp:
  - software/fastp/**
  - tests/software/fastp/**
--- a/tests/software/ensemblvep/main.nf
+++ b/tests/software/ensemblvep/main.nf
@ -0,0 +1,12 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { ENSEMBLVEP } from '../../../software/ensemblvep/main.nf' addParams( vep_tag: '104.3.WBcel235', use_cache: false )
+
+// Runs ENSEMBLVEP on the sarscov2 test VCF with the WBcel235 / caenorhabditis_elegans
+// settings and cache version 104; no cache path is staged (empty list)
+workflow test_ensemblvep {
+    def vcf_file    = file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true)
+    def annotate_in = [ [ id:'test' ], [ vcf_file ] ] // [ meta map, [ vcf ] ]
+
+    ENSEMBLVEP ( annotate_in, "WBcel235", "caenorhabditis_elegans", "104", [] )
+}
--- a/tests/software/ensemblvep/test.yml
+++ b/tests/software/ensemblvep/test.yml
@ -0,0 +1,7 @@
+- name: ensemblvep test_ensemblvep
+  command: nextflow run tests/software/ensemblvep -entry test_ensemblvep -c tests/config/nextflow.config
+  tags:
+    - ensemblvep
+  files:
+    - path: output/ensemblvep/test.ann.vcf
+    - path: output/ensemblvep/test.summary.html