add ncbi-genome-download module (#980)

* add ncbi-genome-download module * Update modules/ncbigenomedownload/main.nf Co-authored-by: Gregor Sturm <mail@gregor-sturm.de> Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>
2024-12-22 02:58:17 +00:00 · 2021-11-15 11:32:53 -07:00 · 2021-11-15 11:32:53 -07:00 · 2294ff7826
commit 2294ff7826
parent ad46010385
6 changed files with 256 additions and 0 deletions
--- a/modules/ncbigenomedownload/functions.nf
+++ b/modules/ncbigenomedownload/functions.nf
@ -0,0 +1,78 @@
+//
+//  Utility functions used in nf-core DSL2 module files
+//
+
+//
+// Derive the lower-cased software tool name from a process name ($task.process):
+// take the last ':'-separated segment and keep the part before its first '_'
+//
+def getSoftwareName(task_process) {
+    def segments     = task_process.tokenize(':')
+    def process_name = segments[-1]
+    def tool_name    = process_name.tokenize('_')[0]
+    return tool_name.toLowerCase()
+}
+
+//
+// Return the module's process name: the last ':'-separated segment of $task.process
+//
+def getProcessName(task_process) {
+    def segments = task_process.tokenize(':')
+    return segments[-1]
+}
+
+//
+// Build the Groovy Map of options used by nf-core modules, filling in a default
+// for every entry that is unset (or otherwise false by Groovy truth) in `args`
+//
+def initOptions(Map args) {
+    // publish_files gets no default here, so it stays null when the caller does not set it
+    return [
+        args           : args.args ?: '',
+        args2          : args.args2 ?: '',
+        args3          : args.args3 ?: '',
+        publish_by_meta: args.publish_by_meta ?: [],
+        publish_dir    : args.publish_dir ?: '',
+        publish_files  : args.publish_files,
+        suffix         : args.suffix ?: ''
+    ]
+}
+
+//
+// Tidy up and join elements of a list to return a path string.
+// Null and blank entries are skipped; each remaining entry is trimmed of surrounding
+// whitespace and of leading/trailing slashes before being joined with '/'.
+//
+def getPathFromList(path_list) {
+    // Groovy truth treats both null and '' as false, so this drops null and blank entries
+    // without calling a method on a null result (the safe-navigation operator only guards one call)
+    def paths = path_list.findAll { item -> item?.trim() }
+    // Strip whitespace, then any run of slashes at the start or at the end of the entry
+    paths     = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") }
+    return paths.join('/')
+}
+
+//
+// Function to save/publish module results
+//
+// Builds the relative publish path for a single output file.
+// Keys read from `args` below: filename, options, publish_dir, meta, publish_by_meta.
+// Returns '<publish dir>[/<meta segments>][/<publish_files sub-directory>]/<filename>',
+// or null (presumably interpreted by publishDir's saveAs as "do not publish" — see Nextflow docs).
+//
+def saveFiles(Map args) {
+    def ioptions  = initOptions(args.options)
+    // Base directory: the module option publish_dir takes precedence over the caller-supplied one
+    def path_list = [ ioptions.publish_dir ?: args.publish_dir ]
+
+    // Do not publish versions.yml unless running from pytest workflow
+    if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) {
+        return null
+    }
+    if (ioptions.publish_by_meta) {
+        // Use the option itself when it is a List of keys; otherwise fall back to the caller-supplied key list
+        def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta
+        for (key in key_list) {
+            if (args.meta && key instanceof String) {
+                // The key name itself is used as the segment when meta has no such entry
+                def path = key
+                if (args.meta.containsKey(key)) {
+                    // Boolean meta values are rendered as '<key>_<value>'; other values are taken as-is
+                    path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key]
+                }
+                // Non-String values become an empty segment, which getPathFromList later removes
+                path = path instanceof String ? path : ''
+                path_list.add(path)
+            }
+        }
+    }
+    if (ioptions.publish_files instanceof Map) {
+        // Map of filename suffix -> sub-directory: the first suffix that matches decides the destination
+        for (ext in ioptions.publish_files) {
+            if (args.filename.endsWith(ext.key)) {
+                // Work on a copy so path_list itself is left untouched
+                def ext_list = path_list.collect()
+                ext_list.add(ext.value)
+                return "${getPathFromList(ext_list)}/$args.filename"
+            }
+        }
+    } else if (ioptions.publish_files == null) {
+        // publish_files not set: every file goes under the base path
+        return "${getPathFromList(path_list)}/$args.filename"
+    }
+    // No explicit return is reached when publish_files is a Map without a matching suffix,
+    // or is neither a Map nor null; the method then yields null
+}
--- a/modules/ncbigenomedownload/main.nf
+++ b/modules/ncbigenomedownload/main.nf
@ -0,0 +1,56 @@
+// Import generic module functions
+include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions'
+
+// Module options default to an empty map; a value supplied by the including script
+// (e.g. through addParams) is expected to take precedence — standard Nextflow params behaviour, confirm
+params.options = [:]
+// Script-level options map with defaults filled in by initOptions
+options        = initOptions(params.options)
+
+//
+// Run ncbi-genome-download and collect the downloaded files from the task directory.
+// Inputs : meta       - value whose `id` is used for the task tag
+//          accessions - file handed to the tool through `-A`; the flag is omitted when this input is empty
+// Outputs: one optional channel per file-name pattern (presumably because the files that appear
+//          depend on the formats requested through options.args), plus a mandatory versions.yml
+//
+process NCBIGENOMEDOWNLOAD {
+    tag "$meta.id"
+    label 'process_low'
+    publishDir "${params.outdir}",
+        mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) }
+
+    conda (params.enable_conda ? "bioconda::ncbi-genome-download=0.3.0" : null)
+    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+        container "https://depot.galaxyproject.org/singularity/ncbi-genome-download:0.3.0--pyh864c0ab_1"
+    } else {
+        container "quay.io/biocontainers/ncbi-genome-download:0.3.0--pyh864c0ab_1"
+    }
+
+    input:
+    val meta
+    path accessions
+
+    output:
+    tuple val(meta), path("*_genomic.gbff.gz")        , emit: gbk     , optional: true
+    tuple val(meta), path("*_genomic.fna.gz")         , emit: fna     , optional: true
+    tuple val(meta), path("*_rm.out.gz")              , emit: rm      , optional: true
+    tuple val(meta), path("*_feature_table.txt.gz")   , emit: features, optional: true
+    tuple val(meta), path("*_genomic.gff.gz")         , emit: gff     , optional: true
+    tuple val(meta), path("*_protein.faa.gz")         , emit: faa     , optional: true
+    tuple val(meta), path("*_protein.gpff.gz")        , emit: gpff    , optional: true
+    tuple val(meta), path("*_wgsmaster.gbff.gz")      , emit: wgs_gbk , optional: true
+    tuple val(meta), path("*_cds_from_genomic.fna.gz"), emit: cds     , optional: true
+    tuple val(meta), path("*_rna.fna.gz")             , emit: rna     , optional: true
+    tuple val(meta), path("*_rna_from_genomic.fna.gz"), emit: rna_fna , optional: true
+    tuple val(meta), path("*_assembly_report.txt")    , emit: report  , optional: true
+    tuple val(meta), path("*_assembly_stats.txt")     , emit: stats   , optional: true
+    path "versions.yml"                               , emit: versions
+
+    script:
+    // Only add -A when an accession file was actually provided (an empty input is false by Groovy truth)
+    def accessions_opt = accessions ? "-A ${accessions}" : ""
+    """
+    ncbi-genome-download \\
+        $options.args \\
+        $accessions_opt \\
+        --output-folder ./ \\
+        --flat-output
+
+    cat <<-END_VERSIONS > versions.yml
+    ${getProcessName(task.process)}:
+        ${getSoftwareName(task.process)}: \$( ncbi-genome-download --version )
+    END_VERSIONS
+    """
+}
--- a/modules/ncbigenomedownload/meta.yml
+++ b/modules/ncbigenomedownload/meta.yml
@ -0,0 +1,91 @@
+name: ncbigenomedownload
+description: A tool to quickly download assemblies from NCBI's Assembly database
+keywords:
+  - fasta
+  - download
+  - assembly
+tools:
+  - ncbigenomedownload:
+      description: Download genome files from the NCBI FTP server.
+      homepage: https://github.com/kblin/ncbi-genome-download
+      documentation: https://github.com/kblin/ncbi-genome-download
+      tool_dev_url: https://github.com/kblin/ncbi-genome-download
+      doi: ""
+      licence: ['Apache Software License']
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - accessions:
+      type: file
+      description: List of accessions (one per line) to download
+      pattern: "*.txt"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - gbk:
+      type: file
+      description: GenBank format of the genomic sequence(s) in the assembly
+      pattern: "*_genomic.gbff.gz"
+  - fna:
+      type: file
+      description: FASTA format of the genomic sequence(s) in the assembly.
+      pattern: "*_genomic.fna.gz"
+  - rm:
+      type: file
+      description: RepeatMasker output for eukaryotes.
+      pattern: "*_rm.out.gz"
+  - features:
+      type: file
+      description: Tab-delimited text file reporting locations and attributes for a subset of annotated features
+      pattern: "*_feature_table.txt.gz"
+  - gff:
+      type: file
+      description: Annotation of the genomic sequence(s) in GFF3 format
+      pattern: "*_genomic.gff.gz"
+  - faa:
+      type: file
+      description: FASTA format of the accessioned protein products annotated on the genome assembly.
+      pattern: "*_protein.faa.gz"
+  - gpff:
+      type: file
+      description: GenPept format of the accessioned protein products annotated on the genome assembly.
+      pattern: "*_protein.gpff.gz"
+  - wgs_gbk:
+      type: file
+      description: GenBank flat file format of the WGS master for the assembly
+      pattern: "*_wgsmaster.gbff.gz"
+  - cds:
+      type: file
+      description: FASTA format of the nucleotide sequences corresponding to all CDS features annotated on the assembly
+      pattern: "*_cds_from_genomic.fna.gz"
+  - rna:
+      type: file
+      description: FASTA format of accessioned RNA products annotated on the genome assembly
+      pattern: "*_rna.fna.gz"
+  - rna_fna:
+      type: file
+      description: FASTA format of the nucleotide sequences corresponding to all RNA features annotated on the assembly
+      pattern: "*_rna_from_genomic.fna.gz"
+  - report:
+      type: file
+      description: Tab-delimited text file reporting the name, role and sequence accession.version for objects in the assembly
+      pattern: "*_assembly_report.txt"
+  - stats:
+      type: file
+      description: Tab-delimited text file reporting statistics for the assembly
+      pattern: "*_assembly_stats.txt"
+
+authors:
+  - "@rpetit3"
--- a/tests/config/pytest_modules.yml
+++ b/tests/config/pytest_modules.yml
@ -919,6 +919,10 @@ nanoplot:
  - modules/nanoplot/**
  - tests/modules/nanoplot/**

+ncbigenomedownload:
+  - modules/ncbigenomedownload/**
+  - tests/modules/ncbigenomedownload/**
+
 nextclade:
  - modules/nextclade/**
  - tests/modules/nextclade/**
--- a/tests/modules/ncbigenomedownload/main.nf
+++ b/tests/modules/ncbigenomedownload/main.nf
@ -0,0 +1,16 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { NCBIGENOMEDOWNLOAD } from '../../../modules/ncbigenomedownload/main.nf' addParams( options: [ args: '-A GCF_000013425.1 --formats genbank,fasta,assembly-stats bacteria '] )
+
+workflow test_ncbigenomedownload {
+    // NOTE(review): the meta map is wrapped in an outer list while the process declares a bare
+    // `val meta` input — confirm the extra list level is intended
+    sample_meta = [ [ id:'test', single_end:false ] ]
+
+    // No accession file is passed; the accession is given through `args` in the include statement
+    accession_file = []
+
+    NCBIGENOMEDOWNLOAD ( sample_meta, accession_file )
+}
+
+
--- a/tests/modules/ncbigenomedownload/test.yml
+++ b/tests/modules/ncbigenomedownload/test.yml
@ -0,0 +1,11 @@
+- name: ncbigenomedownload test_ncbigenomedownload
+  command: nextflow run tests/modules/ncbigenomedownload -entry test_ncbigenomedownload -c tests/config/nextflow.config
+  tags:
+    - ncbigenomedownload
+  files:
+    - path: output/ncbigenomedownload/GCF_000013425.1_ASM1342v1_assembly_stats.txt
+      md5sum: f78c6a373130e50fac5472962a5fdf44
+    - path: output/ncbigenomedownload/GCF_000013425.1_ASM1342v1_genomic.fna.gz
+      md5sum: b086eb1020e7df022afa545dc6d93297
+    - path: output/ncbigenomedownload/GCF_000013425.1_ASM1342v1_genomic.gbff.gz
+      md5sum: ae2da70e32c783858e6c60c72e9eeb7a