add ncbi-genome-download module (#980)

* add ncbi-genome-download module * Update modules/ncbigenomedownload/main.nf Co-authored-by: Gregor Sturm <mail@gregor-sturm.de> Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>
2024-12-22 11:08:17 +00:00 · 2021-11-15 11:32:53 -07:00 · 2021-11-15 11:32:53 -07:00 · 2294ff7826
commit 2294ff7826
parent ad46010385
6 changed files with 256 additions and 0 deletions
--- a/modules/ncbigenomedownload/functions.nf
+++ b/modules/ncbigenomedownload/functions.nf
@ -0,0 +1,78 @@
 //
 //  Utility functions used in nf-core DSL2 module files
 //
 //
 // Derive the lower-case software tool name from $task.process
 //
 // The process name is the last ':'-separated token of the input; the tool
 // name is whatever precedes the first '_' in that token.
 // e.g. 'NFCORE:PIPELINE:BWA_MEM' -> 'bwa'
 //
 def getSoftwareName(task_process) {
    def process_name = task_process.tokenize(':')[-1]
    def tool_name    = process_name.tokenize('_')[0]
    return tool_name.toLowerCase()
 }
 //
 // Return the module (process) name, i.e. the last ':'-separated token of $task.process
 // e.g. 'NFCORE:PIPELINE:BWA_MEM' -> 'BWA_MEM'
 //
 def getProcessName(task_process) {
    def name_parts = task_process.tokenize(':')
    return name_parts[-1]
 }
 //
 // Build the Groovy Map of options understood by nf-core modules, filling in
 // a default for every key the caller left unset
 //
 // Falsy values fall back to '' (or [] for publish_by_meta); publish_files is
 // copied through untouched so that null, false and a Map stay distinguishable.
 //
 def initOptions(Map args) {
    return [
        args            : args.args ?: '',
        args2           : args.args2 ?: '',
        args3           : args.args3 ?: '',
        publish_by_meta : args.publish_by_meta ?: [],
        publish_dir     : args.publish_dir ?: '',
        publish_files   : args.publish_files,
        suffix          : args.suffix ?: ''
    ]
 }
 //
 // Tidy up and join elements of a list to return a path string
 //
 // Null, empty and whitespace-only entries are dropped. Each remaining entry is
 // trimmed of surrounding whitespace and of leading and trailing '/' characters,
 // then the entries are joined with '/'.
 // e.g. ['tool/', null, ' /sample/ ', ''] -> 'tool/sample'
 //
 def getPathFromList(path_list) {
    // Groovy truth on the trimmed value rejects null and '' alike; calling
    // isEmpty() after a safe navigation would throw on a null entry instead
    def paths = path_list.findAll { item -> item?.trim() }
    paths     = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") }
    return paths.join('/')
 }
 //
 // Work out where a module result file should be published
 //
 // Expects a Map with the keys filename, options, publish_dir, meta and
 // publish_by_meta. Returns the relative publish path for args.filename, or
 // null when the file must not be published.
 //
 def saveFiles(Map args) {
    def opts     = initOptions(args.options)
    def segments = [ opts.publish_dir ?: args.publish_dir ]

    // versions.yml is only published when running from the pytest workflow
    def is_versions_file = args.filename.equals('versions.yml')
    if (is_versions_file && !System.getenv("NF_CORE_MODULES_TEST")) {
        return null
    }

    // Optionally append one path segment per requested meta key
    if (opts.publish_by_meta) {
        def meta_keys = opts.publish_by_meta instanceof List ? opts.publish_by_meta : args.publish_by_meta
        for (meta_key in meta_keys) {
            if (!args.meta || !(meta_key instanceof String)) {
                continue
            }
            // Keys missing from meta are used verbatim as the segment
            def segment = meta_key
            if (args.meta.containsKey(meta_key)) {
                def meta_value = args.meta[meta_key]
                // Booleans become '<key>_<value>'; other values are used as they are
                segment = meta_value instanceof Boolean ? "${meta_key}_${meta_value}".toString() : meta_value
            }
            // Anything that is not a plain String contributes an empty segment
            segments.add(segment instanceof String ? segment : '')
        }
    }

    // A Map of suffix -> sub-directory publishes only files matching a suffix
    if (opts.publish_files instanceof Map) {
        for (rule in opts.publish_files) {
            if (!args.filename.endsWith(rule.key)) {
                continue
            }
            def target = segments + [ rule.value ]
            return "${getPathFromList(target)}/$args.filename"
        }
        return null
    }

    // publish_files left unset: publish every file
    if (opts.publish_files == null) {
        return "${getPathFromList(segments)}/$args.filename"
    }

    // Any other value (e.g. false) disables publishing
    return null
 }
--- a/modules/ncbigenomedownload/main.nf
+++ b/modules/ncbigenomedownload/main.nf
@ -0,0 +1,56 @@
 // Import generic module functions
 include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions'
 // Options for this module; an including workflow can supply them with addParams( options: [...] )
 params.options = [:]
 // Complete the supplied options with defaults for every key that was left unset
 options        = initOptions(params.options)
 //
 // Download assembly files from NCBI with ncbi-genome-download
 //
 // Inputs:
 //   meta       - Groovy Map containing sample information; meta.id is used as the task tag
 //   accessions - file of accessions, passed to the tool with -A; may be empty, in which
 //                case the selection has to come from options.args
 // Outputs:
 //   one optional channel per file suffix found in the work directory, each emitted
 //   together with meta, plus versions.yml
 //
 process NCBIGENOMEDOWNLOAD {
    tag "$meta.id"
    label 'process_low'
    publishDir "${params.outdir}",
        mode: params.publish_dir_mode,
        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) }
    conda (params.enable_conda ? "bioconda::ncbi-genome-download=0.3.0" : null)
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container "https://depot.galaxyproject.org/singularity/ncbi-genome-download:0.3.0--pyh864c0ab_1"
    } else {
        container "quay.io/biocontainers/ncbi-genome-download:0.3.0--pyh864c0ab_1"
    }
    input:
    val meta
    path accessions
    output:
    // Every data output is optional: which files exist depends on the formats requested
    tuple val(meta), path("*_genomic.gbff.gz")        , emit: gbk     , optional: true
    tuple val(meta), path("*_genomic.fna.gz")         , emit: fna     , optional: true
    tuple val(meta), path("*_rm.out.gz")              , emit: rm      , optional: true
    tuple val(meta), path("*_feature_table.txt.gz")   , emit: features, optional: true
    tuple val(meta), path("*_genomic.gff.gz")         , emit: gff     , optional: true
    tuple val(meta), path("*_protein.faa.gz")         , emit: faa     , optional: true
    tuple val(meta), path("*_protein.gpff.gz")        , emit: gpff    , optional: true
    tuple val(meta), path("*_wgsmaster.gbff.gz")      , emit: wgs_gbk , optional: true
    tuple val(meta), path("*_cds_from_genomic.fna.gz"), emit: cds     , optional: true
    tuple val(meta), path("*_rna.fna.gz")             , emit: rna     , optional: true
    tuple val(meta), path("*_rna_from_genomic.fna.gz"), emit: rna_fna , optional: true
    tuple val(meta), path("*_assembly_report.txt")    , emit: report  , optional: true
    tuple val(meta), path("*_assembly_stats.txt")     , emit: stats   , optional: true
    path "versions.yml"                               , emit: versions
    script:
    // Only add -A when an accessions file was actually staged
    def accessions_opt = accessions ? "-A ${accessions}" : ""
    """
    ncbi-genome-download \\
        $options.args \\
        $accessions_opt \\
        --output-folder ./ \\
        --flat-output
    cat <<-END_VERSIONS > versions.yml
    ${getProcessName(task.process)}:
        ${getSoftwareName(task.process)}: \$( ncbi-genome-download --version )
    END_VERSIONS
    """
 }
--- a/modules/ncbigenomedownload/meta.yml
+++ b/modules/ncbigenomedownload/meta.yml
@ -0,0 +1,91 @@
 name: ncbigenomedownload
 description: A tool to quickly download assemblies from NCBI's Assembly database
 keywords:
  - fasta
  - download
  - assembly
 tools:
  - ncbigenomedownload:
      description: Download genome files from the NCBI FTP server.
      homepage: https://github.com/kblin/ncbi-genome-download
      documentation: https://github.com/kblin/ncbi-genome-download
      tool_dev_url: https://github.com/kblin/ncbi-genome-download
      doi: ""
      licence: ['Apache Software License']
 input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - accessions:
      type: file
      description: List of accessions (one per line) to download
      pattern: "*.txt"
 output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - gbk:
      type: file
      description: GenBank format of the genomic sequence(s) in the assembly
      pattern: "*_genomic.gbff.gz"
  - fna:
      type: file
      description: FASTA format of the genomic sequence(s) in the assembly.
      pattern: "*_genomic.fna.gz"
  - rm:
      type: file
      description: RepeatMasker output for eukaryotes.
      pattern: "*_rm.out.gz"
  - features:
      type: file
      description: Tab-delimited text file reporting locations and attributes for a subset of annotated features
      pattern: "*_feature_table.txt.gz"
  - gff:
      type: file
      description: Annotation of the genomic sequence(s) in GFF3 format
      pattern: "*_genomic.gff.gz"
  - faa:
      type: file
      description: FASTA format of the accessioned protein products annotated on the genome assembly.
      pattern: "*_protein.faa.gz"
  - gpff:
      type: file
      description: GenPept format of the accessioned protein products annotated on the genome assembly.
      pattern: "*_protein.gpff.gz"
  - wgs_gbk:
      type: file
      description: GenBank flat file format of the WGS master for the assembly
      pattern: "*_wgsmaster.gbff.gz"
  - cds:
      type: file
      description: FASTA format of the nucleotide sequences corresponding to all CDS features annotated on the assembly
      pattern: "*_cds_from_genomic.fna.gz"
  - rna:
      type: file
      description: FASTA format of accessioned RNA products annotated on the genome assembly
      pattern: "*_rna.fna.gz"
  - rna_fna:
      type: file
      description: FASTA format of the nucleotide sequences corresponding to all RNA features annotated on the assembly
      pattern: "*_rna_from_genomic.fna.gz"
  - report:
      type: file
      description: Tab-delimited text file reporting the name, role and sequence accession.version for objects in the assembly
      pattern: "*_assembly_report.txt"
  - stats:
      type: file
      description: Tab-delimited text file reporting statistics for the assembly
      pattern: "*_assembly_stats.txt"
 authors:
  - "@rpetit3"
--- a/tests/config/pytest_modules.yml
+++ b/tests/config/pytest_modules.yml
@ -919,6 +919,10 @@ nanoplot:
  - modules/nanoplot/**
  - tests/modules/nanoplot/**
 ncbigenomedownload:
  - modules/ncbigenomedownload/**
  - tests/modules/ncbigenomedownload/**
 nextclade:
  - modules/nextclade/**
  - tests/modules/nextclade/**
--- a/tests/modules/ncbigenomedownload/main.nf
+++ b/tests/modules/ncbigenomedownload/main.nf
@ -0,0 +1,16 @@
 #!/usr/bin/env nextflow
 nextflow.enable.dsl = 2
 include { NCBIGENOMEDOWNLOAD } from '../../../modules/ncbigenomedownload/main.nf' addParams( options: [ args: '-A GCF_000013425.1 --formats genbank,fasta,assembly-stats bacteria '] )
 //
 // Download one assembly: the accession, formats and group come from the module
 // `args` set in the include statement, so no accessions file is passed
 //
 workflow test_ncbigenomedownload {
    // The process declares `val meta`, so pass the sample map itself rather than a
    // list wrapping it; with a list, meta.id would evaluate to ['test']
    input      = [ id:'test', single_end:false ]
    accessions = []
    NCBIGENOMEDOWNLOAD ( input, accessions )
 }
--- a/tests/modules/ncbigenomedownload/test.yml
+++ b/tests/modules/ncbigenomedownload/test.yml
@ -0,0 +1,11 @@
 - name: ncbigenomedownload test_ncbigenomedownload
  command: nextflow run tests/modules/ncbigenomedownload -entry test_ncbigenomedownload -c tests/config/nextflow.config
  tags:
    - ncbigenomedownload
  files:
    - path: output/ncbigenomedownload/GCF_000013425.1_ASM1342v1_assembly_stats.txt
      md5sum: f78c6a373130e50fac5472962a5fdf44
    - path: output/ncbigenomedownload/GCF_000013425.1_ASM1342v1_genomic.fna.gz
      md5sum: b086eb1020e7df022afa545dc6d93297
    - path: output/ncbigenomedownload/GCF_000013425.1_ASM1342v1_genomic.gbff.gz
      md5sum: ae2da70e32c783858e6c60c72e9eeb7a