Merge pull request #146 from nf-core/feat-bracken

add Bracken to the profiling workflow
2024-11-25 18:09:55 +00:00 · 2022-10-27 17:35:06 +02:00 · 2022-10-27 17:35:06 +02:00 · bf89525bc2
commit bf89525bc2
parent 63c260bfbc e658fab430
17 changed files with 216 additions and 8 deletions
--- a/CITATIONS.md
+++ b/CITATIONS.md
@ -36,6 +36,10 @@
  > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0.
 - [Bracken](https://doi.org/10.7717/peerj-cs.104)
  > Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: Estimating species abundance in metagenomics data. PeerJ Computer Science, 3, e104. doi: 10.7717/peerj-cs.104
 - [Krona](https://doi.org/10.1186/1471-2105-12-385)
  > Ondov, Brian D., Nicholas H. Bergman, and Adam M. Phillippy. 2011. Interactive metagenomic visualization in a Web browser. BMC Bioinformatics 12 (1): 385. doi: 10.1186/1471-2105-12-385.
--- a/conf/modules.config
+++ b/conf/modules.config
@ -295,7 +295,7 @@ process {
    }
    withName: KRAKEN2_KRAKEN2 {
-        ext.args = { "${meta.db_params}" }
+        ext.args = params.kraken2_save_minimizers ? { "${meta.db_params} --report-minimizer-data" } : { "${meta.db_params}" }
        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
        publishDir = [
            path: { "${params.outdir}/kraken2/${meta.db_name}/" },
@ -304,6 +304,16 @@ process {
        ]
    }
    // Per-process settings for the Bracken step that post-processes kraken2 reports.
    withName: BRACKEN_BRACKEN {
        // A failing Bracken task is ignored rather than aborting the pipeline run.
        // NOTE(review): presumably because Bracken can fail on samples with too few classified reads -- confirm.
        errorStrategy = 'ignore'
        // Output prefix is "<id>-<db_name>" when runs are merged, otherwise the run accession is included as well.
        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
        // Publish only the '*.tsv' outputs, into one sub-directory per database name.
        publishDir = [
            path: { "${params.outdir}/bracken/${meta.db_name}/" },
            mode: params.publish_dir_mode,
            pattern: '*.tsv'
        ]
    }
    withName: KRAKENTOOLS_COMBINEKREPORTS {
        ext.prefix = { "kraken2_${meta.id}_combined_reports" }
        publishDir = [
--- a/conf/test.config
+++ b/conf/test.config
@ -34,6 +34,7 @@ params {
    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
    run_kaiju                             = true
    run_kraken2                           = true
    run_bracken                           = true
    run_malt                              = true
    run_metaphlan3                        = true
    run_centrifuge                        = true
--- a/conf/test_motus.config
+++ b/conf/test_motus.config
@ -33,6 +33,7 @@ params {
    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
    run_kaiju                             = false
    run_kraken2                           = false
    run_bracken                           = false
    run_malt                              = false
    run_metaphlan3                        = false
    run_centrifuge                        = false
--- a/conf/test_nopreprocessing.config
+++ b/conf/test_nopreprocessing.config
@ -33,6 +33,7 @@ params {
    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
    run_kaiju                             = true
    run_kraken2                           = true
    run_bracken                           = true
    run_malt                              = true
    run_metaphlan3                        = true
    run_centrifuge                        = true
--- a/conf/test_noprofiling.config
+++ b/conf/test_noprofiling.config
@ -34,6 +34,7 @@ params {
    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
    run_kaiju                             = false
    run_kraken2                           = false
    run_bracken                           = false
    run_malt                              = false
    run_metaphlan3                        = false
    run_centrifuge                        = false
--- a/conf/test_nothing.config
+++ b/conf/test_nothing.config
@ -33,6 +33,7 @@ params {
    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
    run_kaiju                             = false
    run_kraken2                           = false
    run_bracken                           = false
    run_malt                              = false
    run_metaphlan3                        = false
    run_centrifuge                        = false
--- a/conf/test_pep.config
+++ b/conf/test_pep.config
@ -19,6 +19,7 @@ params {
    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
    run_kaiju                             = true
    run_kraken2                           = true
    run_bracken                           = true
    run_malt                              = true
    run_metaphlan3                        = true
    run_centrifuge                        = true
--- a/docs/usage.md
+++ b/docs/usage.md
@ -74,13 +74,13 @@ The pipeline takes the locations and specific profiling parameters of the tool o
 > ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user. See below for more information of the expected database files.
-An example database sheet can look as follows, where 4 tools are being used, and `malt` and `kraken2` will be used against two databases each.
+An example database sheet can look as follows, where several tools are being used, and `malt` and `kraken2` will be used against two databases each. `kraken2` is run against two databases because specifying `bracken` implies first running `kraken2` on the same database.
 ```console
 tool,db_name,db_params,db_path
 malt,malt85,-id 85,/<path>/<to>/malt/testdb-malt/
 malt,malt95,-id 90,/<path>/<to>/malt/testdb-malt.tar.gz
-kraken2,db1,,/<path>/<to>/kraken2/testdb-kraken2.tar.gz
+bracken,db1,,/<path>/<to>/bracken/testdb-bracken.tar.gz
 kraken2,db2,--quick,/<path>/<to>/kraken2/testdb-kraken2.tar.gz
 centrifuge,db1,,/<path>/<to>/centrifuge/minigut_cf.tar.gz
 metaphlan3,db1,,/<path>/<to>/metaphlan3/metaphlan_database/
@ -91,8 +91,8 @@ Column specifications are as follows:
 | Column      | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
 | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tool`      | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required].                                                                                                                                                                                                                                                                                                                                                 |
+| `tool`      | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. Please note that `bracken` also implies running `kraken2` on the same database.                                                                                                                                                                                                                                                                 |
-| `db_name`   | A unique name of the particular database [required].                                                                                                                                                                                                                                                                                                                                                                                                           |
+| `db_name`   | A unique name per tool for the particular database [required]. Please note that names need to be unique across both `kraken2` and `bracken` as well, even if re-using the same database.                                                                                                                                                                                                                                                                       |
 | `db_params` | Any parameters of the given taxonomic profiler that you wish to specify that the taxonomic profiling tool should use when profiling against this specific database. Can be empty to use taxonomic profiler defaults. Must not be surrounded by quotes [required]. We generally do not recommend specifying parameters here that turn on/off saving of output files or specifying particular file extensions - this should be already addressed via pipeline parameters. |
 | `db_path`   | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required].                                                                                                                                                                                                                          |
@ -116,6 +116,15 @@ Expected (uncompressed) database files for each tool are as follows:
  - `opts.k2d`
  - `hash.k2d`
  - `taxo.k2d`
 - **Bracken** output of a combined `kraken2-` and `bracken-build` process. Please see the [documentation on Bracken](https://github.com/jenniferlu717/Bracken#running-bracken-easy-version) for details. The output is a directory containing files per expected sequencing read length, similar to:
  - `hash.k2d`
  - `opts.k2d`
  - `taxo.k2d`
  - `database.kraken`
  - `database100mers.kmer_distrib`
  - `database100mers.kraken`
  - `database150mers.kmer_distrib`
  - `database150mers.kraken`
 - **Centrifuge** output of `centrifuge-build`. A directory containing:
  - `<database_name>.<number>.cf`
  - `<database_name>.<number>.cf`
--- a/modules.json
+++ b/modules.json
@ -21,6 +21,10 @@
                        "branch": "master",
                        "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
                    },
                    "bracken/bracken": {
                        "branch": "master",
                        "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
                    },
                    "cat/fastq": {
                        "branch": "master",
                        "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
--- a/modules/local/kraken2_standard_report.nf
+++ b/modules/local/kraken2_standard_report.nf
@ -0,0 +1,32 @@
 // Reduce a kraken2 report to a subset of its tab-separated columns (fields 1-3 and 6-8).
 // The calling workflow uses this to turn the eight column report (written when minimizer
 // data is saved) back into the six column layout.
 //
 // Input:  tuple of sample meta map and the kraken2 report file.
 // Output: `report`   - tuple of the unchanged meta map and the reduced report file
 //         `versions` - versions.yml recording the version of `cut`
 process KRAKEN2_STANDARD_REPORT {
    tag "$meta.id"
    label 'process_single'
    // NOTE(review): the conda environment pins `sed`, while the main command is `cut`
    // (sed is only used to parse the version string) -- confirm this is intended.
    conda (params.enable_conda ? 'conda-forge::sed=4.8' : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/biocontainers/v1.2.0_cv2/biocontainers_v1.2.0_cv2.img' :
        'biocontainers/biocontainers:v1.2.0_cv2' }"
    input:
    tuple val(meta), path(report)
    output:
    // `result` is the file name computed in the script section below.
    tuple val(meta), path(result), emit: report
    path 'versions.yml'          , emit: versions
    when:
    task.ext.when == null || task.ext.when
    script:
    // Fall back to the sample id when no prefix is configured for this process.
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Declared without `def` so that the output block above can reference it.
    result = "${prefix}_standardized.kraken2.report.txt"
    // Keep fields 1-3 and 6-8 of the report, i.e. drop fields 4 and 5.
    """
    cut -f1-3,6-8 '${report}' > '${result}'
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        cut: \$(echo \$(cut --version 2>&1) | sed 's/^.*(GNU coreutils) //; s/ Copyright.*\$//')
    END_VERSIONS
    """
 }
--- a/modules/nf-core/bracken/bracken/main.nf
+++ b/modules/nf-core/bracken/bracken/main.nf
@ -0,0 +1,45 @@
 // Run Bracken on a kraken2 report to re-estimate abundances at a given taxonomic level.
 //
 // Input:  tuple of sample meta map and the kraken2 report; the database passed to `bracken -d`.
 //         Optional meta keys `threshold`, `taxonomic_level` and `read_length` set the
 //         default `-t`, `-l` and `-r` arguments (defaults: 10, 'S', 150).
 // Output: `reports`  - tuple of meta map and the Bracken TSV report
 //         `versions` - versions.yml with the hard-coded Bracken version
 process BRACKEN_BRACKEN {
    tag "$meta.id"
    label 'process_low'
    // WARN: Version information not provided by tool on CLI.
    // Please update version string below when bumping container versions.
    conda (params.enable_conda ? "bioconda::bracken=2.7" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/bracken:2.7--py39hc16433a_0':
        'quay.io/biocontainers/bracken:2.7--py39hc16433a_0' }"
    input:
    tuple val(meta), path(kraken_report)
    path database
    output:
    // `bracken_report` is the file name computed in the script section below.
    tuple val(meta), path(bracken_report), emit: reports
    path "versions.yml"          , emit: versions
    when:
    task.ext.when == null || task.ext.when
    script:
    // Groovy truth treats the number 0 as false, so a plain Elvis (`?:`) would silently
    // replace an explicit threshold of 0 with 10. Keep any numeric value as given and
    // only fall back to the default for null/empty values.
    def threshold = meta.threshold instanceof Number ? meta.threshold : (meta.threshold ?: 10)
    def taxonomic_level = meta.taxonomic_level ?: 'S'
    def read_length = meta.read_length ?: 150
    // NOTE: a configured `task.ext.args` replaces the default -l/-t/-r arguments entirely,
    // while the output file name below is still derived from `taxonomic_level`.
    def args = task.ext.args ?: "-l ${taxonomic_level} -t ${threshold} -r ${read_length}"
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Declared without `def` so that the output block above can reference it.
    bracken_report = "${prefix}_${taxonomic_level}.tsv"
    // WARN: Version information not provided by tool on CLI.
    // Please update version string below when bumping container versions.
    def VERSION = '2.7'
    """
    bracken \\
        ${args} \\
        -d '${database}' \\
        -i '${kraken_report}' \\
        -o '${bracken_report}'
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bracken: ${VERSION}
    END_VERSIONS
    """
 }
--- a/modules/nf-core/bracken/bracken/meta.yml
+++ b/modules/nf-core/bracken/bracken/meta.yml
@ -0,0 +1,45 @@
 name: bracken_bracken
 description: Re-estimate taxonomic abundance of metagenomic samples analyzed by kraken.
 keywords:
  - sort
 tools:
  - bracken:
      description: Bracken (Bayesian Reestimation of Abundance with KrakEN) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample.
      homepage: https://ccb.jhu.edu/software/bracken/
      documentation: https://ccb.jhu.edu/software/bracken/index.shtml?t=manual
      tool_dev_url: https://github.com/jenniferlu717/Bracken
      doi: "10.7717/peerj-cs.104"
      licence: ["GPL v3"]
 input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - kraken_report:
      type: file
      description: TSV file with six columns coming from kraken2 output
      pattern: "*.{tsv}"
  - database:
      type: file
      description: Directory containing the kraken2/Bracken files for analysis
      pattern: "*"
 output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - reports:
      type: file
      description: TSV output report of the re-estimated abundances
      pattern: "*.{tsv}"
 authors:
  - "@Midnighter"
--- a/nextflow.config
+++ b/nextflow.config
@ -116,6 +116,10 @@ params {
    run_kraken2                     = false
    kraken2_save_reads              = false // added directly to module in profiling.nf
    kraken2_save_readclassification = false // added directly to module in profiling.nf
    kraken2_save_minimizers         = false
    // Bracken
    run_bracken = false
    // centrifuge
    run_centrifuge             = false
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -10,7 +10,7 @@
            "type": "object",
            "fa_icon": "fas fa-terminal",
            "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir", "databases"],
+            "required": ["input", "databases", "outdir"],
            "properties": {
                "input": {
                    "type": "string",
@ -382,6 +382,17 @@
                    "description": "Turn on saving of Kraken2 per-read taxonomic assignment file",
                    "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on the specific taxonomic assignment that each read received.\n\n> Modifies tool parameter(s):\n> - kraken2: `--output`"
                },
                "kraken2_save_minimizers": {
                    "type": "boolean",
                    "description": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.",
                    "fa_icon": "fas fa-save",
                    "help_text": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.\n\nAdds `--report-minimizer-data` to the kraken2 command."
                },
                "run_bracken": {
                    "type": "boolean",
                    "description": "Post-process kraken2 reports with Bracken.",
                    "fa_icon": "fas fa-toggle-on"
                },
                "run_malt": {
                    "type": "boolean",
                    "fa_icon": "fas fa-toggle-on",
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@ -5,6 +5,8 @@
 include { MALT_RUN                              } from '../../modules/nf-core/malt/run/main'
 include { MEGAN_RMA2INFO as MEGAN_RMA2INFO_TSV  } from '../../modules/nf-core/megan/rma2info/main'
 include { KRAKEN2_KRAKEN2                       } from '../../modules/nf-core/kraken2/kraken2/main'
 include { KRAKEN2_STANDARD_REPORT                } from '../../modules/local/kraken2_standard_report'
 include { BRACKEN_BRACKEN                       } from '../../modules/nf-core/bracken/bracken/main'
 include { CENTRIFUGE_CENTRIFUGE                 } from '../../modules/nf-core/centrifuge/centrifuge/main'
 include { CENTRIFUGE_KREPORT                    } from '../../modules/nf-core/centrifuge/kreport/main'
 include { METAPHLAN3_METAPHLAN3                 } from '../../modules/nf-core/metaphlan3/metaphlan3/main'
@ -39,7 +41,7 @@ workflow PROFILING {
            .combine(databases)
            .branch {
                malt:    it[2]['tool'] == 'malt'
-                kraken2: it[2]['tool'] == 'kraken2'
+                kraken2: it[2]['tool'] == 'kraken2' || it[2]['tool'] == 'bracken' // to reuse the kraken module to produce the input data for bracken
                metaphlan3: it[2]['tool'] == 'metaphlan3'
                centrifuge: it[2]['tool'] == 'centrifuge'
                kaiju: it[2]['tool'] == 'kaiju'
@ -129,7 +131,42 @@ workflow PROFILING {
        ch_multiqc_files       = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report )
        ch_versions            = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() )
        ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment )
-        ch_raw_profiles        = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.report )
+        ch_raw_profiles        = ch_raw_profiles.mix(
            KRAKEN2_KRAKEN2.out.report
                // Set the tool to be strictly 'kraken2' instead of potentially 'bracken' for downstream use.
                // Will remain distinct from 'pure' Kraken2 results due to distinct database names in file names.
                .map { meta, report -> [meta + [tool: 'kraken2'], report]}
        )
    }
    // Bracken post-processing: only runs when both kraken2 and Bracken are enabled,
    // because it consumes the reports emitted by KRAKEN2_KRAKEN2.
    if ( params.run_kraken2 && params.run_bracken ) {
        // Remove files from 'pure' kraken2 runs, so only those aligned against Bracken & kraken2 database are used.
        def ch_kraken2_output = KRAKEN2_KRAKEN2.out.report
            .filter { meta, report -> meta['tool'] == 'bracken' }
        // If necessary, convert the eight column output to six column output.
        // NOTE(review): only the `report` output is used here; the `versions` output of
        // KRAKEN2_STANDARD_REPORT is not mixed into ch_versions in this block.
        if (params.kraken2_save_minimizers) {
            ch_kraken2_output = KRAKEN2_STANDARD_REPORT(ch_kraken2_output).report
        }
        // Extract the database name to combine by.
        ch_bracken_databases = databases
            .filter { meta, db -> meta['tool'] == 'bracken' }
            .map { meta, db -> [meta['db_name'], meta, db] }
        // Key the reports by database name, join them with their Bracken database, and
        // split the result into the two inputs of BRACKEN_BRACKEN (report tuple and database).
        ch_input_for_bracken = ch_kraken2_output
            .map { meta, report -> [meta['db_name'], meta, report] }
            .combine(ch_bracken_databases, by: 0)
            .multiMap { key, meta, report, db_meta, db ->
                // Map addition: entries of db_meta take precedence over equally named keys in meta.
                report: [meta + db_meta, report]
                db: db
            }
        BRACKEN_BRACKEN(ch_input_for_bracken.report, ch_input_for_bracken.db)
        // Record the tool version once and add the Bracken reports to the raw profiles.
        ch_versions     = ch_versions.mix(BRACKEN_BRACKEN.out.versions.first())
        ch_raw_profiles = ch_raw_profiles.mix(BRACKEN_BRACKEN.out.reports)
    }
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -41,6 +41,7 @@ if (params.longread_hostremoval_index      ) { ch_longread_reference_index  = fi
 if (params.diamond_save_reads              ) log.warn "[nf-core/taxprofiler] DIAMOND only allows output of a single format. As --diamond_save_reads supplied, only aligned reads in SAM format will be produced, no taxonomic profiles will be available."
 if (params.run_malt && params.run_krona && !params.krona_taxonomy_directory) log.warn "[nf-core/taxprofiler] Krona can only be run on MALT output if path to Krona taxonomy database supplied to --krona_taxonomy_directory. Krona will not be executed in this run for MALT."
 if (params.run_bracken && !params.run_kraken2) exit 1, 'ERROR: [nf-core/taxprofiler] You are attempting to run Bracken without running kraken2. This is not possible! Please set --run_kraken2 as well.'
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~