Merge pull request #237 from nf-core/bracken-params

Support Kraken and Bracken db_params in database sheet
2024-11-24 21:29:56 +00:00 · 2023-02-09 10:59:27 +01:00 · 2023-02-09 10:59:27 +01:00 · 45ac6fedc2
commit 45ac6fedc2
parent 31d05d1994 70eb84b8d5
4 changed files with 61 additions and 13 deletions
--- a/conf/modules.config
+++ b/conf/modules.config
@ -355,6 +355,7 @@ process {

    withName: BRACKEN_BRACKEN {
        errorStrategy = 'ignore'
+        ext.args = { "${meta.db_params}" }
        ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.bracken" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.bracken" }
        publishDir = [
            path: { "${params.outdir}/bracken/${meta.db_name}/" },
--- a/docs/usage.md
+++ b/docs/usage.md
@ -95,7 +95,7 @@ An example database sheet can look as follows, where 5 tools are being used, and
 tool,db_name,db_params,db_path
 malt,malt85,-id 85,/<path>/<to>/malt/testdb-malt/
 malt,malt95,-id 90,/<path>/<to>/malt/testdb-malt.tar.gz
-bracken,db1,,/<path>/<to>/bracken/testdb-bracken.tar.gz
+bracken,db1,;-r 150,/<path>/<to>/bracken/testdb-bracken.tar.gz
 kraken2,db2,--quick,/<path>/<to>/kraken2/testdb-kraken2.tar.gz
 krakenuniq,db3,,/<path>/<to>/krakenuniq/testdb-krakenuniq.tar.gz
 centrifuge,db1,,/<path>/<to>/centrifuge/minigut_cf.tar.gz
@ -103,14 +103,16 @@ metaphlan3,db1,,/<path>/<to>/metaphlan3/metaphlan_database/
 motus,db_mOTU,,/<path>/<to>/motus/motus_database/
 ```

+For Bracken, if you wish to supply any parameters to either the Kraken2 or the Bracken step, you **must** provide them in `db_params` as a _semicolon_ (`;`) separated list. This allows you to specify the Kraken2 parameters before the `;` and the Bracken parameters after it, as Bracken is a two-step process. This is particularly important if you supply a Bracken database with a non-default read length parameter. If you do not have any parameters to specify, you can leave this column empty.
+
 Column specifications are as follows:

-| Column      | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tool`      | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. Please note that `bracken` also implies running `kraken2` on the same database.                                                                                                                                                                                                                                                                                                           |
-| `db_name`   | A unique name per tool for the particular database [required]. Please note that names need to be unique across both `kraken2` and `bracken` as well, even if re-using the same database.                                                                                                                                                                                                                                                                                                                 |
-| `db_params` | Any parameters of the given taxonomic classifier/profiler that you wish to specify that the taxonomic classifier/profiling tool should use when profiling against this specific database. Can be empty to use taxonomic classifier/profiler defaults. Must not be surrounded by quotes [required]. We generally do not recommend specifying parameters here that turn on/off saving of output files or specifying particular file extensions - this should be already addressed via pipeline parameters. |
-| `db_path`   | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required].                                                                                                                                                                                                                                                                    |
+| Column      | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tool`      | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. Please note that `bracken` also implies running `kraken2` on the same database.                                                                                                                                                                                                                                                                                                                                                                                                              |
+| `db_name`   | A unique name per tool for the particular database [required]. Please note that names need to be unique across both `kraken2` and `bracken` as well, even if re-using the same database.                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| `db_params` | Any parameters of the given taxonomic classifier/profiler that you wish to specify that the taxonomic classifier/profiling tool should use when profiling against this specific database. Can be empty to use taxonomic classifier/profiler defaults. Must not be surrounded by quotes [required]. We generally do not recommend specifying parameters here that turn on/off saving of output files or specifying particular file extensions - this should be already addressed via pipeline parameters. For Bracken databases, must at a minimum contain a `;` separating Kraken2 from Bracken parameters. |
+| `db_path`   | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required].                                                                                                                                                                                                                                                                                                                                                                       |

 > 💡 You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during classification/profiling.

--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@ -13,8 +13,8 @@ workflow DB_CHECK {
    ch_dbs_for_untar = Channel.empty()
    ch_final_dbs = Channel.empty()

-    // special check to check _between_ rows, for which we must group rows together
-    // note: this will run in parallel to within-row validity, but we can assume this will run faster thus will fail first
+    // Special check to check _between_ rows, for which we must group rows together
+    // Note: this will run in parallel to within-row validity, but we can assume this will run faster thus will fail first
    Channel.fromPath(dbsheet)
            .splitCsv ( header:true, sep:',' )
            .map {[it.tool, it.db_name] }
@ -25,7 +25,7 @@ workflow DB_CHECK {
                    if ( unique_names.size() < db_name.size() ) exit 1, "[nf-core/taxprofiler] ERROR: Each database for a tool must have a unique name, duplicated detected. Tool: ${tool}, Database name: ${unique_names}"
            }

-    // normal checks for within-row validity, so can be moved to separate functions
+    // Normal checks for within-row validity, so can be moved to separate functions
    parsed_samplesheet = Channel.fromPath(dbsheet)
        .splitCsv ( header:true, sep:',' )
        .map {
@ -40,7 +40,7 @@ workflow DB_CHECK {
        }

    // Filter the channel to untar only those databases for tools that are selected to be run by the user.
-    ch_input_untar = ch_dbs_for_untar.untar.dump()
+    ch_input_untar = ch_dbs_for_untar.untar
                        .filter {
                          params["run_${it[0]['tool']}"]
                        }
@ -71,6 +71,9 @@ def validate_db_rows(LinkedHashMap row){
        if ( row.db_params.contains('"') ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
        if ( row.db_params.contains("'") ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"

+        // If any db_params are supplied for a bracken database, they must contain a `;`
+        if ( row.tool == 'bracken' && row.db_params && !row.db_params.contains(";") )  exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. Bracken requires a semi-colon if passing parameter. Error in: ${row}"
+
 }

 def create_db_channels(LinkedHashMap row) {
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@ -120,8 +120,27 @@ workflow PROFILING {
    }

    if ( params.run_kraken2 ) {
+        // Have to pick first element of db_params if using bracken,
+        // as db sheet for bracken must have ; sep list to
+        // distinguish between kraken and bracken parameters
+        ch_input_for_kraken2 = ch_input_for_profiling.kraken2
+                                .dump(tag: "ch_input_for_kraken2_b4")
+                                .map {
+                                    meta, reads, db_meta, db ->
+                                        def db_meta_new = db_meta.clone()

-        ch_input_for_kraken2 =  ch_input_for_profiling.kraken2
+                                        // Only take first element (the Kraken2 parameters) if one exists
+                                        def parsed_params = db_meta_new['db_params'].split(";")
+                                        if ( parsed_params.size() == 2 ) {
+                                            db_meta_new['db_params'] = parsed_params[0]
+                                        } else if ( parsed_params.size() == 0 ) {
+                                            db_meta_new['db_params'] = ""
+                                        } else {
+                                            db_meta_new['db_params'] = parsed_params[0]
+                                        }
+
+                                    [ meta, reads, db_meta_new, db ]
+                                }
                                .multiMap {
                                    it ->
                                        reads: [ it[0] + it[2], it[1] ]
@ -160,10 +179,33 @@ workflow PROFILING {
            .filter { meta, db -> meta['tool'] == 'bracken' }
            .map { meta, db -> [meta['db_name'], meta, db] }

-        // Extract the database name to combine by.
+        // Combine the Kraken2 reports with the Bracken databases, keyed by database name
        ch_input_for_bracken = ch_kraken2_output
            .map { meta, report -> [meta['db_name'], meta, report] }
            .combine(ch_bracken_databases, by: 0)
+            .map {
+
+                key, meta, reads, db_meta, db ->
+                    def db_meta_new = db_meta.clone()
+
+                    // Have to pick second element if using bracken, as first element
+                    // contains kraken parameters
+                    if ( db_meta['tool'] == 'bracken' ) {
+
+                        // Only take second element if one exists
+                        def parsed_params = db_meta_new['db_params'].split(";")
+                        if ( parsed_params.size() == 2 ) {
+                            db_meta_new['db_params'] =  parsed_params[1]
+                        } else {
+                            db_meta_new['db_params'] = ""
+                        }
+
+                    } else {
+                        db_meta_new['db_params']
+                    }
+
+                [ key, meta, reads, db_meta_new, db ]
+            }
            .multiMap { key, meta, report, db_meta, db ->
                report: [meta + db_meta, report]
                db: db