From 18886c0cbca45a1959b5c433822ee3e8f5163957 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Tue, 7 Feb 2023 15:51:25 +0100
Subject: [PATCH 1/4] Start trying a different method with maps in profiling

---
 conf/modules.config             |  2 +-
 docs/usage.md                   | 16 +++++++++-------
 subworkflows/local/db_check.nf  | 11 +++++++----
 subworkflows/local/profiling.nf | 15 ++++++++++++++-
 4 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index cae5a45..47c4663 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -359,7 +359,7 @@ process {
     }

     withName: BRACKEN_BRACKEN {
-        errorStrategy = 'ignore'
+        ext.args = { "${meta.db_params}" }
         ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.bracken" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.bracken" }
         publishDir = [
             path: { "${params.outdir}/bracken/${meta.db_name}/" },
diff --git a/docs/usage.md b/docs/usage.md
index 366e0e5..21c983d 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -95,7 +95,7 @@ An example database sheet can look as follows, where 5 tools are being used, and
 tool,db_name,db_params,db_path
 malt,malt85,-id 85,///malt/testdb-malt/
 malt,malt95,-id 90,///malt/testdb-malt.tar.gz
-bracken,db1,,///bracken/testdb-bracken.tar.gz
+bracken,db1,;-r 150,///bracken/testdb-bracken.tar.gz
 kraken2,db2,--quick,///kraken2/testdb-kraken2.tar.gz
 krakenuniq,db3,,///krakenuniq/testdb-krakenuniq.tar.gz
 centrifuge,db1,,///centrifuge/minigut_cf.tar.gz
@@ -103,14 +103,16 @@ metaphlan3,db1,,///metaphlan3/metaphlan_database/
 motus,db_mOTU,,///motus/motus_database/
 ```

+Bracken **must** have a _semi-colon_ `;` list as in `db_params`, regardless of whether you have parameters or not. This is to allow to specify the Kraken2 parameters before, and Bracken parameters after the `;` as Bracken is a two step process. This is particularly important if you supply a Bracken database with a non-default read length parameter.
+
 Column specifications are as follows:

-| Column      | Description |
-| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `tool`      | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. Please note that `bracken` also implies running `kraken2` on the same database. |
-| `db_name`   | A unique name per tool for the particular database [required]. Please note that names need to be unique across both `kraken2` and `bracken` as well, even if re-using the same database. |
-| `db_params` | Any parameters of the given taxonomic classifier/profiler that you wish to specify that the taxonomic classifier/profiling tool should use when profiling against this specific database. Can be empty to use taxonomic classifier/profiler defaults. Must not be surrounded by quotes [required]. We generally do not recommend specifying parameters here that turn on/off saving of output files or specifying particular file extensions - this should be already addressed via pipeline parameters. |
-| `db_path`   | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required]. |
+| Column      | Description |
+| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tool`      | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. Please note that `bracken` also implies running `kraken2` on the same database. |
+| `db_name`   | A unique name per tool for the particular database [required]. Please note that names need to be unique across both `kraken2` and `bracken` as well, even if re-using the same database. |
+| `db_params` | Any parameters of the given taxonomic classifier/profiler that you wish to specify that the taxonomic classifier/profiling tool should use when profiling against this specific database. Can be empty to use taxonomic classifier/profiler defaults. Must not be surrounded by quotes [required]. We generally do not recommend specifying parameters here that turn on/off saving of output files or specifying particular file extensions - this should be already addressed via pipeline parameters. For Bracken databases, must at a minimum contain a `;` separating Kraken2 from Bracken parameters. |
+| `db_path`   | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required]. |

 > 💡 You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during classification/profiling.
diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 270f597..5a52704 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -13,8 +13,8 @@ workflow DB_CHECK {
     ch_dbs_for_untar = Channel.empty()
     ch_final_dbs = Channel.empty()

-    // special check to check _between_ rows, for which we must group rows together
-    // note: this will run in parallel to within-row validity, but we can assume this will run faster thus will fail first
+    // Special check to check _between_ rows, for which we must group rows together
+    // Note: this will run in parallel to within-row validity, but we can assume this will run faster thus will fail first
     Channel.fromPath(dbsheet)
         .splitCsv ( header:true, sep:',' )
         .map {[it.tool, it.db_name] }
@@ -25,13 +25,14 @@ workflow DB_CHECK {
             if ( unique_names.size() < db_name.size() ) exit 1, "[nf-core/taxprofiler] ERROR: Each database for a tool must have a unique name, duplicated detected. Tool: ${tool}, Database name: ${unique_names}"
         }

-    // normal checks for within-row validity, so can be moved to separate functions
+    // Normal checks for within-row validity, so can be moved to separate functions
     parsed_samplesheet = Channel.fromPath(dbsheet)
         .splitCsv ( header:true, sep:',' )
         .map {
             validate_db_rows(it)
             create_db_channels(it)
         }
+        .dump(tag: "blah")

     ch_dbs_for_untar = parsed_samplesheet
         .branch {
@@ -40,7 +41,7 @@ workflow DB_CHECK {
         }

     // Filter the channel to untar only those databases for tools that are selected to be run by the user.
-    ch_input_untar = ch_dbs_for_untar.untar.dump()
+    ch_input_untar = ch_dbs_for_untar.untar
         .filter {
             params["run_${it[0]['tool']}"]
         }
@@ -71,6 +72,8 @@ def validate_db_rows(LinkedHashMap row){
     if ( row.db_params.contains('"') ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
     if ( row.db_params.contains("'") ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"

+    if ( row.tool == 'bracken' && !row.db_params.contains(";") ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. Bracken requires a semi-colon. Error in: ${row}"
+
 }

 def create_db_channels(LinkedHashMap row) {
diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index 6b5347f..0cd6110 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -122,6 +122,15 @@ workflow PROFILING {
     if ( params.run_kraken2 ) {

         ch_input_for_kraken2 = ch_input_for_profiling.kraken2
+            .dump(tag: "ch_input_for_kraken2")
+            .map {
+                // Have to pick first element if using bracken,
+                // as db sheet for bracken must have ; sep list to
+                meta, reads, db_meta, db ->
+                    def db_meta_new = db_meta.clone()
+                    db_meta_new['db_params'] = db_meta['tool'] == 'bracken' ? db_meta_new['db_params'].split(;)[0] : db_meta_new['db_params']
+                    [ meta, reads, db_meta_new, db ]
+            }
             .multiMap {
                 it ->
                     reads: [ it[0] + it[2], it[1] ]
@@ -155,15 +164,19 @@ workflow PROFILING {
         ch_kraken2_output = KRAKEN2_STANDARD_REPORT(ch_kraken2_output).report
     }

+    // TODO UPDATE BRACKEN TO TAKE SECOND ELEMENT OF LIST
+    // NEED TO DO CHECKS WHEN ONE OR THE OTHER IS EMPTY AS WELL
+    // Extract the database name to combine by.
     ch_bracken_databases = databases
         .filter { meta, db -> meta['tool'] == 'bracken' }
         .map { meta, db -> [meta['db_name'], meta, db] }

-    // Extract the database name to combine by.
+    // Combine back with the reads
     ch_input_for_bracken = ch_kraken2_output
         .map { meta, report -> [meta['db_name'], meta, report] }
         .combine(ch_bracken_databases, by: 0)
+        .dump(tag: "ch_input_for_bracken")
         .multiMap {
             key, meta, report, db_meta, db ->
                 report: [meta + db_meta, report]
                 db: db

From fe3d42ffd78a1f6ae7c8c3ab334e1036ba32d0b0 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 8 Feb 2023 13:34:36 +0100
Subject: [PATCH 2/4] Make bracken db_params be a semi-colon separated list to
 allow kraken and bracken parameters

---
 subworkflows/local/db_check.nf  |  1 -
 subworkflows/local/profiling.nf | 52 ++++++++++++++++++++++++++-------
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 5a52704..20402f6 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -32,7 +32,6 @@ workflow DB_CHECK {
             validate_db_rows(it)
             create_db_channels(it)
         }
-        .dump(tag: "blah")

     ch_dbs_for_untar = parsed_samplesheet
         .branch {
diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index 0cd6110..16caa8f 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -120,17 +120,28 @@ workflow PROFILING {
     }

     if ( params.run_kraken2 ) {
-
-        ch_input_for_kraken2 = ch_input_for_profiling.kraken2
-            .dump(tag: "ch_input_for_kraken2")
+        // Have to pick first element of db_params if using bracken,
+        // as db sheet for bracken must have ; sep list to
+        // distinguish between kraken and bracken parameters
+        ch_input_for_kraken2 = ch_input_for_profiling.kraken2
+            .dump(tag: "ch_input_for_kraken2_b4")
             .map {
-                // Have to pick first element if using bracken,
-                // as db sheet for bracken must have ; sep list to
                 meta, reads, db_meta, db ->
                     def db_meta_new = db_meta.clone()
-                    db_meta_new['db_params'] = db_meta['tool'] == 'bracken' ? db_meta_new['db_params'].split(;)[0] : db_meta_new['db_params']
+
+                    // Only take second element if one exists
+                    def parsed_params = db_meta_new['db_params'].split(";")
+                    if ( parsed_params.size() == 2 ) {
+                        db_meta_new['db_params'] = parsed_params[0]
+                    } else if ( parsed_params.size() == 0 ) {
+                        db_meta_new['db_params'] = ""
+                    } else {
+                        db_meta_new['db_params'] = parsed_params[0]
+                    }
+
                     [ meta, reads, db_meta_new, db ]
             }
+            .dump(tag: "ch_input_for_kraken2_after")
             .multiMap {
                 it ->
                     reads: [ it[0] + it[2], it[1] ]
@@ -164,9 +175,6 @@ workflow PROFILING {
         ch_kraken2_output = KRAKEN2_STANDARD_REPORT(ch_kraken2_output).report
     }

-    // TODO UPDATE BRACKEN TO TAKE SECOND ELEMENT OF LIST
-    // NEED TO DO CHECKS WHEN ONE OR THE OTHER IS EMPTY AS WELL
-    // Extract the database name to combine by.
     ch_bracken_databases = databases
         .filter { meta, db -> meta['tool'] == 'bracken' }
         .map { meta, db -> [meta['db_name'], meta, db] }
@@ -176,7 +184,31 @@ workflow PROFILING {
     ch_input_for_bracken = ch_kraken2_output
         .map { meta, report -> [meta['db_name'], meta, report] }
         .combine(ch_bracken_databases, by: 0)
-        .dump(tag: "ch_input_for_bracken")
+        .dump(tag: "ch_input_for_bracken_b4")
+        .map {
+
+            key, meta, reads, db_meta, db ->
+                def db_meta_new = db_meta.clone()
+
+                // Have to pick second element if using bracken, as first element
+                // contains kraken parameters
+                if ( db_meta['tool'] == 'bracken' ) {
+
+                    // Only take second element if one exists
+                    def parsed_params = db_meta_new['db_params'].split(";")
+                    if ( parsed_params.size() == 2 ) {
+                        db_meta_new['db_params'] = parsed_params[1]
+                    } else {
+                        db_meta_new['db_params'] = ""
+                    }
+
+                } else {
+                    db_meta_new['db_params']
+                }
+
+                [ key, meta, reads, db_meta_new, db ]
+        }
+        .dump(tag: "ch_input_for_bracken_after")
         .multiMap {
             key, meta, report, db_meta, db ->
                 report: [meta + db_meta, report]
                 db: db

From b3837cce5b4397ae0e0f17a296ff1d31d29285bd Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 8 Feb 2023 15:44:37 +0100
Subject: [PATCH 3/4] Relax strictness so can supply empty db_params but block
 if only single parameter without ;

---
 docs/usage.md                  | 2 +-
 subworkflows/local/db_check.nf | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/usage.md b/docs/usage.md
index 21c983d..53dc8c3 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -103,7 +103,7 @@ metaphlan3,db1,,///metaphlan3/metaphlan_database/
 motus,db_mOTU,,///motus/motus_database/
 ```

-Bracken **must** have a _semi-colon_ `;` list as in `db_params`, regardless of whether you have parameters or not. This is to allow to specify the Kraken2 parameters before, and Bracken parameters after the `;` as Bracken is a two step process. This is particularly important if you supply a Bracken database with a non-default read length parameter.
+For Bracken, if you wish to supply amy parameters to either the Kraken or Bracken step you **must** have a _semi-colon_ `;` list as in `db_params`. This is to allow to specify the Kraken2 parameters before, and Bracken parameters after the `;` as Bracken is a two step process. This is particularly important if you supply a Bracken database with a non-default read length parameter. If you do not have any parameters to specify, you can leave this as empty.

 Column specifications are as follows:

diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 20402f6..35240ed 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -71,7 +71,8 @@ def validate_db_rows(LinkedHashMap row){
     if ( row.db_params.contains('"') ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
     if ( row.db_params.contains("'") ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"

-    if ( row.tool == 'bracken' && !row.db_params.contains(";") ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. Bracken requires a semi-colon. Error in: ${row}"
+    // check if any form of bracken params, that it must have `;`
+    if ( row.tool == 'bracken' && row.db_params && !row.db_params.contains(";") ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. Bracken requires a semi-colon if passing parameter. Error in: ${row}"

 }

From 70eb84b8d5d310c666d0de2fb5f9db401571cc39 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Thu, 9 Feb 2023 08:01:52 +0100
Subject: [PATCH 4/4] Apply suggestions from code review

Co-authored-by: Moritz E. Beber
---
 conf/modules.config             | 1 +
 docs/usage.md                   | 2 +-
 subworkflows/local/profiling.nf | 3 ---
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 47c4663..b0a6e03 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -359,6 +359,7 @@ process {
     }

     withName: BRACKEN_BRACKEN {
+        errorStrategy = 'ignore'
         ext.args = { "${meta.db_params}" }
         ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.bracken" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.bracken" }
         publishDir = [
diff --git a/docs/usage.md b/docs/usage.md
index 53dc8c3..523d92a 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -103,7 +103,7 @@ metaphlan3,db1,,///metaphlan3/metaphlan_database/
 motus,db_mOTU,,///motus/motus_database/
 ```

-For Bracken, if you wish to supply amy parameters to either the Kraken or Bracken step you **must** have a _semi-colon_ `;` list as in `db_params`. This is to allow to specify the Kraken2 parameters before, and Bracken parameters after the `;` as Bracken is a two step process. This is particularly important if you supply a Bracken database with a non-default read length parameter. If you do not have any parameters to specify, you can leave this as empty.
+For Bracken, if you wish to supply any parameters to either the Kraken or Bracken step you **must** have a _semi-colon_ `;` list as in `db_params`. This is to allow to specify the Kraken2 parameters before, and Bracken parameters after the `;` as Bracken is a two step process. This is particularly important if you supply a Bracken database with a non-default read length parameter. If you do not have any parameters to specify, you can leave this as empty.

 Column specifications are as follows:

diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index 16caa8f..cf5a9b8 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -141,7 +141,6 @@ workflow PROFILING {

                     [ meta, reads, db_meta_new, db ]
             }
-            .dump(tag: "ch_input_for_kraken2_after")
             .multiMap {
                 it ->
                     reads: [ it[0] + it[2], it[1] ]
@@ -184,7 +183,6 @@ workflow PROFILING {
     ch_input_for_bracken = ch_kraken2_output
         .map { meta, report -> [meta['db_name'], meta, report] }
         .combine(ch_bracken_databases, by: 0)
-        .dump(tag: "ch_input_for_bracken_b4")
         .map {
@@ -208,7 +206,6 @@ workflow PROFILING {

                [ key, meta, reads, db_meta_new, db ]
         }
-        .dump(tag: "ch_input_for_bracken_after")
         .multiMap {
             key, meta, report, db_meta, db ->
                 report: [meta + db_meta, report]
                 db: db
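
For reference, below is a minimal Groovy sketch (illustrative only, not part of the patches above) of the semi-colon `db_params` convention these patches introduce: everything before the `;` is handed to Kraken2 and everything after it to Bracken, with a missing side falling back to no extra parameters. The example values follow the database sheet in `docs/usage.md`; the `splitDbParams` helper name is hypothetical.

```groovy
// Illustrative sketch: split a semi-colon separated db_params value into the
// Kraken2 part (before the ';') and the Bracken part (after the ';'),
// approximating the split(";") handling added to subworkflows/local/profiling.nf.
def splitDbParams = { String dbParams ->
    def parsed = dbParams.split(";")
    [
        kraken2: parsed.size() > 0 ? parsed[0] : "",   // first element, may be empty
        bracken: parsed.size() == 2 ? parsed[1] : ""   // only present when a ';' was supplied
    ]
}

// e.g. the `bracken,db1,;-r 150,...` row from the example database sheet:
assert splitDbParams(";-r 150")        == [kraken2: "",        bracken: "-r 150"]
assert splitDbParams("--quick;-r 150") == [kraken2: "--quick", bracken: "-r 150"]
assert splitDbParams("--quick")        == [kraken2: "--quick", bracken: ""]
assert splitDbParams("")               == [kraken2: "",        bracken: ""]
```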