mirror of
https://github.com/MillironX/taxprofiler.git
synced 2024-12-22 10:28:16 +00:00
Start trying a different method with maps in profiling
This commit is contained in:
parent
de5bdc36c5
commit
18886c0cbc
4 changed files with 31 additions and 13 deletions
|
@ -359,7 +359,7 @@ process {
|
|||
}
|
||||
|
||||
withName: BRACKEN_BRACKEN {
|
||||
errorStrategy = 'ignore'
|
||||
ext.args = { "${meta.db_params}" }
|
||||
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.bracken" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.bracken" }
|
||||
publishDir = [
|
||||
path: { "${params.outdir}/bracken/${meta.db_name}/" },
|
||||
|
|
|
@ -95,7 +95,7 @@ An example database sheet can look as follows, where 5 tools are being used, and
|
|||
tool,db_name,db_params,db_path
|
||||
malt,malt85,-id 85,/<path>/<to>/malt/testdb-malt/
|
||||
malt,malt95,-id 90,/<path>/<to>/malt/testdb-malt.tar.gz
|
||||
bracken,db1,,/<path>/<to>/bracken/testdb-bracken.tar.gz
|
||||
bracken,db1,;-r 150,/<path>/<to>/bracken/testdb-bracken.tar.gz
|
||||
kraken2,db2,--quick,/<path>/<to>/kraken2/testdb-kraken2.tar.gz
|
||||
krakenuniq,db3,,/<path>/<to>/krakenuniq/testdb-krakenuniq.tar.gz
|
||||
centrifuge,db1,,/<path>/<to>/centrifuge/minigut_cf.tar.gz
|
||||
|
@ -103,14 +103,16 @@ metaphlan3,db1,,/<path>/<to>/metaphlan3/metaphlan_database/
|
|||
motus,db_mOTU,,/<path>/<to>/motus/motus_database/
|
||||
```
|
||||
|
||||
For Bracken, `db_params` **must** contain a _semi-colon_ `;`-separated list, regardless of whether you specify any parameters or not. This allows you to specify the Kraken2 parameters before the `;` and the Bracken parameters after it, as Bracken is a two-step process. This is particularly important if you supply a Bracken database with a non-default read length parameter.
|
||||
|
||||
Column specifications are as follows:
|
||||
|
||||
| Column | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tool` | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. Please note that `bracken` also implies running `kraken2` on the same database. |
|
||||
| `db_name` | A unique name per tool for the particular database [required]. Please note that names need to be unique across both `kraken2` and `bracken` as well, even if re-using the same database. |
|
||||
| `db_params` | Any parameters of the given taxonomic classifier/profiler that you wish to specify that the taxonomic classifier/profiling tool should use when profiling against this specific database. Can be empty to use taxonomic classifier/profiler defaults. Must not be surrounded by quotes [required]. We generally do not recommend specifying parameters here that turn on/off saving of output files or specifying particular file extensions - this should be already addressed via pipeline parameters. |
|
||||
| `db_path` | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required]. |
|
||||
| Column | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tool` | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. Please note that `bracken` also implies running `kraken2` on the same database. |
|
||||
| `db_name` | A unique name per tool for the particular database [required]. Please note that names need to be unique across both `kraken2` and `bracken` as well, even if re-using the same database. |
|
||||
| `db_params` | Any parameters of the given taxonomic classifier/profiler that you wish to specify that the taxonomic classifier/profiling tool should use when profiling against this specific database. Can be empty to use taxonomic classifier/profiler defaults. Must not be surrounded by quotes [required]. We generally do not recommend specifying parameters here that turn on/off saving of output files or specifying particular file extensions - this should be already addressed via pipeline parameters. For Bracken databases, must at a minimum contain a `;` separating Kraken2 from Bracken parameters. |
|
||||
| `db_path` | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required]. |
|
||||
|
||||
> 💡 You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during classification/profiling.
|
||||
|
||||
|
|
|
@ -13,8 +13,8 @@ workflow DB_CHECK {
|
|||
ch_dbs_for_untar = Channel.empty()
|
||||
ch_final_dbs = Channel.empty()
|
||||
|
||||
// special check to check _between_ rows, for which we must group rows together
|
||||
// note: this will run in parallel to within-row validity, but we can assume this will run faster thus will fail first
|
||||
// Special check to check _between_ rows, for which we must group rows together
|
||||
// Note: this will run in parallel to within-row validity, but we can assume this will run faster thus will fail first
|
||||
Channel.fromPath(dbsheet)
|
||||
.splitCsv ( header:true, sep:',' )
|
||||
.map {[it.tool, it.db_name] }
|
||||
|
@ -25,13 +25,14 @@ workflow DB_CHECK {
|
|||
if ( unique_names.size() < db_name.size() ) exit 1, "[nf-core/taxprofiler] ERROR: Each database for a tool must have a unique name, duplicated detected. Tool: ${tool}, Database name: ${unique_names}"
|
||||
}
|
||||
|
||||
// normal checks for within-row validity, so can be moved to separate functions
|
||||
// Normal checks for within-row validity, so can be moved to separate functions
|
||||
parsed_samplesheet = Channel.fromPath(dbsheet)
|
||||
.splitCsv ( header:true, sep:',' )
|
||||
.map {
|
||||
validate_db_rows(it)
|
||||
create_db_channels(it)
|
||||
}
|
||||
.dump(tag: "blah")
|
||||
|
||||
ch_dbs_for_untar = parsed_samplesheet
|
||||
.branch {
|
||||
|
@ -40,7 +41,7 @@ workflow DB_CHECK {
|
|||
}
|
||||
|
||||
// Filter the channel to untar only those databases for tools that are selected to be run by the user.
|
||||
ch_input_untar = ch_dbs_for_untar.untar.dump()
|
||||
ch_input_untar = ch_dbs_for_untar.untar
|
||||
.filter {
|
||||
params["run_${it[0]['tool']}"]
|
||||
}
|
||||
|
@ -71,6 +72,8 @@ def validate_db_rows(LinkedHashMap row){
|
|||
if ( row.db_params.contains('"') ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
|
||||
if ( row.db_params.contains("'") ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
|
||||
|
||||
if ( row.tool == 'bracken' && !row.db_params.contains(";") ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. Bracken requires a semi-colon. Error in: ${row}"
|
||||
|
||||
}
|
||||
|
||||
def create_db_channels(LinkedHashMap row) {
|
||||
|
|
|
@ -122,6 +122,15 @@ workflow PROFILING {
|
|||
if ( params.run_kraken2 ) {
|
||||
|
||||
ch_input_for_kraken2 = ch_input_for_profiling.kraken2
|
||||
.dump(tag: "ch_input_for_kraken2")
|
||||
.map {
|
||||
// Have to pick first element if using bracken,
|
||||
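// as the db sheet for bracken must have a ';'-separated list to
// keep the Kraken2 parameters (before the ';') apart from the Bracken ones (after it)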
|
||||
meta, reads, db_meta, db ->
|
||||
def db_meta_new = db_meta.clone()
|
||||
// For bracken rows, Kraken2 only receives the part of db_params before the ';'.
// The delimiter must be a quoted string (a bare ; does not parse), and limit -1 keeps
// empty fields so that a lone ';' yields '' instead of an empty array that [0] would fail on.
db_meta_new['db_params'] = db_meta['tool'] == 'bracken' ? db_meta_new['db_params'].split(';', -1)[0] : db_meta_new['db_params']
|
||||
[ meta, reads, db_meta_new, db ]
|
||||
}
|
||||
.multiMap {
|
||||
it ->
|
||||
reads: [ it[0] + it[2], it[1] ]
|
||||
|
@ -155,15 +164,19 @@ workflow PROFILING {
|
|||
ch_kraken2_output = KRAKEN2_STANDARD_REPORT(ch_kraken2_output).report
|
||||
}
|
||||
|
||||
// TODO UPDATE BRACKEN TO TAKE SECOND ELEMENT OF LIST
|
||||
// NEED TO DO CHECKS WHEN ONE OR THE OTHER IS EMPTY AS WELL
|
||||
|
||||
// Extract the database name to combine by.
|
||||
ch_bracken_databases = databases
|
||||
.filter { meta, db -> meta['tool'] == 'bracken' }
|
||||
.map { meta, db -> [meta['db_name'], meta, db] }
|
||||
|
||||
// Extract the database name to combine by.
|
||||
// Combine back with the reads
|
||||
ch_input_for_bracken = ch_kraken2_output
|
||||
.map { meta, report -> [meta['db_name'], meta, report] }
|
||||
.combine(ch_bracken_databases, by: 0)
|
||||
.dump(tag: "ch_input_for_bracken")
|
||||
.multiMap { key, meta, report, db_meta, db ->
|
||||
report: [meta + db_meta, report]
|
||||
db: db
|
||||
|
|
Loading…
Reference in a new issue