1
0
Fork 0
mirror of https://github.com/MillironX/taxprofiler.git synced 2024-12-22 04:18:17 +00:00

Merge pull request #276 from nf-core/feat-batch

feat: introduce sample batches for krakenuniq
This commit is contained in:
Moritz E. Beber 2023-04-23 11:17:29 +02:00 committed by GitHub
commit c4ad03ff9f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 23 additions and 10 deletions

View file

@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Added`
- [#276](https://github.com/nf-core/taxprofiler/pull/276) Implemented batching in the KrakenUniq samples processing. (added by @Midnighter)
- [#272](https://github.com/nf-core/taxprofiler/pull/272) - Add saving of final 'analysis-ready-reads' to dedicated directory. (❤️ to @alexhbnr for reporting, added by @jfy133)
### `Fixed`

View file

@ -125,6 +125,7 @@ params {
krakenuniq_ram_chunk_size = '16G'
krakenuniq_save_reads = false // added directly to module in profiling.nf
krakenuniq_save_readclassifications = false // added directly to module in profiling.nf
krakenuniq_batch_size = 20
// Bracken
run_bracken = false

View file

@ -432,6 +432,13 @@
"description": "Turn on saving of KrakenUniq per-read taxonomic assignment file",
"help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`"
},
"krakenuniq_batch_size": {
"type": "integer",
"default": 20,
"fa_icon": "far fa-window-restore",
"description": "Specify the number of samples for each KrakenUniq run",
"help_text": "Specify the batch size for KrakenUniq. The reference database for KrakenUniq is loaded into memory once per nextflow process and then used to classify many samples. When you have many samples, a single KrakenUniq run can be rather slow. Alternatively, we can split up KrakenUniq runs for a 'batch' of samples, allowing a balance between shared use of database for multiple samples, but also faster parallelised KrakenUniq runs. This parameter determines for how many samples at a time."
},
"run_bracken": {
"type": "boolean",
"description": "Turn on Bracken (and the required Kraken2 prerequisite step).",

View file

@ -315,16 +315,20 @@ workflow PROFILING {
if ( params.run_krakenuniq ) {
ch_input_for_krakenuniq = ch_input_for_profiling.krakenuniq
.map {
meta, reads, db_meta, db ->
[[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db]
}
.groupTuple(by: [0,2,3])
.multiMap {
single_meta, reads, db_meta, db ->
reads: [ single_meta + db_meta, reads.flatten() ]
db: db
}
.map {
meta, reads, db_meta, db ->
[[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db]
}
.groupTuple(by: [0,2,3])
.flatMap { single_meta, reads, db_meta, db ->
def batches = reads.collate(params.krakenuniq_batch_size)
return batches.collect { batch -> [ single_meta + db_meta, batch.flatten(), db ]}
}
.multiMap {
meta, reads, db ->
reads: [ meta, reads ]
db: db
}
// Hardcode to _always_ produce the report file (which is our basic output, and goes into)
KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.krakenuniq_ram_chunk_size, params.krakenuniq_save_reads, true, params.krakenuniq_save_readclassifications )
ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report )