diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e91599..65b53cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +- [#276](https://github.com/nf-core/taxprofiler/pull/276) Implemented batching in the KrakenUniq samples processing. (added by @Midnighter) - [#272](https://github.com/nf-core/taxprofiler/pull/272) - Add saving of final 'analysis-ready-reads' to dedicated directory. (❤️ to @alexhbnr for reporting, added by @jfy133) ### `Fixed` diff --git a/nextflow.config b/nextflow.config index 827d182..c155769 100644 --- a/nextflow.config +++ b/nextflow.config @@ -125,6 +125,7 @@ params { krakenuniq_ram_chunk_size = '16G' krakenuniq_save_reads = false // added directly to module in profiling.nf krakenuniq_save_readclassifications = false // added directly to module in profiling.nf + krakenuniq_batch_size = 20 // Bracken run_bracken = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 803e213..60c22df 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -432,6 +432,13 @@ "description": "Turn on saving of KrakenUniq per-read taxonomic assignment file", "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`" }, + "krakenuniq_batch_size": { + "type": "integer", + "default": 20, + "fa_icon": "far fa-window-restore", + "description": "Specify the number of samples for each KrakenUniq run", + "help_text": "Specify the batch size for KrakenUniq. The reference database for KrakenUniq is loaded into memory once per nextflow process and then used to classify many samples. When you have many samples, a single KrakenUniq run can be rather slow. Alternatively, we can split up KrakenUniq runs for a 'batch' of samples, allowing a balance between shared use of database for multiple samples, but also faster parallelised KrakenUniq runs. This parameter determines for how many samples at a time." + }, "run_bracken": { "type": "boolean", "description": "Turn on Bracken (and the required Kraken2 prerequisite step).", diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index d328a9c..7f8b943 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -315,16 +315,20 @@ workflow PROFILING { if ( params.run_krakenuniq ) { ch_input_for_krakenuniq = ch_input_for_profiling.krakenuniq - .map { - meta, reads, db_meta, db -> - [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db] - } - .groupTuple(by: [0,2,3]) - .multiMap { - single_meta, reads, db_meta, db -> - reads: [ single_meta + db_meta, reads.flatten() ] - db: db - } + .map { + meta, reads, db_meta, db -> + [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db] + } + .groupTuple(by: [0,2,3]) + .flatMap { single_meta, reads, db_meta, db -> + def batches = reads.collate(params.krakenuniq_batch_size) + return batches.collect { batch -> [ single_meta + db_meta, batch.flatten(), db ]} + } + .multiMap { + meta, reads, db -> + reads: [ meta, reads ] + db: db + } // Hardcode to _always_ produce the report file (which is our basic output, and goes into) KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.krakenuniq_ram_chunk_size, params.krakenuniq_save_reads, true, params.krakenuniq_save_readclassifications ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report )