Merge pull request #276 from nf-core/feat-batch

feat: introduce sample batches for krakenuniq
2024-12-22 04:18:17 +00:00 · 2023-04-23 11:17:29 +02:00 · 2023-04-23 11:17:29 +02:00 · c4ad03ff9f
commit c4ad03ff9f
parent b299a25bc7 66a166dbdc
4 changed files with 23 additions and 10 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### `Added`

+- [#276](https://github.com/nf-core/taxprofiler/pull/276) Implemented batching in the KrakenUniq samples processing. (added by @Midnighter)
 - [#272](https://github.com/nf-core/taxprofiler/pull/272) - Add saving of final 'analysis-ready-reads' to dedicated directory. (❤️ to @alexhbnr for reporting, added by @jfy133)

 ### `Fixed`
--- a/nextflow.config
+++ b/nextflow.config
@ -125,6 +125,7 @@ params {
    krakenuniq_ram_chunk_size           = '16G'
    krakenuniq_save_reads               = false // added directly to module in profiling.nf
    krakenuniq_save_readclassifications = false // added directly to module in profiling.nf
+    krakenuniq_batch_size               = 20

    // Bracken
    run_bracken = false
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -432,6 +432,13 @@
                    "description": "Turn on saving of KrakenUniq per-read taxonomic assignment file",
                    "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`"
                },
+                "krakenuniq_batch_size": {
+                    "type": "integer",
+                    "default": 20,
+                    "fa_icon": "far fa-window-restore",
+                    "description": "Specify the number of samples for each KrakenUniq run",
+                    "help_text": "Specify the batch size for KrakenUniq. The reference database for KrakenUniq is loaded into memory once per nextflow process and then used to classify many samples. When you have many samples, a single KrakenUniq run can be rather slow. Alternatively, we can split up KrakenUniq runs for a 'batch' of samples, allowing a balance between shared use of database for multiple samples, but also faster parallelised KrakenUniq runs. This parameter determines for how many samples at a time."
+                },
                "run_bracken": {
                    "type": "boolean",
                    "description": "Turn on Bracken (and the required Kraken2 prerequisite step).",
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@ -315,16 +315,20 @@ workflow PROFILING {

    if ( params.run_krakenuniq ) {
        ch_input_for_krakenuniq =  ch_input_for_profiling.krakenuniq
-                                    .map {
-                                        meta, reads, db_meta, db ->
-                                            [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db]
-                                    }
-                                    .groupTuple(by: [0,2,3])
-                                    .multiMap {
-                                        single_meta, reads, db_meta, db ->
-                                            reads: [ single_meta + db_meta, reads.flatten() ]
-                                            db: db
-                                }
+            .map {
+                meta, reads, db_meta, db ->
+                    [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db]
+            }
+            .groupTuple(by: [0,2,3])
+            .flatMap { single_meta, reads, db_meta, db ->
+                def batches = reads.collate(params.krakenuniq_batch_size)
+                return batches.collect { batch -> [ single_meta + db_meta, batch.flatten(), db ]}
+            }
+            .multiMap {
+                meta, reads, db ->
+                    reads: [ meta, reads ]
+                    db: db
+            }
        // Hardcode to _always_ produce the report file (which is our basic output, and goes into)
        KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.krakenuniq_ram_chunk_size, params.krakenuniq_save_reads, true, params.krakenuniq_save_readclassifications )
        ch_multiqc_files       = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report )