From 70dd7cade992d05b848de6ea41752a98e8546811 Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber" <midnighter@posteo.net>
Date: Thu, 30 Mar 2023 16:49:22 +0200
Subject: [PATCH 1/6] feat: introduce sample batches for krakenuniq

---
 nextflow.config                 |  1 +
 subworkflows/local/profiling.nf | 24 ++++++++++++++----------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 827d182..c155769 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -125,6 +125,7 @@ params {
     krakenuniq_ram_chunk_size           = '16G'
     krakenuniq_save_reads               = false // added directly to module in profiling.nf
     krakenuniq_save_readclassifications = false // added directly to module in profiling.nf
+    krakenuniq_batch_size               = 20
 
     // Bracken
     run_bracken = false
diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index d328a9c..760353a 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -315,16 +315,20 @@ workflow PROFILING {
 
     if ( params.run_krakenuniq ) {
         ch_input_for_krakenuniq =  ch_input_for_profiling.krakenuniq
-                                    .map {
-                                        meta, reads, db_meta, db ->
-                                            [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db]
-                                    }
-                                    .groupTuple(by: [0,2,3])
-                                    .multiMap {
-                                        single_meta, reads, db_meta, db ->
-                                            reads: [ single_meta + db_meta, reads.flatten() ]
-                                            db: db
-                                }
+            .map {
+                meta, reads, db_meta, db ->
+                    [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db]
+            }
+            .groupTuple(by: [0,2,3])
+            .flatMap { single_meta, reads, db_meta, db ->
+                def batches = reads.collate(params.krakenuniq_batch_size)
+                return batches.colect { batch -> [ single_meta + db_meta, batch.flatten(), db ]}
+            }
+            .multiMap {
+                meta, reads, db ->
+                    reads: [ meta, reads ]
+                    db: db
+            }
         // Hardcode to _always_ produce the report file (which is our basic output, and goes into)
         KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.krakenuniq_ram_chunk_size, params.krakenuniq_save_reads, true, params.krakenuniq_save_readclassifications )
         ch_multiqc_files       = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report )

From 17a9493e6dd501486afc438b2d94e1dd0fe726cf Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber" <midnighter@posteo.net>
Date: Thu, 30 Mar 2023 16:51:41 +0200
Subject: [PATCH 2/6] fix: remove typo

---
 subworkflows/local/profiling.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index 760353a..7f8b943 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -322,7 +322,7 @@ workflow PROFILING {
             .groupTuple(by: [0,2,3])
             .flatMap { single_meta, reads, db_meta, db ->
                 def batches = reads.collate(params.krakenuniq_batch_size)
-                return batches.colect { batch -> [ single_meta + db_meta, batch.flatten(), db ]}
+                return batches.collect { batch -> [ single_meta + db_meta, batch.flatten(), db ]}
             }
             .multiMap {
                 meta, reads, db ->

From 2725fe15a9afbc40cfecdb15235021f10b2794e3 Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber" <midnighter@posteo.net>
Date: Sat, 1 Apr 2023 15:18:46 +0200
Subject: [PATCH 3/6] docs: create a schema entry for the new parameter

---
 nextflow_schema.json | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 803e213..9c0d1b7 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -432,6 +432,13 @@
                     "description": "Turn on saving of KrakenUniq per-read taxonomic assignment file",
                     "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`"
                 },
+                "krakenuniq_batch_size": {
+                    "type": "integer",
+                    "default": 20,
+                    "fa_icon": "far fa-window-restore",
+                    "description": "Set the batch size for KrakenUniq",
+                    "help_text": "Specify the batch size for KrakenUniq. The reference database for KrakenUniq is loaded into memory once in a process and then used to classify many samples. This parameter determines for how many samples at a time."
+                },
                 "run_bracken": {
                     "type": "boolean",
                     "description": "Turn on Bracken (and the required Kraken2 prerequisite step).",

From 49086d9475826fa9edf3b021dec1927d465b14e0 Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber" <midnighter@posteo.net>
Date: Tue, 4 Apr 2023 12:40:27 +0200
Subject: [PATCH 4/6] docs: add changelog entry

[skip ci]
---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e91599..85a760d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
+- [#276](https://github.com/nf-core/taxprofiler/pull/276) Implemented batching in the KrakenUniq samples processing.
 - [#272](https://github.com/nf-core/taxprofiler/pull/272) - Add saving of final 'analysis-ready-reads' to dedicated directory. (❤️ to @alexhbnr for reporting, added by @jfy133)
 
 ### `Fixed`

From 08f06b9982f31aaf0e7ed5af6a79c8584170fb0d Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber" <midnighter@posteo.net>
Date: Sat, 22 Apr 2023 21:18:58 +0200
Subject: [PATCH 5/6] Apply suggestions from code review

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>
---
 CHANGELOG.md         | 2 +-
 nextflow_schema.json | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 85a760d..65b53cf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
-- [#276](https://github.com/nf-core/taxprofiler/pull/276) Implemented batching in the KrakenUniq samples processing.
+- [#276](https://github.com/nf-core/taxprofiler/pull/276) - Implemented batching of samples in KrakenUniq processing. (added by @Midnighter)
 - [#272](https://github.com/nf-core/taxprofiler/pull/272) - Add saving of final 'analysis-ready-reads' to dedicated directory. (❤️ to @alexhbnr for reporting, added by @jfy133)
 
 ### `Fixed`
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 9c0d1b7..23f10c8 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -436,8 +436,8 @@
                     "type": "integer",
                     "default": 20,
                     "fa_icon": "far fa-window-restore",
-                    "description": "Set the batch size for KrakenUniq",
-                    "help_text": "Specify the batch size for KrakenUniq. The reference database for KrakenUniq is loaded into memory once in a process and then used to classify many samples. This parameter determines for how many samples at a time."
+                    "description": "Specify the number samples for each KrakenUniq run",
+                    "help_text": "Specify the batch size for KrakenUniq. The reference database for KrakenUniq is loaded into memory once in a process and then used to classify many samples. When you have many samples, a single KrakenUniq run can be rather slow. Alternatively, we can split up KrakenUniq runs for a 'batch' of samples, allowing a balance between shared using a database for multiple samples, but also faster parallelised KrakenUniq runs. This parameter determines for how many samples at a time."
                 },
                 "run_bracken": {
                     "type": "boolean",

From 66a166dbdc8cfd71cde975ccc3828822c37d3ed8 Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber" <midnighter@posteo.net>
Date: Sat, 22 Apr 2023 21:23:27 +0200
Subject: [PATCH 6/6] docs: tweak parameter schema description

---
 nextflow_schema.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 23f10c8..60c22df 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -436,8 +436,8 @@
                     "type": "integer",
                     "default": 20,
                     "fa_icon": "far fa-window-restore",
-                    "description": "Specify the number samples for each KrakenUniq run",
-                    "help_text": "Specify the batch size for KrakenUniq. The reference database for KrakenUniq is loaded into memory once in a process and then used to classify many samples. When you have many samples, a single KrakenUniq run can be rather slow. Alternatively, we can split up KrakenUniq runs for a 'batch' of samples, allowing a balance between shared using a database for multiple samples, but also faster parallelised KrakenUniq runs. This parameter determines for how many samples at a time."
+                    "description": "Specify the number of samples for each KrakenUniq run",
+                    "help_text": "Specify the batch size for KrakenUniq. The reference database for KrakenUniq is loaded into memory once per Nextflow process and then used to classify many samples. When you have many samples, a single KrakenUniq run can be rather slow. Alternatively, KrakenUniq runs can be split into 'batches' of samples, striking a balance between sharing one loaded database across multiple samples and faster, parallelised KrakenUniq runs. This parameter determines how many samples are classified together in each batch."
                 },
                 "run_bracken": {
                     "type": "boolean",