From 4e93abc7c06b35eae584af4338c8aeaa60b8fcf8 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Fri, 25 Mar 2022 14:58:06 +0100
Subject: [PATCH 01/21] Add read improved read preprocessing

---
 conf/modules.config                           | 21 +++++-
 nextflow.config                               | 12 +++-
 nextflow_schema.json                          |  2 +-
 subworkflows/local/shortread_fastp.nf         | 65 +++++++++++++++++++
 subworkflows/local/shortread_preprocessing.nf | 55 ++++------------
 workflows/taxprofiler.nf                      |  2 +
 6 files changed, 107 insertions(+), 50 deletions(-)
 create mode 100644 subworkflows/local/shortread_fastp.nf

diff --git a/conf/modules.config b/conf/modules.config
index 29a5135..36bc626 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -52,12 +52,27 @@ process {
         ]
     }
 
+    withName: FASTQC_PROCESSED {
+        ext.args = '--quiet'
+        ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
+        publishDir = [
+            path: { "${params.outdir}/fastqc/processed" },
+            mode: 'copy',
+            pattern: '*.html'
+        ]
+    }
+
     withName: FASTP {
         ext.prefix = { "${meta.id}_${meta.run_accession}" }
-        // TODO also include option to NOT merge
         ext.args   = [
-            { ${meta.single_end} } == 0 ? "-m" : '',
-            params.shortread_excludeunmerged ? '' : "--include_unmerged"
+            // collapsing options
+            params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged",
+            // trimming options
+            params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "",
+            params.shortread_adapter1 ? "--adapter_sequence ${params.shortread_adapter1}" : "",
+            !{ ${meta.single_end} } && params.shortread_adapter2 ? "--adapter_sequence_r2 ${params.shortread_adapter2}" : !{ ${meta.single_end} } ? "--detect_adapter_for_pe" : ""
+            // filtering options
+            "--length_required ${params.shortread_clipmerge_minlength}"
         ].join(' ').trim()
         publishDir = [
             path: { "${params.outdir}/fastp" },
diff --git a/nextflow.config b/nextflow.config
index cc77a99..a312d0c 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -55,9 +55,15 @@ params {
     databases = null
 
     // FASTQ preprocessing
-    shortread_clipmerge           = false
-    shortread_excludeunmerged     = true
-    longread_clip                 = false
+    shortread_clipmerge                     = false
+    shortread_clipmerge_tool                = 'fastp'
+    shortread_clipmerge_skiptrim            = false
+    shortread_clipmerge_mergepairs          = false
+    shortread_clipmerge_excludeunmerged     = true
+    shortread_clipmerge_adapter1            = null
+    shortread_clipmerge_adapter2            = null
+    shortread_clipmerge_minlength           = 15
+    longread_clip                           = false
 
     // MALT
     run_malt                   = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 9527da4..0fa217f 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -265,7 +265,7 @@
         "shortread_clipmerge": {
             "type": "boolean"
         },
-        "shortread_excludeunmerged": {
+        "shortread_clipmerge_excludeunmerged": {
             "type": "boolean",
             "default": true
         },
diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf
new file mode 100644
index 0000000..87aba25
--- /dev/null
+++ b/subworkflows/local/shortread_fastp.nf
@@ -0,0 +1,65 @@
+//
+// Check input samplesheet and get read channels
+//
+
+
+include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fastp/main'
+include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
+
+workflow SHORTREAD_FASTP {
+    take:
+    reads // file: /path/to/samplesheet.csv
+
+    main:
+    ch_versions = Channel.empty()
+    ch_multiqc_files      = Channel.empty()
+
+    //
+    // STEP: Read clipping and merging
+    //
+    // TODO give option to retain singletons (probably fastp option likely)
+    // TODO move to subworkflow
+
+    ch_input_for_fastp = reads
+                            .dump(tag: "pre-fastp_branch")
+                            .branch{
+                                single: it[0]['single_end'] == true
+                                paired: it[0]['single_end'] == false
+                            }
+
+    ch_input_for_fastp.single.dump(tag: "input_fastp_single")
+    ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
+
+    FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
+    FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs )
+
+    if ( params.shortread_clipmerge_mergepairs ) {
+        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
+                                    .mix( FASTP_SINGLE.out.reads )
+                                    .map {
+                                        meta, reads ->
+                                        def meta_new = meta.clone()
+                                        meta_new['single_end'] = 1
+                                        [ meta_new, reads ]
+                                        }
+    } else {
+        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads
+                                    .mix( FASTP_SINGLE.out.reads )
+    }
+
+    ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
+    ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
+
+    ch_processed_reads = ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped")
+
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
+
+    ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final")
+
+    emit:
+    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
+    versions = ch_versions          // channel: [ versions.yml ]
+    mqc      = ch_multiqc_files
+}
+
diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf
index d996a76..c31289d 100644
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@@ -3,17 +3,16 @@
 //
 
 
-include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fastp/main'
-include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
-include { FASTQC as FASTQC_POST       } from '../../modules/nf-core/modules/fastqc/main'
+include { SHORTREAD_FASTP             } from './shortread_fastp'
+include { FASTQC as FASTQC_PROCESSED       } from '../../modules/nf-core/modules/fastqc/main'
 
 workflow SHORTREAD_PREPROCESSING {
     take:
     reads // file: /path/to/samplesheet.csv
 
     main:
-    ch_versions = Channel.empty()
-    ch_multiqc_files      = Channel.empty()
+    ch_versions       = Channel.empty()
+    ch_multiqc_files  = Channel.empty()
 
     //
     // STEP: Read clipping and merging
@@ -22,50 +21,20 @@ workflow SHORTREAD_PREPROCESSING {
     // TODO give option to retain singletons (probably fastp option likely)
     // TODO move to subworkflow
 
-
-    if ( params.shortread_clipmerge ) {
-
-        ch_input_for_fastp = reads
-                                .dump(tag: "pre-fastp_branch")
-                                .branch{
-                                    single: it[0]['single_end'] == true
-                                    paired: it[0]['single_end'] == false
-                                }
-
-        ch_input_for_fastp.single.dump(tag: "input_fastp_single")
-        ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
-
-        FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
-        FASTP_PAIRED ( ch_input_for_fastp.paired, false, true )
-
-        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
-                                    .mix( FASTP_SINGLE.out.reads )
-                                    .map {
-                                        meta, reads ->
-                                        def meta_new = meta.clone()
-                                        meta_new['single_end'] = 1
-                                        [ meta_new, reads ]
-                                    }
-
-        FASTQC_POST ( ch_fastp_reads_prepped )
-
-        ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
-        ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
-
-        ch_processed_reads = ch_fastp_reads_prepped
-
-        ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} )
-        ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
-        ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
-
-        ch_multiqc_files.dump(tag: "preprocessing_mqc_final")
-
+    if ( params.shortread_clipmerge_tool == "fastp" ) {
+        ch_processed_reads = SHORTREAD_FASTP ( reads ).reads
+        ch_versions        =  ch_versions.mix( SHORTREAD_FASTP.out.versions )
+        ch_multiqc_files   =  ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc )
     } else {
         ch_processed_reads = reads
     }
 
+    //FASTQC_PROCESSED ( ch_processed_reads )
+    //ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
+    //ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
 
     emit:
+            // TODO: problem, this is being exported as a multi-channel output? This is why FASTQC is broken
     reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
     versions = ch_versions          // channel: [ versions.yml ]
     mqc      = ch_multiqc_files
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 29058e6..0a907bf 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -17,6 +17,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 // Check mandatory parameters
 if (params.input    ) { ch_input     = file(params.input)     } else { exit 1, 'Input samplesheet not specified!' }
 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
+if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not except uncollapsed paired-reads. Pairs will be profiled as separate files."
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -135,6 +136,7 @@ workflow TAXPROFILER {
 
     CAT_FASTQ ( ch_processed_for_combine.combine )
 
+    // TODO May need to flatten reads?
     ch_reads_for_profiling = ch_processed_for_combine.skip
                                 .dump(tag: "skip_combine")
                                 .mix( CAT_FASTQ.out.reads )

From ff1f28f4f0b8e88898e0f568f83e2da90ff81bc6 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Fri, 25 Mar 2022 15:01:25 +0100
Subject: [PATCH 02/21] Update schema

---
 conf/modules.config  |  4 ++--
 nextflow_schema.json | 38 +++++++++++++++++++++++++++++++++++---
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 36bc626..1052bed 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -69,8 +69,8 @@ process {
             params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged",
             // trimming options
             params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "",
-            params.shortread_adapter1 ? "--adapter_sequence ${params.shortread_adapter1}" : "",
-            !{ ${meta.single_end} } && params.shortread_adapter2 ? "--adapter_sequence_r2 ${params.shortread_adapter2}" : !{ ${meta.single_end} } ? "--detect_adapter_for_pe" : ""
+            params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
+            !{ ${meta.single_end} } && params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : !{ ${meta.single_end} } ? "--detect_adapter_for_pe" : "",
             // filtering options
             "--length_required ${params.shortread_clipmerge_minlength}"
         ].join(' ').trim()
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 0fa217f..42307e9 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,7 +10,10 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir"],
+            "required": [
+                "input",
+                "outdir"
+            ],
             "properties": {
                 "input": {
                     "type": "string",
@@ -173,7 +176,14 @@
                     "description": "Method used to save pipeline results to output directory.",
                     "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                     "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
+                        "symlink",
+                        "rellink",
+                        "link",
+                        "copy",
+                        "copyNoFollow",
+                        "move"
+                    ],
                     "hidden": true
                 },
                 "email_on_fail": {
@@ -281,6 +291,28 @@
         },
         "run_kraken2": {
             "type": "boolean"
+        },
+        "shortread_clipmerge_tool": {
+            "type": "string",
+            "default": "fastp"
+        },
+        "shortread_clipmerge_skiptrim": {
+            "type": "boolean"
+        },
+        "shortread_clipmerge_mergepairs": {
+            "type": "boolean"
+        },
+        "shortread_clipmerge_adapter1": {
+            "type": "string",
+            "default": null
+        },
+        "shortread_clipmerge_adapter2": {
+            "type": "string",
+            "default": null
+        },
+        "shortread_clipmerge_minlength": {
+            "type": "integer",
+            "default": 15
         }
     }
-}
+}
\ No newline at end of file

From dfcf8f7b1ae183a233d8a1c94f39a1f27084a966 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Fri, 25 Mar 2022 15:10:52 +0100
Subject: [PATCH 03/21] Prettier

---
 conf/test.config     |  1 -
 nextflow_schema.json | 16 +++-------------
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index 5924d7a..92a10e4 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -23,7 +23,6 @@ params {
     // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
     // TODO nf-core: Give any required params for the test so that command line flags are not needed
     input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
-    outdir              = "./results"
     databases           = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
     run_kraken2         = true
     run_malt            = true
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 42307e9..545f990 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,10 +10,7 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": [
-                "input",
-                "outdir"
-            ],
+            "required": ["input", "outdir"],
             "properties": {
                 "input": {
                     "type": "string",
@@ -176,14 +173,7 @@
                     "description": "Method used to save pipeline results to output directory.",
                     "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                     "fa_icon": "fas fa-copy",
-                    "enum": [
-                        "symlink",
-                        "rellink",
-                        "link",
-                        "copy",
-                        "copyNoFollow",
-                        "move"
-                    ],
+                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
                     "hidden": true
                 },
                 "email_on_fail": {
@@ -315,4 +305,4 @@
             "default": 15
         }
     }
-}
\ No newline at end of file
+}

From b5f5c755046b3ae1af9648e879dfa48519385891 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Fri, 25 Mar 2022 15:21:52 +0100
Subject: [PATCH 04/21] Fix file header descriptions

---
 subworkflows/local/longread_preprocessing.nf |  9 ++++++---
 subworkflows/local/shortread_fastp.nf        | 10 +++-------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf
index da1049a..58968e8 100644
--- a/subworkflows/local/longread_preprocessing.nf
+++ b/subworkflows/local/longread_preprocessing.nf
@@ -1,6 +1,9 @@
+/*
+Process long raw reads with porechop
+*/
 
-include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'
-include { PORECHOP              } from '../../modules/nf-core/modules/porechop/main'
+include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main'
+include { PORECHOP                   } from '../../modules/nf-core/modules/porechop/main'
 
 workflow LONGREAD_PREPROCESSING {
     take:
@@ -23,7 +26,7 @@ workflow LONGREAD_PREPROCESSING {
 
     FASTQC_POST ( PORECHOP.out.reads )
     ch_versions = ch_versions.mix(PORECHOP.out.versions.first())
-    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
 
 
     emit:
diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf
index 87aba25..f457cf3 100644
--- a/subworkflows/local/shortread_fastp.nf
+++ b/subworkflows/local/shortread_fastp.nf
@@ -1,7 +1,6 @@
-//
-// Check input samplesheet and get read channels
-//
-
+/*
+Process short raw reads with FastP
+*/
 
 include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fastp/main'
 include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
@@ -17,9 +16,6 @@ workflow SHORTREAD_FASTP {
     //
     // STEP: Read clipping and merging
     //
-    // TODO give option to retain singletons (probably fastp option likely)
-    // TODO move to subworkflow
-
     ch_input_for_fastp = reads
                             .dump(tag: "pre-fastp_branch")
                             .branch{

From 7ddcb09a85ed8c50f5d661b398f0e0a5e113add8 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Fri, 25 Mar 2022 15:22:41 +0100
Subject: [PATCH 05/21] Some more cleanup

---
 .../local/shortread_adapterremoval.nf         | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 subworkflows/local/shortread_adapterremoval.nf

diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf
new file mode 100644
index 0000000..e15d2ef
--- /dev/null
+++ b/subworkflows/local/shortread_adapterremoval.nf
@@ -0,0 +1,64 @@
+/*
+Process raw reads with AdapterRemoval
+*/
+
+include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fastp/main'
+include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
+
+workflow SHORTREAD_FASTP {
+    take:
+    reads // file: /path/to/samplesheet.csv
+
+    main:
+    ch_versions = Channel.empty()
+    ch_multiqc_files      = Channel.empty()
+
+    //
+    // STEP: Read clipping and merging
+    //
+    // TODO give option to retain singletons (probably fastp option likely)
+    // TODO move to subworkflow
+
+    ch_input_for_fastp = reads
+                            .dump(tag: "pre-fastp_branch")
+                            .branch{
+                                single: it[0]['single_end'] == true
+                                paired: it[0]['single_end'] == false
+                            }
+
+    ch_input_for_fastp.single.dump(tag: "input_fastp_single")
+    ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
+
+    FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
+    FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs )
+
+    if ( params.shortread_clipmerge_mergepairs ) {
+        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
+                                    .mix( FASTP_SINGLE.out.reads )
+                                    .map {
+                                        meta, reads ->
+                                        def meta_new = meta.clone()
+                                        meta_new['single_end'] = 1
+                                        [ meta_new, reads ]
+                                        }
+    } else {
+        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads
+                                    .mix( FASTP_SINGLE.out.reads )
+    }
+
+    ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
+    ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
+
+    ch_processed_reads = ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped")
+
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
+
+    ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final")
+
+    emit:
+    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
+    versions = ch_versions          // channel: [ versions.yml ]
+    mqc      = ch_multiqc_files
+}
+

From 0e6f7e2ca137fbb8306b733ab3abc3b6d8ad83f7 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Fri, 25 Mar 2022 15:24:10 +0100
Subject: [PATCH 06/21] Revert "Some more cleanup"

This reverts commit 7ddcb09a85ed8c50f5d661b398f0e0a5e113add8.
---
 .../local/shortread_adapterremoval.nf         | 64 -------------------
 1 file changed, 64 deletions(-)
 delete mode 100644 subworkflows/local/shortread_adapterremoval.nf

diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf
deleted file mode 100644
index e15d2ef..0000000
--- a/subworkflows/local/shortread_adapterremoval.nf
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
-Process raw reads with AdapterRemoval
-*/
-
-include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fastp/main'
-include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
-
-workflow SHORTREAD_FASTP {
-    take:
-    reads // file: /path/to/samplesheet.csv
-
-    main:
-    ch_versions = Channel.empty()
-    ch_multiqc_files      = Channel.empty()
-
-    //
-    // STEP: Read clipping and merging
-    //
-    // TODO give option to retain singletons (probably fastp option likely)
-    // TODO move to subworkflow
-
-    ch_input_for_fastp = reads
-                            .dump(tag: "pre-fastp_branch")
-                            .branch{
-                                single: it[0]['single_end'] == true
-                                paired: it[0]['single_end'] == false
-                            }
-
-    ch_input_for_fastp.single.dump(tag: "input_fastp_single")
-    ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
-
-    FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
-    FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs )
-
-    if ( params.shortread_clipmerge_mergepairs ) {
-        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
-                                    .mix( FASTP_SINGLE.out.reads )
-                                    .map {
-                                        meta, reads ->
-                                        def meta_new = meta.clone()
-                                        meta_new['single_end'] = 1
-                                        [ meta_new, reads ]
-                                        }
-    } else {
-        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads
-                                    .mix( FASTP_SINGLE.out.reads )
-    }
-
-    ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
-    ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
-
-    ch_processed_reads = ch_fastp_reads_prepped.dump(tag: "ch_fastp_reads_prepped")
-
-    ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
-    ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
-
-    ch_multiqc_files.dump(tag: "preprocessing_fastp_mqc_final")
-
-    emit:
-    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
-    versions = ch_versions          // channel: [ versions.yml ]
-    mqc      = ch_multiqc_files
-}
-

From b763bfa2c0a7dabe4ad8c4216cf852a3938a49b6 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Fri, 25 Mar 2022 15:26:57 +0100
Subject: [PATCH 07/21] More cleanup

---
 subworkflows/local/shortread_fastp.nf | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf
index f457cf3..57ce2a6 100644
--- a/subworkflows/local/shortread_fastp.nf
+++ b/subworkflows/local/shortread_fastp.nf
@@ -13,9 +13,6 @@ workflow SHORTREAD_FASTP {
     ch_versions = Channel.empty()
     ch_multiqc_files      = Channel.empty()
 
-    //
-    // STEP: Read clipping and merging
-    //
     ch_input_for_fastp = reads
                             .dump(tag: "pre-fastp_branch")
                             .branch{

From ede362dbf91df9f5118f0122f8bfc09d888cb141 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Fri, 25 Mar 2022 15:28:30 +0100
Subject: [PATCH 08/21] Remove duplicate config and sync longread fastqc

---
 conf/modules.config                          | 10 ----------
 subworkflows/local/longread_preprocessing.nf |  2 +-
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 1052bed..7d50174 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -90,16 +90,6 @@ process {
         ]
     }
 
-    withName: FASTQC_POST {
-        ext.args = '--quiet'
-        ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
-        publishDir = [
-            path: { "${params.outdir}/fastqc/processed" },
-            mode: 'copy',
-            pattern: '*.html'
-        ]
-    }
-
     withName: CAT_FASTQ {
         publishDir = [
             path: { "${params.outdir}/prepared_sequences" },
diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf
index 58968e8..7c7c24b 100644
--- a/subworkflows/local/longread_preprocessing.nf
+++ b/subworkflows/local/longread_preprocessing.nf
@@ -24,7 +24,7 @@ workflow LONGREAD_PREPROCESSING {
                                         [ meta_new, reads ]
                                     }
 
-    FASTQC_POST ( PORECHOP.out.reads )
+    FASTQC_PROCESSED ( PORECHOP.out.reads )
     ch_versions = ch_versions.mix(PORECHOP.out.versions.first())
     ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
 

From 000f129dab1b139cec6e9194670addf8d9e6173a Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates" <jfy133@gmail.com>
Date: Fri, 25 Mar 2022 15:32:56 +0100
Subject: [PATCH 09/21] Apply suggestions from code review

---
 subworkflows/local/shortread_preprocessing.nf | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf
index c31289d..c82536f 100644
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@@ -29,12 +29,11 @@ workflow SHORTREAD_PREPROCESSING {
         ch_processed_reads = reads
     }
 
-    //FASTQC_PROCESSED ( ch_processed_reads )
-    //ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
-    //ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
+    FASTQC_PROCESSED ( ch_processed_reads )
+    ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
 
     emit:
-            // TODO: problem, this is being exported as a multi-channel output? This is why FASTQC is broken
     reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
     versions = ch_versions          // channel: [ versions.yml ]
     mqc      = ch_multiqc_files

From 120382f51d184bed0375ab8c642f6bdab77a617f Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Sat, 26 Mar 2022 06:45:16 +0100
Subject: [PATCH 10/21] Fix input channels to kraken2

---
 workflows/taxprofiler.nf | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 0a907bf..c1d7f34 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -37,11 +37,11 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
-include { INPUT_CHECK         } from '../subworkflows/local/input_check'
+include { INPUT_CHECK             } from '../subworkflows/local/input_check'
 
-include { DB_CHECK            } from '../subworkflows/local/db_check'
+include { DB_CHECK                } from '../subworkflows/local/db_check'
 include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing'
-include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing'
+include { LONGREAD_PREPROCESSING  } from '../subworkflows/local/longread_preprocessing'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -171,7 +171,7 @@ workflow TAXPROFILER {
                                     [ temp_meta, it[1], db ]
                             }
                             .groupTuple(by: [0,2])
-                            .dump(tag: "input for malt")
+                            .dump(tag: "input_for_malt")
                             .multiMap {
                                 it ->
                                     reads: [ it[0], it[1].flatten() ]
@@ -180,10 +180,10 @@ workflow TAXPROFILER {
 
     // We can run Kraken2 one-by-one sample-wise
     ch_input_for_kraken2 =  ch_input_for_profiling.kraken2
-                            .dump(tag: "input for kraken")
+                            .dump(tag: "input_for_kraken")
                             .multiMap {
                                 it ->
-                                    reads: [ it[0] + it[2], it[1] ]
+                                    reads: [ it[0] + it[2], it[1].flatten() ]
                                     db: it[3]
                             }
 

From 622fafedc88489026d4c8cd5eb62319e736d1375 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates" <jfy133@gmail.com>
Date: Sat, 26 Mar 2022 20:22:35 +0000
Subject: [PATCH 11/21] Prempt centrifuge database untar crash

---
 subworkflows/local/db_check.nf | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 890e373..96f815a 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -12,6 +12,9 @@ workflow DB_CHECK {
     main:
 
     // TODO: make database sheet check
+    // Checks:
+    // 1) no duplicates,
+    // 2) dbs with no special arguments does not have quotes, e.g. just `,,` and NOT `,"",`
     parsed_samplesheet = DATABASE_CHECK ( dbsheet )
         .csv
         .splitCsv ( header:true, sep:',' )
@@ -21,7 +24,7 @@ workflow DB_CHECK {
 
     ch_dbs_for_untar = parsed_samplesheet
         .branch {
-            untar: it[1].toString().endsWith(".tar.gz")
+            untar: it[1].toString().endsWith(".tar.gz") && it[0]['tool'] != 'centrifuge'
             skip: true
         }
 

From e6e8ed7cc9904973ce6236c98eb74f8246c3ec09 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates" <jfy133@gmail.com>
Date: Sat, 26 Mar 2022 20:54:50 +0000
Subject: [PATCH 12/21] Add some debugging notes

---
 conf/modules.config      |  4 +--
 workflows/taxprofiler.nf | 60 +++++++++++++++++++++-------------------
 2 files changed, 34 insertions(+), 30 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 7d50174..5823d3f 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -105,7 +105,7 @@ process {
             pattern: '*.{rma6,tab,text,sam,log}'
         ]
         ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
     }
 
     withName: KRAKEN2_KRAKEN2 {
@@ -115,7 +115,7 @@ process {
             pattern: '*.{fastq.gz,txt}'
         ]
         ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
     }
 
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index c1d7f34..ee921ea 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -74,9 +74,9 @@ workflow TAXPROFILER {
 
     ch_versions = Channel.empty()
 
-    //
-    // SUBWORKFLOW: Read in samplesheet, validate and stage input files
-    //
+    /*
+        SUBWORKFLOW: Read in samplesheet, validate and stage input files
+    */
     INPUT_CHECK (
         ch_input
     )
@@ -86,9 +86,9 @@ workflow TAXPROFILER {
         ch_databases
     )
 
-    //
-    // MODULE: Run FastQC
-    //
+    /*
+        MODULE: Run FastQC
+    */
     ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq")
     FASTQC (
         ch_input_for_fastqc
@@ -99,9 +99,9 @@ workflow TAXPROFILER {
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )
 
-    //
-    // PERFORM PREPROCESSING
-    //
+    /*
+        SUBWORKFLOW: PERFORM PREPROCESSING
+    */
     if ( params.shortread_clipmerge ) {
         ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads
     } else {
@@ -116,16 +116,19 @@ workflow TAXPROFILER {
         ch_longreads_preprocessed = INPUT_CHECK.out.nanopore
     }
 
-    //
-    // PERFORM SHORT READ RUN MERGING
+    /*
+        MODULE: PERFORM SHORT READ RUN MERGING
+    */
+
     // TODO: Check not necessary for long reads too?
-    //
+    // TODO: source of clash - combined should only occur when
+    // files ARE to be combined. SE/unmerged (see not below)
     ch_processed_for_combine = ch_shortreads_preprocessed
         .dump(tag: "prep_for_combine_grouping")
         .map {
             meta, reads ->
             def meta_new = meta.clone()
-            meta_new['run_accession'] = 'combined'
+            //meta_new['run_accession'] = 'combined'
             [ meta_new, reads ]
         }
         .groupTuple ( by: 0 )
@@ -134,17 +137,18 @@ workflow TAXPROFILER {
             skip: it[1].size() < 2
         }
 
+    // NOTE: this does not allow CATing of SE & PE runs of same sample
+    // when --shortread_clipmerge_mergepairs is false
     CAT_FASTQ ( ch_processed_for_combine.combine )
 
-    // TODO May need to flatten reads?
     ch_reads_for_profiling = ch_processed_for_combine.skip
                                 .dump(tag: "skip_combine")
                                 .mix( CAT_FASTQ.out.reads )
                                 .dump(tag: "files_for_profiling")
 
-    //
-    // COMBINE READS WITH POSSIBLE DATABASES
-    //
+    /*
+        COMBINE READS WITH POSSIBLE DATABASES
+    */
 
     // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
     ch_input_for_profiling = ch_reads_for_profiling
@@ -157,9 +161,9 @@ workflow TAXPROFILER {
                 unknown: true
             }
 
-    //
-    // PREPARE PROFILER INPUT CHANNELS
-    //
+    /*
+        PREPARE PROFILER INPUT CHANNELS
+    */
 
     // We groupTuple to have all samples in one channel for MALT as database
     // loading takes a long time, so we only want to run it once per database
@@ -171,7 +175,7 @@ workflow TAXPROFILER {
                                     [ temp_meta, it[1], db ]
                             }
                             .groupTuple(by: [0,2])
-                            .dump(tag: "input_for_malt")
+                            .dump(tag: "input_to_malt")
                             .multiMap {
                                 it ->
                                     reads: [ it[0], it[1].flatten() ]
@@ -180,16 +184,16 @@ workflow TAXPROFILER {
 
     // We can run Kraken2 one-by-one sample-wise
     ch_input_for_kraken2 =  ch_input_for_profiling.kraken2
-                            .dump(tag: "input_for_kraken")
+                            .dump(tag: "input_to_kraken")
                             .multiMap {
                                 it ->
                                     reads: [ it[0] + it[2], it[1].flatten() ]
                                     db: it[3]
                             }
 
-    //
-    // RUN PROFILING
-    //
+    /*
+        MODULE: RUN PROFILING
+    */
     if ( params.run_malt ) {
         MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
     }
@@ -198,9 +202,9 @@ workflow TAXPROFILER {
         KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db  )
     }
 
-    //
-    // MODULE: MultiQC
-    //
+    /*
+        MODULE: MultiQC
+    */
     workflow_summary    = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params)
     ch_workflow_summary = Channel.value(workflow_summary)
 

From 8dc9e583add80f51111dc6c796a2ea1413e8731b Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Sun, 27 Mar 2022 09:30:23 +0200
Subject: [PATCH 13/21] Debugging run merging

---
 conf/modules.config                           |  6 ++---
 .../nf-core/modules/kraken2/kraken2/main.nf   |  3 ++-
 subworkflows/local/shortread_fastp.nf         | 10 ++++---
 subworkflows/local/shortread_preprocessing.nf |  7 -----
 workflows/taxprofiler.nf                      | 27 ++++++++++++++-----
 5 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 5823d3f..41f471f 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -65,7 +65,7 @@ process {
     withName: FASTP {
         ext.prefix = { "${meta.id}_${meta.run_accession}" }
         ext.args   = [
-            // collapsing options
+            // collapsing options - option to retain singletons
             params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged",
             // trimming options
             params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "",
@@ -105,7 +105,7 @@ process {
             pattern: '*.{rma6,tab,text,sam,log}'
         ]
         ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.db_name}" }
     }
 
     withName: KRAKEN2_KRAKEN2 {
@@ -115,7 +115,7 @@ process {
             pattern: '*.{fastq.gz,txt}'
         ]
         ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.db_name}" }
     }
 
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
diff --git a/modules/nf-core/modules/kraken2/kraken2/main.nf b/modules/nf-core/modules/kraken2/kraken2/main.nf
index 3ec5df5..52351f5 100644
--- a/modules/nf-core/modules/kraken2/kraken2/main.nf
+++ b/modules/nf-core/modules/kraken2/kraken2/main.nf
@@ -32,11 +32,12 @@ process KRAKEN2_KRAKEN2 {
         --threads $task.cpus \\
         --unclassified-out $unclassified \\
         --classified-out $classified \\
+        $args \\
         --report ${prefix}.kraken2.report.txt \\
         --gzip-compressed \\
         $paired \\
-        $args \\
         $reads
+        
 
     pigz -p $task.cpus *.fastq
 
diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf
index 57ce2a6..d4d706e 100644
--- a/subworkflows/local/shortread_fastp.nf
+++ b/subworkflows/local/shortread_fastp.nf
@@ -7,7 +7,7 @@ include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fast
 
 workflow SHORTREAD_FASTP {
     take:
-    reads // file: /path/to/samplesheet.csv
+    reads // [[meta], [reads]]
 
     main:
     ch_versions = Channel.empty()
@@ -24,16 +24,18 @@ workflow SHORTREAD_FASTP {
     ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
 
     FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
+    // Last parameter here turns on merging of PE data
     FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs )
 
     if ( params.shortread_clipmerge_mergepairs ) {
+        // TODO update to replace meta suffix
         ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
                                     .mix( FASTP_SINGLE.out.reads )
                                     .map {
                                         meta, reads ->
-                                        def meta_new = meta.clone()
-                                        meta_new['single_end'] = 1
-                                        [ meta_new, reads ]
+                                            def meta_new = meta.clone()
+                                            meta_new['single_end'] = 1
+                                            [ meta_new, reads ]
                                         }
     } else {
         ch_fastp_reads_prepped = FASTP_PAIRED.out.reads
diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf
index c82536f..7fba0c0 100644
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@@ -14,13 +14,6 @@ workflow SHORTREAD_PREPROCESSING {
     ch_versions       = Channel.empty()
     ch_multiqc_files  = Channel.empty()
 
-    //
-    // STEP: Read clipping and merging
-    //
-    // TODO give option to clip only and retain pairs
-    // TODO give option to retain singletons (probably fastp option likely)
-    // TODO move to subworkflow
-
     if ( params.shortread_clipmerge_tool == "fastp" ) {
         ch_processed_reads = SHORTREAD_FASTP ( reads ).reads
         ch_versions        =  ch_versions.mix( SHORTREAD_FASTP.out.versions )
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index ee921ea..87f7a30 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -120,25 +120,40 @@ workflow TAXPROFILER {
         MODULE: PERFORM SHORT READ RUN MERGING
     */
 
-    // TODO: Check not necessary for long reads too?
-    // TODO: source of clash - combined should only occur when
-    // files ARE to be combined. SE/unmerged (see not below)
+    // Remove run accession to allow grouping by sample. Will only merge
+    // if pairment type is the same.
+
+    // TODO Current Branch system currently problematic - when single file not in a list, splits at
+    // `/` so makes list >= 2, so tries to merge, but then breaks kraken downstream
+    // e.g. `home jfellows Documents git nf-core taxprofiler testing work 68 9a2c8362add37832a776058d280bb7 2612_se.merged.fastq.gz`
+    // So theoretically need to force this into a list, (but results the can't access meta.id error as incorrect  input format)
+    // But second issue >= 2 is MAYBE sufficient because what if merging two paired-end files? Need to chcek if the input channel formatted correctly for this? Need to check...
     ch_processed_for_combine = ch_shortreads_preprocessed
         .dump(tag: "prep_for_combine_grouping")
         .map {
             meta, reads ->
             def meta_new = meta.clone()
-            //meta_new['run_accession'] = 'combined'
+
+            // remove run accession to allow group by sample
+            meta_new.remove('run_accession')
+
+            // update id to prevent file name clashes when unable to group
+            // unmerged PE and SE runs of same sample
+            def type = meta_new['single_end'] ? "_se" : "_pe"
+            meta_new['id'] = meta['id'] + type
+
             [ meta_new, reads ]
         }
         .groupTuple ( by: 0 )
+        .dump(tag: "files_for_cat_fastq_branch")
         .branch{
-            combine: it[1].size() >= 2
-            skip: it[1].size() < 2
+            combine: it[1] && it[1].size() > 1
+            skip: true
         }
 
     // NOTE: this does not allow CATing of SE & PE runs of same sample
     // when --shortread_clipmerge_mergepairs is false
+    ch_processed_for_combine.combine.dump(tag: "input_into_cat_fastq")
     CAT_FASTQ ( ch_processed_for_combine.combine )
 
     ch_reads_for_profiling = ch_processed_for_combine.skip

From ee53283696261fa2687fa55402e3cfa3da164509 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Mon, 28 Mar 2022 09:13:05 +0200
Subject: [PATCH 14/21] Debugging comment

---
 workflows/taxprofiler.nf | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 87f7a30..8a05720 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -118,7 +118,6 @@ workflow TAXPROFILER {
 
     /*
         MODULE: PERFORM SHORT READ RUN MERGING
-    */
 
     // Remove run accession to allow grouping by sample. Will only merge
     // if pairment type is the same.
@@ -128,6 +127,7 @@ workflow TAXPROFILER {
     // e.g. `home jfellows Documents git nf-core taxprofiler testing work 68 9a2c8362add37832a776058d280bb7 2612_se.merged.fastq.gz`
     // So theoretically need to force this into a list, (but results the can't access meta.id error as incorrect  input format)
     // But second issue >= 2 is MAYBE sufficient because what if merging two paired-end files? Need to chcek if the input channel formatted correctly for this? Need to check...
+
     ch_processed_for_combine = ch_shortreads_preprocessed
         .dump(tag: "prep_for_combine_grouping")
         .map {
@@ -160,6 +160,9 @@ workflow TAXPROFILER {
                                 .dump(tag: "skip_combine")
                                 .mix( CAT_FASTQ.out.reads )
                                 .dump(tag: "files_for_profiling")
+    */
+
+    ch_reads_for_profiling = ch_shortreads_preprocessed
 
     /*
         COMBINE READS WITH POSSIBLE DATABASES
@@ -198,6 +201,7 @@ workflow TAXPROFILER {
                             }
 
     // We can run Kraken2 one-by-one sample-wise
+    // TODO Only flatten when paired-end! Causing issue commented out above!
     ch_input_for_kraken2 =  ch_input_for_profiling.kraken2
                             .dump(tag: "input_to_kraken")
                             .multiMap {

From 231253227c7e59c1c108cf4cf4953cb89c4f6f5f Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Mon, 28 Mar 2022 16:38:10 +0200
Subject: [PATCH 15/21] Remove run merging for now due to complexity

---
 conf/modules.config                   |  4 +-
 subworkflows/local/shortread_fastp.nf | 17 +++++----
 workflows/taxprofiler.nf              | 55 ++-------------------------
 3 files changed, 14 insertions(+), 62 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 41f471f..a9bc3a1 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -105,7 +105,7 @@ process {
             pattern: '*.{rma6,tab,text,sam,log}'
         ]
         ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
     }
 
     withName: KRAKEN2_KRAKEN2 {
@@ -115,7 +115,7 @@ process {
             pattern: '*.{fastq.gz,txt}'
         ]
         ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
     }
 
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf
index d4d706e..c2e435c 100644
--- a/subworkflows/local/shortread_fastp.nf
+++ b/subworkflows/local/shortread_fastp.nf
@@ -28,15 +28,16 @@ workflow SHORTREAD_FASTP {
     FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs )
 
     if ( params.shortread_clipmerge_mergepairs ) {
-        // TODO update to replace meta suffix
-        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
-                                    .mix( FASTP_SINGLE.out.reads )
-                                    .map {
-                                        meta, reads ->
-                                            def meta_new = meta.clone()
-                                            meta_new['single_end'] = 1
-                                            [ meta_new, reads ]
+        ch_fastp_reads_prepped_pe = FASTP_PAIRED.out.reads_merged
+                                        .map {
+                                            meta, reads ->
+                                                def meta_new = meta.clone()
+                                                meta_new['single_end'] = 1
+                                                [ meta_new, reads ]
                                         }
+
+        ch_fastp_reads_prepped = ch_fastp_reads_prepped_pe.mix( FASTP_SINGLE.out.reads )
+
     } else {
         ch_fastp_reads_prepped = FASTP_PAIRED.out.reads
                                     .mix( FASTP_SINGLE.out.reads )
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 8a05720..e73d94d 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -116,63 +116,15 @@ workflow TAXPROFILER {
         ch_longreads_preprocessed = INPUT_CHECK.out.nanopore
     }
 
-    /*
-        MODULE: PERFORM SHORT READ RUN MERGING
-
-    // Remove run accession to allow grouping by sample. Will only merge
-    // if pairment type is the same.
-
-    // TODO Current Branch system currently problematic - when single file not in a list, splits at
-    // `/` so makes list >= 2, so tries to merge, but then breaks kraken downstream
-    // e.g. `home jfellows Documents git nf-core taxprofiler testing work 68 9a2c8362add37832a776058d280bb7 2612_se.merged.fastq.gz`
-    // So theoretically need to force this into a list, (but results the can't access meta.id error as incorrect  input format)
-    // But second issue >= 2 is MAYBE sufficient because what if merging two paired-end files? Need to chcek if the input channel formatted correctly for this? Need to check...
-
-    ch_processed_for_combine = ch_shortreads_preprocessed
-        .dump(tag: "prep_for_combine_grouping")
-        .map {
-            meta, reads ->
-            def meta_new = meta.clone()
-
-            // remove run accession to allow group by sample
-            meta_new.remove('run_accession')
-
-            // update id to prevent file name clashes when unable to group
-            // unmerged PE and SE runs of same sample
-            def type = meta_new['single_end'] ? "_se" : "_pe"
-            meta_new['id'] = meta['id'] + type
-
-            [ meta_new, reads ]
-        }
-        .groupTuple ( by: 0 )
-        .dump(tag: "files_for_cat_fastq_branch")
-        .branch{
-            combine: it[1] && it[1].size() > 1
-            skip: true
-        }
-
-    // NOTE: this does not allow CATing of SE & PE runs of same sample
-    // when --shortread_clipmerge_mergepairs is false
-    ch_processed_for_combine.combine.dump(tag: "input_into_cat_fastq")
-    CAT_FASTQ ( ch_processed_for_combine.combine )
-
-    ch_reads_for_profiling = ch_processed_for_combine.skip
-                                .dump(tag: "skip_combine")
-                                .mix( CAT_FASTQ.out.reads )
-                                .dump(tag: "files_for_profiling")
-    */
-
-    ch_reads_for_profiling = ch_shortreads_preprocessed
-
     /*
         COMBINE READS WITH POSSIBLE DATABASES
     */
 
     // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
-    ch_input_for_profiling = ch_reads_for_profiling
+    ch_input_for_profiling = ch_shortreads_preprocessed
             .mix( ch_longreads_preprocessed )
             .combine(DB_CHECK.out.dbs)
-            .dump(tag: "reads_plus_db")
+            .dump(tag: "reads_plus_db_clean")
             .branch {
                 malt:    it[2]['tool'] == 'malt'
                 kraken2: it[2]['tool'] == 'kraken2'
@@ -201,12 +153,11 @@ workflow TAXPROFILER {
                             }
 
     // We can run Kraken2 one-by-one sample-wise
-    // TODO Only flatten when paired-end! Causing issue commented out above!
     ch_input_for_kraken2 =  ch_input_for_profiling.kraken2
                             .dump(tag: "input_to_kraken")
                             .multiMap {
                                 it ->
-                                    reads: [ it[0] + it[2], it[1].flatten() ]
+                                    reads: [ it[0] + it[2], it[1] ]
                                     db: it[3]
                             }
 

From 98dc8014a526209b24bdd55e321c10b2dc007a88 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Mon, 28 Mar 2022 16:46:45 +0200
Subject: [PATCH 16/21] Fix kraken2 module

---
 modules.json                                    | 8 ++++----
 modules/nf-core/modules/kraken2/kraken2/main.nf | 3 +--
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/modules.json b/modules.json
index 9e6428a..2494fc1 100644
--- a/modules.json
+++ b/modules.json
@@ -24,12 +24,12 @@
             "multiqc": {
                 "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
             },
-            "untar": {
-                "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918"
-            },
             "porechop": {
                 "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046"
+            },
+            "untar": {
+                "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918"
             }
         }
     }
-}
+}
\ No newline at end of file
diff --git a/modules/nf-core/modules/kraken2/kraken2/main.nf b/modules/nf-core/modules/kraken2/kraken2/main.nf
index 52351f5..3ec5df5 100644
--- a/modules/nf-core/modules/kraken2/kraken2/main.nf
+++ b/modules/nf-core/modules/kraken2/kraken2/main.nf
@@ -32,12 +32,11 @@ process KRAKEN2_KRAKEN2 {
         --threads $task.cpus \\
         --unclassified-out $unclassified \\
         --classified-out $classified \\
-        $args \\
         --report ${prefix}.kraken2.report.txt \\
         --gzip-compressed \\
         $paired \\
+        $args \\
         $reads
-        
 
     pigz -p $task.cpus *.fastq
 

From eada201eb28f3572af5481a6d69e4f87407a352d Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Mon, 28 Mar 2022 16:47:37 +0200
Subject: [PATCH 17/21] Prettier

---
 modules.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules.json b/modules.json
index 2494fc1..34ef9f1 100644
--- a/modules.json
+++ b/modules.json
@@ -32,4 +32,4 @@
             }
         }
     }
-}
\ No newline at end of file
+}

From 94e5cfef4aa7c2dd69e72e7b1b12439110ed1a09 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Mon, 28 Mar 2022 18:20:10 +0200
Subject: [PATCH 18/21] Filter long reads for MALT, bump cpus for FastQC for
 minigut to pass

---
 conf/base.config               |  2 +-
 conf/test.config               | 15 ++++++++++++++-
 subworkflows/local/db_check.nf |  2 +-
 workflows/taxprofiler.nf       |  6 ++++++
 4 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/conf/base.config b/conf/base.config
index 0c65574..2f0f97f 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -27,7 +27,7 @@ process {
     // TODO nf-core: Customise requirements for specific processes.
     // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
     withLabel:process_low {
-        cpus   = { check_max( 2     * task.attempt, 'cpus'    ) }
+        cpus   = { check_max( 4     * task.attempt, 'cpus'    ) }
         memory = { check_max( 12.GB * task.attempt, 'memory'  ) }
         time   = { check_max( 4.h   * task.attempt, 'time'    ) }
     }
diff --git a/conf/test.config b/conf/test.config
index 92a10e4..26972df 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -15,7 +15,7 @@ params {
     config_profile_description = 'Minimal test dataset to check pipeline function'
 
     // Limit resources so that this can run on GitHub Actions
-    max_cpus   = 2
+    max_cpus   = 8
     max_memory = '6.GB'
     max_time   = '6.h'
 
@@ -29,3 +29,16 @@ params {
     shortread_clipmerge = true
 
 }
+
+process {
+    withName: FASTQC {
+        cpus   = { check_max( 8     * task.attempt, 'cpus'    ) }
+        memory = { check_max( 6.GB * task.attempt, 'memory'  ) }
+        time   = { check_max( 6.h   * task.attempt, 'time'    ) }
+    }
+    withName: FASTQC_PROCESSED {
+        cpus   = { check_max( 8     * task.attempt, 'cpus'    ) }
+        memory = { check_max( 6.GB * task.attempt, 'memory'  ) }
+        time   = { check_max( 6.h   * task.attempt, 'time'    ) }
+    }
+}
diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 96f815a..bda5cfe 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -14,7 +14,7 @@ workflow DB_CHECK {
     // TODO: make database sheet check
     // Checks:
     // 1) no duplicates,
-    // 2) dbs with no special arguments does not have quotes, e.g. just `,,` and NOT `,"",`
+    // 2) args do not have quotes, e.g. just `,,` and NOT `,"",`
     parsed_samplesheet = DATABASE_CHECK ( dbsheet )
         .csv
         .splitCsv ( header:true, sep:',' )
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index e73d94d..af5c54d 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -90,9 +90,11 @@ workflow TAXPROFILER {
         MODULE: Run FastQC
     */
     ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq")
+
     FASTQC (
         ch_input_for_fastqc
     )
+
     ch_versions = ch_versions.mix(FASTQC.out.versions.first())
 
     CUSTOM_DUMPSOFTWAREVERSIONS (
@@ -137,7 +139,11 @@ workflow TAXPROFILER {
 
     // We groupTuple to have all samples in one channel for MALT as database
     // loading takes a long time, so we only want to run it once per database
+    // TODO document somewhere we only accept illumina short reads for MALT?
     ch_input_for_malt =  ch_input_for_profiling.malt
+                            .dump(tag: "input_to_malt_prefilter")
+                            .filter { it[0]['instrument_platform'] == 'ILLUMINA' }
+                            .dump(tag: "input_to_malt_postfilter")
                             .map {
                                 it ->
                                     def temp_meta =  [ id: it[2]['db_name']]  + it[2]

From 04fb6f412752811ac8fc621c8730d418986a1d08 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Mon, 28 Mar 2022 20:38:16 +0200
Subject: [PATCH 19/21] Use new resources for test data

---
 conf/test.config | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index 26972df..92a10e4 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -15,7 +15,7 @@ params {
     config_profile_description = 'Minimal test dataset to check pipeline function'
 
     // Limit resources so that this can run on GitHub Actions
-    max_cpus   = 8
+    max_cpus   = 2
     max_memory = '6.GB'
     max_time   = '6.h'
 
@@ -29,16 +29,3 @@ params {
     shortread_clipmerge = true
 
 }
-
-process {
-    withName: FASTQC {
-        cpus   = { check_max( 8     * task.attempt, 'cpus'    ) }
-        memory = { check_max( 6.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 6.h   * task.attempt, 'time'    ) }
-    }
-    withName: FASTQC_PROCESSED {
-        cpus   = { check_max( 8     * task.attempt, 'cpus'    ) }
-        memory = { check_max( 6.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 6.h   * task.attempt, 'time'    ) }
-    }
-}

From 98a9688329e66814764746a9219580e841e84477 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Tue, 29 Mar 2022 15:18:06 +0200
Subject: [PATCH 20/21] Change CPUs back to template, update skiptrim fiag

---
 conf/base.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/base.config b/conf/base.config
index 2f0f97f..0c65574 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -27,7 +27,7 @@ process {
     // TODO nf-core: Customise requirements for specific processes.
     // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
     withLabel:process_low {
-        cpus   = { check_max( 4     * task.attempt, 'cpus'    ) }
+        cpus   = { check_max( 2     * task.attempt, 'cpus'    ) }
         memory = { check_max( 12.GB * task.attempt, 'memory'  ) }
         time   = { check_max( 4.h   * task.attempt, 'time'    ) }
     }

From 76e9624e3f04e573a56c1443884b2ca385488319 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Wed, 30 Mar 2022 11:31:37 +0200
Subject: [PATCH 21/21] NOW commit the adapte rtrim flag

---
 conf/modules.config  | 2 +-
 nextflow.config      | 2 +-
 nextflow_schema.json | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index a9bc3a1..71eaa75 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -68,7 +68,7 @@ process {
             // collapsing options - option to retain singletons
             params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged",
             // trimming options
-            params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "",
+            params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "",
             params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
             !{ ${meta.single_end} } && params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : !{ ${meta.single_end} } ? "--detect_adapter_for_pe" : "",
             // filtering options
diff --git a/nextflow.config b/nextflow.config
index a312d0c..6fde513 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -57,7 +57,7 @@ params {
     // FASTQ preprocessing
     shortread_clipmerge                     = false
     shortread_clipmerge_tool                = 'fastp'
-    shortread_clipmerge_skiptrim            = false
+    shortread_clipmerge_skipadaptertrim     = false
     shortread_clipmerge_mergepairs          = false
     shortread_clipmerge_excludeunmerged     = true
     shortread_clipmerge_adapter1            = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 545f990..adc5a73 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -286,7 +286,7 @@
             "type": "string",
             "default": "fastp"
         },
-        "shortread_clipmerge_skiptrim": {
+        "shortread_clipmerge_skipadaptertrim": {
             "type": "boolean"
         },
         "shortread_clipmerge_mergepairs": {