Merge pull request #272 from nf-core/final-reads-saving

Add 'final reads for profiling' saving mechanism
2024-11-24 04:09:54 +00:00 · 2023-04-21 15:21:35 +02:00 · 2023-04-21 15:21:35 +02:00 · 2140928638
commit 2140928638
parent 003862d9ea ee76a5f38b
8 changed files with 229 additions and 74 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### `Added`

+- [#272](https://github.com/nf-core/taxprofiler/pull/272) - Add saving of final 'analysis-ready-reads' to dedicated directory. (❤️ to @alexhbnr for reporting, added by @jfy133)
+
 ### `Fixed`

 - [#271](https://github.com/nf-core/taxprofiler/pull/271/files) Improved standardised table generation documentation and mOTUs manual database download tutorial (♥ to @prototaxites for reporting, fix by @jfy133)
--- a/conf/modules.config
+++ b/conf/modules.config
@ -71,6 +71,14 @@ process {
                path: { "${params.outdir}/fastp" },
                mode: params.publish_dir_mode,
                pattern: '*.{log,html,json}'
+            ],
+            [
+                path: { "${params.outdir}/analysis_ready_fastqs" },
+                mode: params.publish_dir_mode,
+                pattern: '*.fastq.gz',
+                enabled: params.save_analysis_ready_fastqs,
+                // Don't know why `!` doesn't work here, but `== false` makes it work...
+                saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null }
            ]
        ]
    }
@ -99,6 +107,13 @@ process {
                path: { "${params.outdir}/fastp" },
                mode: params.publish_dir_mode,
                pattern: '*.{log,html,json}'
+            ],
+            [
+                path: { "${params.outdir}/analysis_ready_fastqs" },
+                mode: params.publish_dir_mode,
+                pattern: params.shortread_qc_mergepairs ? '*merged.fastq.gz' : '*.fastp.fastq.gz',
+                enabled: params.save_analysis_ready_fastqs,
+                saveAs:  { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null }
            ]
        ]
    }
@ -122,6 +137,13 @@ process {
                path: { "${params.outdir}/adapterremoval" },
                mode: params.publish_dir_mode,
                pattern: '*.settings'
+            ],
+            [
+                path: { "${params.outdir}/analysis_ready_fastqs" },
+                mode: params.publish_dir_mode,
+                pattern: '*truncated.fastq.gz',
+                enabled: params.save_analysis_ready_fastqs,
+                saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null }
            ]
        ]
    }
@ -148,6 +170,27 @@ process {
                path: { "${params.outdir}/adapterremoval" },
                mode: params.publish_dir_mode,
                pattern: '*.settings'
+            ],
+            [
+                path: { "${params.outdir}/analysis_ready_fastqs" },
+                mode: params.publish_dir_mode,
+                pattern: '*{truncated.fastq,singleton.truncated}.gz',
+                enabled: params.save_analysis_ready_fastqs,
+                saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && !params.shortread_qc_mergepairs && params.save_analysis_ready_fastqs ? it : null}
+            ]
+        ]
+    }
+
+    // AdapterRemoval separate output merging
+    withName: CAT_FASTQ {
+        ext.prefix = { "${meta.id}_${meta.run_accession}" }
+        publishDir = [
+            [
+                path: { "${params.outdir}/analysis_ready_fastqs" },
+                mode: params.publish_dir_mode,
+                pattern: '*.fastq.gz',
+                enabled: params.save_analysis_ready_fastqs,
+                saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null }
            ]
        ]
    }
@ -165,6 +208,13 @@ process {
                path: { "${params.outdir}/porechop" },
                mode: params.publish_dir_mode,
                pattern: '*.log'
+            ],
+            [
+                path: { "${params.outdir}/analysis_ready_fastqs" },
+                mode: params.publish_dir_mode,
+                pattern: '*_porechopped.fastq.gz',
+                enabled: params.save_analysis_ready_fastqs,
+                saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_longread_hostremoval && params.longread_qc_skipqualityfilter && !params.longread_qc_skipadaptertrim && params.perform_longread_qc && params.save_analysis_ready_fastqs ? it : null }
            ]
        ]
    }
@ -188,17 +238,83 @@ process {
                path: { "${params.outdir}/filtlong" },
                mode: params.publish_dir_mode,
                pattern: '*.log'
+            ],
+            [
+                path: { "${params.outdir}/analysis_ready_fastqs" },
+                mode: params.publish_dir_mode,
+                pattern: '*.fastq.gz',
+                enabled: params.save_analysis_ready_fastqs,
+                saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_longread_hostremoval && !params.longread_qc_skipqualityfilter && params.perform_longread_qc && params.save_analysis_ready_fastqs ? it : null }
+            ]
+        ]
+    }
+
+    withName: BBMAP_BBDUK {
+        ext.args =  [
+                "entropy=${params.shortread_complexityfilter_entropy}",
+                "entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}",
+                params.shortread_complexityfilter_bbduk_mask ?  "entropymask=t" : "entropymask=f"
+            ].join(' ').trim()
+        ext.prefix = { "${meta.id}_${meta.run_accession}" }
+        publishDir = [
+            [
+                path: { "${params.outdir}/bbduk/" },
+                mode: params.publish_dir_mode,
+                pattern: '*.{fastq.gz}',
+                enabled: params.save_complexityfiltered_reads
+            ],
+            [
+                path: { "${params.outdir}/bbduk/" },
+                mode: params.publish_dir_mode,
+                pattern: '*.log'
+            ],
+            [
+                path: { "${params.outdir}/analysis_ready_fastqs" },
+                mode: params.publish_dir_mode,
+                pattern: '*.fastq.gz',
+                enabled: params.save_analysis_ready_fastqs,
+                saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && params.shortread_complexityfilter_tool && params.save_analysis_ready_fastqs ? it : null }
+            ]
+        ]
+    }
+
+    withName: PRINSEQPLUSPLUS {
+        ext.args =  [
+                params.shortread_complexityfilter_prinseqplusplus_mode == 'dust' ? "-lc_dust=${params.shortread_complexityfilter_prinseqplusplus_dustscore}" : "-lc_entropy=${params.shortread_complexityfilter_entropy}",
+                "-trim_qual_left=0 -trim_qual_left=0 -trim_qual_window=0 -trim_qual_step=0",
+            ].join(' ').trim()
+        ext.prefix = { "${meta.id}_${meta.run_accession}" }
+        publishDir = [
+            [
+                path: { "${params.outdir}/prinseqplusplus/" },
+                mode: params.publish_dir_mode,
+                pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz}',
+                enabled: params.save_complexityfiltered_reads
+            ],
+            [
+                path: { "${params.outdir}/prinseqplusplus/" },
+                mode: params.publish_dir_mode,
+                pattern: '*.log'
+            ],
+            [
+                path: { "${params.outdir}/analysis_ready_fastqs" },
+                mode: params.publish_dir_mode,
+                pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz}',
+                enabled: params.save_analysis_ready_fastqs,
+                saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && params.shortread_complexityfilter_tool && params.save_analysis_ready_fastqs ? it : null }
            ]
        ]
    }

    withName: BOWTIE2_BUILD {
        publishDir = [
+            [
            path: { "${params.outdir}/bowtie2/build" },
            mode: params.publish_dir_mode,
            pattern: 'bowtie2',
            enabled: params.save_hostremoval_index
            ]
+        ]
    }

    // Saving unmapped reads as FQ comes via input channel!
@ -221,6 +337,14 @@ process {
                mode: params.publish_dir_mode,
                pattern: '*.fastq.gz',
                enabled: params.save_hostremoval_unmapped
+            ],
+            [
+                path: { "${params.outdir}/analysis_ready_fastqs" },
+                mode: params.publish_dir_mode,
+                // Only one 'enabled' key: a duplicate map key is overridden by the later entry; host-removal gating is done in saveAs
+                pattern: '*.fastq.gz',
+                enabled: params.save_analysis_ready_fastqs,
+                saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && params.perform_shortread_hostremoval && params.save_analysis_ready_fastqs ? it : null }
            ]
        ]
    }
@ -253,10 +377,19 @@ process {
    withName: SAMTOOLS_FASTQ {
        ext.prefix = { "${meta.id}_${meta.run_accession}.unmapped" }
        publishDir = [
+            [
                path: { "${params.outdir}/samtools/fastq" },
                mode: params.publish_dir_mode,
                pattern: '*.fastq.gz',
                enabled: params.save_hostremoval_unmapped
+            ],
+            [
+                path: { "${params.outdir}/analysis_ready_fastqs" },
+                mode: params.publish_dir_mode,
+                pattern: '*.fq.gz',
+                enabled: params.save_analysis_ready_fastqs,
+                saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun) ) && params.perform_longread_hostremoval && params.save_analysis_ready_fastqs ? it : null }
+            ]
        ]
    }

@ -269,56 +402,21 @@ process {
        ]
    }

-    withName: BBMAP_BBDUK {
-        ext.args =  [
-                "entropy=${params.shortread_complexityfilter_entropy}",
-                "entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}",
-                params.shortread_complexityfilter_bbduk_mask ?  "entropymask=t" : "entropymask=f"
-            ].join(' ').trim()
-        ext.prefix = { "${meta.id}_${meta.run_accession}" }
-        publishDir = [
-            [
-                path: { "${params.outdir}/bbduk/" },
-                mode: params.publish_dir_mode,
-                pattern: '*.{fastq.gz,log}',
-                enabled: params.save_complexityfiltered_reads
-            ],
-            [
-                path: { "${params.outdir}/bbduk/" },
-                mode: params.publish_dir_mode,
-                pattern: '*.log'
-            ]
-        ]
-    }
-
-    withName: PRINSEQPLUSPLUS {
-        ext.args =  [
-                params.shortread_complexityfilter_prinseqplusplus_mode == 'dust' ? "-lc_dust=${params.shortread_complexityfilter_prinseqplusplus_dustscore}" : "-lc_entropy=${params.shortread_complexityfilter_entropy}",
-                "-trim_qual_left=0 -trim_qual_left=0 -trim_qual_window=0 -trim_qual_step=0",
-            ].join(' ').trim()
-        ext.prefix = { "${meta.id}_${meta.run_accession}" }
-        publishDir = [
-            [
-                path: { "${params.outdir}/prinseqplusplus/" },
-                mode: params.publish_dir_mode,
-                pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz}',
-                enabled: params.save_complexityfiltered_reads
-            ],
-            [
-                path: { "${params.outdir}/prinseqplusplus/" },
-                mode: params.publish_dir_mode,
-                pattern: '*.log'
-            ]
-        ]
-    }
-
-    withName: CAT_FASTQ {
+    withName: MERGE_RUNS {
        ext.prefix = { "${meta.id}" }
        publishDir = [
+            [
                path: { "${params.outdir}/run_merging/" },
                mode: params.publish_dir_mode,
                pattern: '*.fastq.gz',
                enabled: params.save_runmerged_reads
+            ],
+            [
+                path: { "${params.outdir}/analysis_ready_fastqs" },
+                mode: params.publish_dir_mode,
+                pattern: '*.fastq.gz',
+                enabled: params.perform_runmerging && params.save_analysis_ready_fastqs
+            ]
        ]
    }

--- a/docs/output.md
+++ b/docs/output.md
@ -22,6 +22,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [minimap2](#minimap2) - Host removal for Nanopore reads
 - [SAMtools stats](#samtools-stats) - Statistics from host removal
 - [SAMtools fastq](#samtools-fastq) - Converts unmapped BAM file to fastq format (minimap2 only)
+- [Analysis Ready Reads](#analysis-ready-reads) - Optional results directory containing the final processed reads used as input for classification/profiling.
 - [Bracken](#bracken) - Taxonomic classifier using k-mers and abundance estimations
 - [Kraken2](#kraken2) - Taxonomic classifier using exact k-mer matches
 - [KrakenUniq](#krakenuniq) - Taxonomic classifier that combines the k-mer-based classification and the number of unique k-mers found in each species
@ -102,7 +103,7 @@ You can change the default value for low complexity filtering by using the argum

 By default nf-core/taxprofiler will only provide the `.settings` file if AdapterRemoval is selected.

-You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. If this is selected, you may receive different combinations of `.fastq` files for each sample depending on the input types - e.g. whether you have merged or not, or if you're supplying both single- and paired-end reads.
+You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. If this is selected, you may receive different combinations of `.fastq` files for each sample depending on the input types - e.g. whether you have merged or not, or if you're supplying both single- and paired-end reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.

 > ⚠️ The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as complexity filtering, host removal, run merging etc..

@ -121,7 +122,7 @@ You will only find the `.fastq` files in the results directory if you provide `

 The output logs are saved in the output folder and are part of MultiQC report. You do not normally need to check these manually.

-You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`.
+You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.

 > ⚠️ We do **not** recommend using Porechop if you are already trimming the adapters with ONT's basecaller Guppy.

@ -140,7 +141,7 @@ It is used in nf-core/taxprofiler for complexity filtering using different algor

 </details>

-By default nf-core/taxprofiler will only provide the `.log` file if BBDuk is selected as the complexity filtering tool. You will only find the complexity filtered reads in your results directory if you provide ` --save_complexityfiltered_reads` .
+By default nf-core/taxprofiler will only provide the `.log` file if BBDuk is selected as the complexity filtering tool. You will only find the complexity filtered reads in your results directory if you provide ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.

 > ⚠️ The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc..

@ -159,7 +160,7 @@ It is used in nf-core/taxprofiler for complexity filtering using different algor

 </details>

-By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is selected as the complexity filtering tool. You will only find the complexity filtered `.fastq` files in your results directory if you supply ` --save_complexityfiltered_reads` .
+By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is selected as the complexity filtering tool. You will only find the complexity filtered `.fastq` files in your results directory if you supply ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.

 > ⚠️ The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc..

@ -176,7 +177,7 @@ By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is

 </details>

-You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`.
+You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.

 > ⚠️ We do **not** recommend using Filtlong if you are performing filtering of low quality reads with ONT's basecaller Guppy.

@ -199,7 +200,7 @@ It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) and/

 </details>

-By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify to save `--save_hostremoval_unmapped` - these contain only unmapped reads.
+By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify to save `--save_hostremoval_unmapped` - these contain only unmapped reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.

 > ℹ️ Unmapped reads in FASTQ are only found in this directory for short-reads, for long-reads see [`samtools/fastq/`](#samtools-fastq)

@ -242,10 +243,26 @@ By default, nf-core/taxprofiler will only provide the `.bam` file containing map

 </details>

-This directory will be present and contain the unmapped reads from the `.fastq` format from long-read minimap2 host removal, if `--save_hostremoval_unmapped` is supplied
+This directory will be present and contain the unmapped reads from the `.fastq` format from long-read minimap2 host removal, if `--save_hostremoval_unmapped` is supplied. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.

 > ℹ️ For short-read unmapped reads, see [bowtie2](#bowtie2).

+### Analysis Ready Reads
+
+> ℹ️ This optional results directory will only be present in the pipeline results when supplying `--save_analysis_ready_fastqs`.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `analysis_ready_fastqs`
+  - `<sample_id>*.{fq,fastq}.gz`: Final reads that underwent preprocessing and were sent for classification/profiling.
+
+</details>
+
+The results directory will contain the 'final' processed reads used as input for classification/profiling. It will _only_ include the output of the _last_ step of any combinations of preprocessing steps that may have been specified in the run configuration. For example, if you perform the read QC and host-removal preprocessing steps, the final reads that are sent to classification/profiling are the host-removed FASTQ files - those will be the ones present in this directory.
+
+> ⚠️ If you turn off all preprocessing steps, then no results will be present in this directory. This happens independently for short- and long-reads. I.e. you will only have FASTQ files for short reads in this directory if you skip all long-read preprocessing.
+
 ### SAMtools stats

 [SAMtools stats](http://www.htslib.org/doc/samtools-stats.html) collects statistics from a `.sam`, `.bam`, or `.cram` alignment file and outputs in a text format.
@ -264,7 +281,7 @@ In most cases you do not need to check this file, as it is rendered in the Multi

 nf-core/taxprofiler offers the option to merge FASTQ files of multiple sequencing runs or libraries that derive from the same sample, as specified in the input samplesheet.

-This is the last preprocessing step, so if you have multiple runs or libraries (and run merging turned on), this will represent the final reads that will go into classification/profiling steps.
+This is the last possible preprocessing step, so if you have multiple runs or libraries (and run merging turned on), this will represent the final reads that will go into classification/profiling steps.

 <details markdown="1">
 <summary>Output files</summary>
@ -276,7 +293,7 @@ This is the last preprocessing step, so if you have multiple runs or libraries (

 Note that you will only find samples that went through the run merging step in this directory. Samples that had a single run or library will not go through this step of the pipeline and thus will not be present in this directory.

-⚠️ You must make sure to turn on the saving of the reads from the previous preprocessing step you may have turned on, if you have single-run or library reads in your pipeline run, and wish to save the final reads that go into classification/profiling!
+This directory and its FASTQ files will only be present if you supply `--save_runmerged_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.

 ### Bracken

--- a/docs/usage.md
+++ b/docs/usage.md
@ -168,6 +168,8 @@ nf-core/taxprofiler offers four main preprocessing steps for preprocessing raw s
 - [**Host read-removal**](#host-read-removal): removal of reads aligning to reference genome(s) of a host.
 - [**Run merging**](#run-merging): concatenation of multiple FASTQ chunks/sequencing runs/libraries of a sample.

+> ℹ️ You can save the 'final' reads used for classification/profiling from any combination of these steps with `--save_analysis_ready_fastqs`.
+
 #### Read Processing

 Raw sequencing read processing in the form of adapter clipping and paired-end read merging can be activated via the `--perform_shortread_qc` or `--perform_longread_qc` flags.
--- a/nextflow.config
+++ b/nextflow.config
@ -105,6 +105,8 @@ params {
    save_hostremoval_bam                    = false
    save_hostremoval_unmapped              = false

+    // Publishing final reads going into profiling
+    save_analysis_ready_fastqs = false

    // MALT
    run_malt                   = false
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -69,6 +69,12 @@
                    "fa_icon": "fas fa-save",
                    "description": "Save reads from samples that went through the adapter clipping, pair-merging, and length filtering steps for both short and long reads",
                    "help_text": "This saves the FASTQ output from the following tools:\n\n- fastp\n- AdapterRemoval\n- Porechop\n- Filtlong\n\nThese reads will be a mixture of: adapter clipped, quality trimmed, pair-merged, and length filtered, depending on the parameters you set."
+                },
+                "save_analysis_ready_fastqs": {
+                    "type": "boolean",
+                    "fa_icon": "fas fa-save",
+                    "description": "Save only the final reads from all read processing steps (that are sent to classification/profiling) in results directory.",
+                    "help_text": "This flag will generate the directory `results/analysis_ready_fastqs` that contains the reads from the last preprocessing (QC, host removal, run merging etc.) step of the pipeline run. \n\nThis can be useful if you wish to re-use the final cleaned-up and prepared reads - the data actually used for the actual classification/profiling steps of the pipeline - for other analyses or purposes (e.g., to reduce redundant preprocessing between different pipelines, e.g. [nf-core/mag](https://nf-co.re/mag)).\n\nIn most cases this will be preferred over similar parameters e.g. ` --save_preprocessed_reads` or ` --save_complexityfiltered_reads`, unless you wish to explore in more detail the output of each specific preprocessing step independently.\n\nNote if you do no preprocessing of any kind, nothing will be present in this directory. "
                }
            },
            "fa_icon": "fas fa-users-cog"
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@ -9,35 +9,58 @@ workflow INPUT_CHECK {
    samplesheet // file: /path/to/samplesheet.csv

    main:
-    parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet )
+
+    // Table to list, group per sample, detect if sample has multi-run,
+    // then spread back to per-run rows but with multi-run info added to meta
+    ch_split_samplesheet = SAMPLESHEET_CHECK ( samplesheet )
        .csv
        .splitCsv ( header:true, sep:',' )
+        .map{
+            row ->
+                [ [ row.sample.toString() ], row ]
+            }
+        .groupTuple()
+        .map {
+            sample, rows ->
+                def is_multirun = rows.size() > 1
+            [ rows, is_multirun ]
+        }
+        .transpose(by: 0)
+        .map {
+            row, is_multirun ->
+                row['is_multirun'] = is_multirun
+            return row
+        }
+
+    // Split for context-dependent channel generation
+    ch_parsed_samplesheet = ch_split_samplesheet
        .branch { row ->
            fasta: row.fasta != ''
            nanopore: row.instrument_platform == 'OXFORD_NANOPORE'
            fastq: true
        }

-    fastq = parsed_samplesheet.fastq
+    // Channel generation
+    ch_fastq = ch_parsed_samplesheet.fastq
        .map { create_fastq_channel(it) }

-    nanopore = parsed_samplesheet.nanopore
+    ch_nanopore = ch_parsed_samplesheet.nanopore
        .map { create_fastq_channel(it) }

-    fasta = parsed_samplesheet.fasta
+    ch_fasta = ch_parsed_samplesheet.fasta
        .map { create_fasta_channel(it) }

    emit:
-    fastq = fastq ?: []                       // channel: [ val(meta), [ reads ] ]
-    nanopore = nanopore ?: []                 // channel: [ val(meta), [ reads ] ]
-    fasta = fasta ?: []                       // channel: [ val(meta), fasta ]
+    fastq    = ch_fastq ?: []                    // channel: [ val(meta), [ reads ] ]
+    nanopore = ch_nanopore ?: []                 // channel: [ val(meta), [ reads ] ]
+    fasta    = ch_fasta ?: []                    // channel: [ val(meta), fasta ]
    versions = SAMPLESHEET_CHECK.out.versions    // channel: [ versions.yml ]
 }

 // Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
 def create_fastq_channel(LinkedHashMap row) {
    // create meta map
-    def meta = row.subMap(['sample', 'run_accession', 'instrument_platform'])
+    def meta = row.subMap(['sample', 'run_accession', 'instrument_platform', 'is_multirun'])
    meta.id         = meta.sample
    meta.single_end = row.single_end.toBoolean()
    meta.is_fasta   = false
@ -66,6 +89,8 @@ def create_fastq_channel(LinkedHashMap row) {

 // Function to get list of [ meta, fasta ]
 def create_fasta_channel(LinkedHashMap row) {
+
+    // don't include multi-run information as we don't do FASTA run merging
    def meta        = row.subMap(['sample', 'run_accession', 'instrument_platform' ])
    meta.id         = meta.sample
    meta.single_end = true
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -92,7 +92,7 @@ include { FASTQC                      } from '../modules/nf-core/fastqc/main'
 include { FALCO                       } from '../modules/nf-core/falco/main'
 include { MULTIQC                     } from '../modules/nf-core/multiqc/main'
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
-include { CAT_FASTQ                   } from '../modules/nf-core/cat/fastq/main'
+include { CAT_FASTQ as MERGE_RUNS     } from '../modules/nf-core/cat/fastq/main'

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -122,6 +122,9 @@ workflow TAXPROFILER {
    )
    ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)

+    // TODO: save final FASTA reads if requested, as otherwise no processing occurs on FASTA (not yet implemented)
+
+
    DB_CHECK (
        ch_databases
    )
@ -211,7 +214,7 @@ workflow TAXPROFILER {
                skip: true
            }

-        ch_reads_runmerged = CAT_FASTQ ( ch_reads_for_cat_branch.cat ).reads
+        ch_reads_runmerged = MERGE_RUNS ( ch_reads_for_cat_branch.cat ).reads
            .mix( ch_reads_for_cat_branch.skip )
            .map {
                meta, reads ->
@ -219,7 +222,7 @@ workflow TAXPROFILER {
            }
            .mix( INPUT_CHECK.out.fasta )

-        ch_versions = ch_versions.mix(CAT_FASTQ.out.versions)
+        ch_versions = ch_versions.mix(MERGE_RUNS.out.versions)

    } else {
        ch_reads_runmerged = ch_shortreads_hostremoved