From 7321daf4f6f141efae218a34b2b69c482d1dfeed Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Fri, 18 Feb 2022 08:41:34 +0100
Subject: [PATCH 01/34] Update README with approximate pipeline outline

---
 README.md | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8f33f2c..f57666d 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@

 ## Introduction

-**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for Taxonomic profiling of shotgun metagenomic data.
+**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling against multiple taxonomic profilers and databases, and produces standardised output tables.

 The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!

@@ -29,7 +29,23 @@ On release, automated continuous integration tests run the pipeline on a full-si

 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
-2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
+2. Performs optional read pre-processing
+   - Adapter clipping and merging
+   - Low complexity filtering
+   - Host read removal
+   - Run merging
+3. Performs taxonomic profiling using a choice of:
+   - Kraken2
+   - MetaPhlAn3
+   - MALT
+   - DIAMOND
+   - Centrifuge
+   - Kaiju
+   - mOTUs
+4. Perform optional post-processing with:
+   - bracken
+5. Standardises output tables
+6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))

 ## Quick Start

From c68afbbf90c98e87716dc526d6c5a9631a341201 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Fri, 18 Feb 2022 08:45:06 +0100
Subject: [PATCH 02/34] Update nextflow.config

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index c2d51b5..186979c 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -121,7 +121,7 @@ if (!params.igenomes_ignore) {
 }

 // Export these variables to prevent local Python/R libraries from conflicting with those in the container
-// The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container.
+// The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container.
 // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable.
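// NOTE: the body of the `env` scope below is elided from this hunk's context. In the
// standard nf-core template it is assumed to export variables such as PYTHONNOUSERSITE,
// R_PROFILE_USER, R_ENVIRON_USER and JULIA_DEPOT_PATH into each task's environment.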
env { From f867c057a4e1f0c7f4fc23db44356ee120de261f Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 18 Feb 2022 11:53:13 +0100 Subject: [PATCH 03/34] add more columns to samplesheet --- bin/check_samplesheet.py | 138 +++++++++++++++++++++++++++++++++------ 1 file changed, 118 insertions(+), 20 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 41dd9aa..eb6a7dc 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -3,6 +3,7 @@ # TODO nf-core: Update the script to check the samplesheet # This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv +from distutils import extension import os import sys import errno @@ -10,7 +11,9 @@ import argparse def parse_args(args=None): - Description = "Reformat nf-core/taxprofiler samplesheet file and check its contents." + Description = ( + "Reformat nf-core/taxprofiler samplesheet file and check its contents." + ) Epilog = "Example usage: python check_samplesheet.py " parser = argparse.ArgumentParser(description=Description, epilog=Epilog) @@ -43,25 +46,62 @@ def check_samplesheet(file_in, file_out): """ This function checks that the samplesheet follows the following structure: - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, + sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta + 2611,ERR5766174,ILLUMINA,NA,NA,FA_EXTENSIONSERX5474930_ERR5766174_1.fa.gz + 2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,NA + 2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,NA,NA + 2613,ERR5766181,ILLUMINA,ERX5474930_ERR5766174_1.fa.gz,ERX5474930_ERR5766174_2.fa.gz,NA For an example see: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv """ + FQ_EXTENSIONS = (".fq", ".fq.gz", ".fastq", ".fastq.gz") + FA_EXTENSIONS = ( + ".fa", + ".fa.gz", + ".fasta", + ".fasta.gz", + ".fna", + ".fna.gz", + ".fas", + ".fas.gz", + ) + INSTRUMENT_PLATFORMS = [ + "ABI_SOLID", + "BGISEQ", + "CAPILLARY", + "COMPLETE_GENOMICS", + "DNBSEQ", + "HELICOS", + "ILLUMINA", + "ION_TORRENT", + "LS454", + "OXFORD_NANOPORE", + "PACBIO_SMRT", + ] + sample_mapping_dict = {} with open(file_in, "r") as fin: ## Check header - MIN_COLS = 2 + MIN_COLS = 4 # TODO nf-core: Update the column names for the input samplesheet - HEADER = ["sample", "fastq_1", "fastq_2"] + HEADER = [ + "sample", + "run_accession", + "instrument_platform", + "fastq_1", + "fastq_2", + "fasta", + ] header = [x.strip('"') for x in fin.readline().strip().split(",")] if header[: len(HEADER)] != HEADER: - print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER))) + print( + "ERROR: Please check samplesheet header -> {} != {}".format( + ",".join(header), ",".join(HEADER) + ) + ) sys.exit(1) ## Check sample entries @@ -78,13 +118,22 @@ def check_samplesheet(file_in, file_out): num_cols = len([x for x in lspl if x]) if num_cols < MIN_COLS: print_error( - "Invalid number of populated columns (minimum = {})!".format(MIN_COLS), + "Invalid number of populated columns (minimum = {})!".format( + MIN_COLS + ), "Line", line, ) ## Check sample name entries - sample, fastq_1, fastq_2 = lspl[: len(HEADER)] + ( + sample, + run_accession, + instrument_platform, + fastq_1, + fastq_2, + fasta, + ) = 
lspl[: len(HEADER)] sample = sample.replace(" ", "_") if not sample: print_error("Sample entry has not been specified!", "Line", line) @@ -94,23 +143,55 @@ def check_samplesheet(file_in, file_out): if fastq: if fastq.find(" ") != -1: print_error("FastQ file contains spaces!", "Line", line) - if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"): + if not fastq.endswith(FQ_EXTENSIONS): print_error( - "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", + f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !", "Line", line, ) + if fasta: + if fasta.find(" ") != -1: + print_error("FastA file contains spaces!", "Line", line) + if not fasta.endswith(FA_EXTENSIONS): + print_error( + f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!", + "Line", + line, + ) + sample_info = [] + + # Check run_accession + if not run_accession: + print_error("Run accession has not been specified!", "Line", line) + else: + sample_info.append(run_accession) + + # Check instrument_platform + if not instrument_platform: + print_error("Instrument platform has not been specified!", "Line", line) + else: + if instrument_platform not in INSTRUMENT_PLATFORMS: + print_error( + f"Instrument platform {instrument_platform} is not supported!", + f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}", + "Line", + line, + ) + sample_info.append(instrument_platform) ## Auto-detect paired-end/single-end - sample_info = [] ## [single_end, fastq_1, fastq_2] if sample and fastq_1 and fastq_2: ## Paired-end short reads - sample_info = ["0", fastq_1, fastq_2] + sample_info.extend(["0", fastq_1, fastq_2, fasta]) elif sample and fastq_1 and not fastq_2: ## Single-end short reads - sample_info = ["1", fastq_1, fastq_2] + sample_info.extend(["1", fastq_1, fastq_2, fasta]) + elif ( + sample and fasta and not fastq_1 and not fastq_2 + ): ## Single-end long reads + sample_info.extend(["1", fastq_1, fastq_2, fasta]) else: print_error("Invalid combination of columns provided!", "Line", line) - ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] } + ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 , fasta, run_accession, instrument_platform] } if sample not in sample_mapping_dict: sample_mapping_dict[sample] = [sample_info] else: @@ -120,19 +201,36 @@ def check_samplesheet(file_in, file_out): sample_mapping_dict[sample].append(sample_info) ## Write validated samplesheet with appropriate columns + HEADER_OUT = [ + "sample", + "run_accession", + "instrument_platform", + "single_end", + "fastq_1", + "fastq_2", + "fasta", + ] if len(sample_mapping_dict) > 0: out_dir = os.path.dirname(file_out) make_dir(out_dir) with open(file_out, "w") as fout: - fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n") + fout.write(",".join(HEADER_OUT) + "\n") for sample in sorted(sample_mapping_dict.keys()): ## Check that multiple runs of the same sample are of the same datatype - if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]): - print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample)) + if not all( + x[0] == sample_mapping_dict[sample][0][0] + for x in sample_mapping_dict[sample] + ): + print_error( + "Multiple runs of a sample must be of the same datatype!", + "Sample: {}".format(sample), + ) for idx, val in enumerate(sample_mapping_dict[sample]): - fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n") + 
fout.write( + ",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n" + ) else: print_error("No entries to process!", "Samplesheet: {}".format(file_in)) From 54a1a4fd459afa7e9987c72e5e1d4b7ac67683d0 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 18 Feb 2022 13:11:18 +0100 Subject: [PATCH 04/34] update samplesheet specs --- bin/check_samplesheet.py | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index eb6a7dc..77a5107 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -47,13 +47,10 @@ def check_samplesheet(file_in, file_out): This function checks that the samplesheet follows the following structure: sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta - 2611,ERR5766174,ILLUMINA,NA,NA,FA_EXTENSIONSERX5474930_ERR5766174_1.fa.gz - 2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,NA - 2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,NA,NA - 2613,ERR5766181,ILLUMINA,ERX5474930_ERR5766174_1.fa.gz,ERX5474930_ERR5766174_2.fa.gz,NA - - For an example see: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv + 2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz + 2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz, + 2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,, + 2613,ERR5766181,ILLUMINA,ERX5474937_ERR5766181_1.fastq.gz,ERX5474937_ERR5766181_2.fastq.gz, """ FQ_EXTENSIONS = (".fq", ".fq.gz", ".fastq", ".fastq.gz") @@ -216,21 +213,9 @@ def check_samplesheet(file_in, file_out): with open(file_out, "w") as fout: fout.write(",".join(HEADER_OUT) + "\n") for sample in sorted(sample_mapping_dict.keys()): - - ## Check that multiple runs of the same sample are of the same datatype - if not all( - x[0] == sample_mapping_dict[sample][0][0] - for x in sample_mapping_dict[sample] - ): - print_error( - "Multiple runs of a sample must be of the same datatype!", - "Sample: {}".format(sample), - ) - for idx, val in enumerate(sample_mapping_dict[sample]): - fout.write( - ",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n" - ) + fout.write(f"{sample},{','.join(val)}\n") + # fout.write(f",".join(["{}".format(sample)] + val) + "\n") else: print_error("No entries to process!", "Samplesheet: {}".format(file_in)) From a6cfa0a1ba3b9c0ad2263d8225bcbc05117f4bc3 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 18 Feb 2022 13:15:30 +0100 Subject: [PATCH 05/34] cleanup --- bin/check_samplesheet.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 77a5107..0f19523 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -1,8 +1,5 @@ #!/usr/bin/env python -# TODO nf-core: Update the script to check the samplesheet -# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - from distutils import extension import os import sys @@ -83,7 +80,6 @@ def check_samplesheet(file_in, file_out): ## Check header MIN_COLS = 4 - # TODO nf-core: Update the column names for the input samplesheet HEADER = [ "sample", "run_accession", @@ -188,7 +184,7 @@ def check_samplesheet(file_in, file_out): else: print_error("Invalid combination of columns provided!", "Line", line) - ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, 
fastq_2 , fasta, run_accession, instrument_platform] } + ## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, single_end, fastq_1, fastq_2 , fasta ] } if sample not in sample_mapping_dict: sample_mapping_dict[sample] = [sample_info] else: @@ -215,7 +211,6 @@ def check_samplesheet(file_in, file_out): for sample in sorted(sample_mapping_dict.keys()): for idx, val in enumerate(sample_mapping_dict[sample]): fout.write(f"{sample},{','.join(val)}\n") - # fout.write(f",".join(["{}".format(sample)] + val) + "\n") else: print_error("No entries to process!", "Samplesheet: {}".format(file_in)) From 1b893cb039fee4544cef4d716668ef62c8551d17 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 18 Feb 2022 13:27:10 +0100 Subject: [PATCH 06/34] add check for fastq with fasta --- bin/check_samplesheet.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 0f19523..d6e7123 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -181,6 +181,12 @@ def check_samplesheet(file_in, file_out): sample and fasta and not fastq_1 and not fastq_2 ): ## Single-end long reads sample_info.extend(["1", fastq_1, fastq_2, fasta]) + elif fasta and (fastq_1 or fastq_2): + print_error( + "FastQ and FastA files cannot be specified together in the same library!", + "Line", + line, + ) else: print_error("Invalid combination of columns provided!", "Line", line) From cf55cc592cf41868f8cd480f53116525ab08f960 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 18 Feb 2022 16:51:01 +0100 Subject: [PATCH 07/34] Get skeleton read processing to input for profiling --- README.md | 6 +- conf/modules.config | 39 +++++++++++ conf/test.config | 4 +- lib/WorkflowTaxprofiler.groovy | 9 +-- modules.json | 6 ++ modules/nf-core/modules/cat/fastq/main.nf | 51 ++++++++++++++ modules/nf-core/modules/cat/fastq/meta.yml | 39 +++++++++++ modules/nf-core/modules/fastp/main.nf | 75 ++++++++++++++++++++ modules/nf-core/modules/fastp/meta.yml | 68 ++++++++++++++++++ nextflow.config | 5 +- nextflow_schema.json | 9 --- subworkflows/local/input_check.nf | 43 ++++++++++-- workflows/taxprofiler.nf | 80 +++++++++++++++++++++- 13 files changed, 407 insertions(+), 27 deletions(-) create mode 100644 modules/nf-core/modules/cat/fastq/main.nf create mode 100644 modules/nf-core/modules/cat/fastq/meta.yml create mode 100644 modules/nf-core/modules/fastp/main.nf create mode 100644 modules/nf-core/modules/fastp/meta.yml diff --git a/README.md b/README.md index f57666d..5f00709 100644 --- a/README.md +++ b/README.md @@ -42,9 +42,9 @@ On release, automated continuous integration tests run the pipeline on a full-si - Centrifuge - Kaiju - mOTUs -4. Perform optional post-processing with: - - bracken -5. Standardises output tables +4. Perform optional post-processing with: + - bracken +5. Standardises output tables 6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) ## Quick Start diff --git a/conf/modules.config b/conf/modules.config index a0506a4..2d533e9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -28,8 +28,47 @@ process { withName: FASTQC { ext.args = '--quiet' + ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } + publishDir = [ + path: { "${params.outdir}/fastqc/raw" }, + mode: 'copy', + pattern: '*.html' + ] } + withName: FASTP { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + // TODO also include option to NOT merge + ext.args = [ + { ${meta.single_end} } == 0 ? "-m" : '', + params.fastp_exclude_unmerged ? 
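// Clarifying note on the fastp flags assembled above: `-m` enables merging of
// overlapping read pairs, while `--include_unmerged` additionally writes pairs that
// could not be merged into the merged-output FASTQ; excluding it (the default here)
// keeps only truly merged reads.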
'' : "--include_unmerged" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/fastp" }, + mode: 'copy', + pattern: '*.fastq.gz' + ] + } + + withName: FASTQC_POST { + ext.args = '--quiet' + ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } + publishDir = [ + path: { "${params.outdir}/fastqc/processed" }, + mode: 'copy', + pattern: '*.html' + ] + } + + withName: CAT_FASTQ { + publishDir = [ + path: { "${params.outdir}/prepared_sequences" }, + mode: 'copy', + pattern: '*.fastq.gz' + ] + } + + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/conf/test.config b/conf/test.config index 45f87af..5db566a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,8 +22,6 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - // Genome references - genome = 'R64-1-1' } diff --git a/lib/WorkflowTaxprofiler.groovy b/lib/WorkflowTaxprofiler.groovy index 53482ac..57a95e3 100755 --- a/lib/WorkflowTaxprofiler.groovy +++ b/lib/WorkflowTaxprofiler.groovy @@ -10,10 +10,11 @@ class WorkflowTaxprofiler { public static void initialise(params, log) { genomeExistsError(params, log) - if (!params.fasta) { - log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." - System.exit(1) - } + // TODO update as necessary + //if (!params.fasta) { + // log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." + // System.exit(1) + //} } // diff --git a/modules.json b/modules.json index 939b85f..6cf2b3e 100644 --- a/modules.json +++ b/modules.json @@ -3,9 +3,15 @@ "homePage": "https://github.com/nf-core/taxprofiler", "repos": { "nf-core/modules": { + "cat/fastq": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + }, "custom/dumpsoftwareversions": { "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" }, + "fastp": { + "git_sha": "d0a1cbb703a130c19f6796c3fce24fbe7dfce789" + }, "fastqc": { "git_sha": "9d0cad583b9a71a6509b754fdf589cbfbed08961" }, diff --git a/modules/nf-core/modules/cat/fastq/main.nf b/modules/nf-core/modules/cat/fastq/main.nf new file mode 100644 index 0000000..bf0877c --- /dev/null +++ b/modules/nf-core/modules/cat/fastq/main.nf @@ -0,0 +1,51 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' : + 'biocontainers/biocontainers:v1.2.0_cv1' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads.collect{ it.toString() } + if (meta.single_end) { + if (readList.size > 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } +} diff --git a/modules/nf-core/modules/cat/fastq/meta.yml b/modules/nf-core/modules/cat/fastq/meta.yml new file mode 100644 index 0000000..c836598 --- /dev/null +++ b/modules/nf-core/modules/cat/fastq/meta.yml @@ -0,0 +1,39 @@ +name: cat_fastq +description: Concatenates fastq files +keywords: + - fastq + - concatenate +tools: + - cat: + description: | + The cat utility reads files sequentially, writing them to the standard output. + documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: list + description: | + List of input FastQ files to be concatenated. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/fastp/main.nf b/modules/nf-core/modules/fastp/main.nf new file mode 100644 index 0000000..5c9e3b8 --- /dev/null +++ b/modules/nf-core/modules/fastp/main.nf @@ -0,0 +1,75 @@ +process FASTP { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? 'bioconda::fastp=0.23.2' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/fastp:0.23.2--h79da9fb_0' : + 'quay.io/biocontainers/fastp:0.23.2--h79da9fb_0' }" + + input: + tuple val(meta), path(reads) + val save_trimmed_fail + val save_merged + + output: + tuple val(meta), path('*.trim.fastq.gz') , optional:true, emit: reads + tuple val(meta), path('*.json') , emit: json + tuple val(meta), path('*.html') , emit: html + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail + tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + // Added soft-links to original fastqs for consistent naming in MultiQC + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + def fail_fastq = save_trimmed_fail ? "--failed_out ${prefix}.fail.fastq.gz" : '' + """ + [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz + fastp \\ + --in1 ${prefix}.fastq.gz \\ + --out1 ${prefix}.trim.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else { + def fail_fastq = save_trimmed_fail ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : '' + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + fastp \\ + --in1 ${prefix}_1.fastq.gz \\ + --in2 ${prefix}_2.fastq.gz \\ + --out1 ${prefix}_1.trim.fastq.gz \\ + --out2 ${prefix}_2.trim.fastq.gz \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $fail_fastq \\ + $merge_fastq \\ + --thread $task.cpus \\ + --detect_adapter_for_pe \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/modules/fastp/meta.yml b/modules/nf-core/modules/fastp/meta.yml new file mode 100644 index 0000000..3274e41 --- /dev/null +++ b/modules/nf-core/modules/fastp/meta.yml @@ -0,0 +1,68 @@ +name: fastp +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - quality control + - fastq +tools: + - fastp: + description: | + A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance. + documentation: https://github.com/OpenGene/fastp + doi: https://doi.org/10.1093/bioinformatics/bty560 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. 
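# Note: this pipeline's preprocessing subworkflow calls FASTP_PAIRED with
# save_merged = true, so merged pairs are emitted via the reads_merged channel
# described below (see subworkflows/local/preprocessing.nf later in this series).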
+  - save_trimmed_fail:
+      type: boolean
+      description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz`
+  - save_merged:
+      type: boolean
+      description: Specify true to save all merged reads to a file ending in `*.merged.fastq.gz`
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: The trimmed/modified/unmerged fastq reads
+      pattern: "*trim.fastq.gz"
+  - json:
+      type: file
+      description: Results in JSON format
+      pattern: "*.json"
+  - html:
+      type: file
+      description: Results in HTML format
+      pattern: "*.html"
+  - log:
+      type: file
+      description: fastp log file
+      pattern: "*.log"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - reads_fail:
+      type: file
+      description: Reads that failed the preprocessing
+      pattern: "*fail.fastq.gz"
+  - reads_merged:
+      type: file
+      description: Reads that were successfully merged
+      pattern: "*.{merged.fastq.gz}"
+authors:
+  - "@drpatelh"
+  - "@kevinmenden"
diff --git a/nextflow.config b/nextflow.config
index 186979c..160dba7 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -33,7 +33,7 @@ params {
     help                 = false
     validate_params      = true
     show_hidden_params   = false
-    schema_ignore_params = 'genomes'
+    schema_ignore_params = 'genomes,fasta'
     enable_conda         = false

     // Config options
@@ -50,6 +50,9 @@
     max_cpus   = 16
     max_time   = '240.h'

+    // FASTQ preprocessing
+    fastp_clip_merge = false
+    fastp_exclude_unmerged = true
 }
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 535e08d..cc43add 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -56,15 +56,6 @@
             "fa_icon": "fas fa-book",
             "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details."
         },
-        "fasta": {
-            "type": "string",
-            "format": "file-path",
-            "mimetype": "text/plain",
-            "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$",
-            "description": "Path to FASTA genome file.",
-            "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically.
Combine with `--save_reference` to save BWA index for future runs.", - "fa_icon": "far fa-file-code" - }, "igenomes_base": { "type": "string", "format": "directory-path", diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index cddcbb3..241a8a7 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -9,22 +9,38 @@ workflow INPUT_CHECK { samplesheet // file: /path/to/samplesheet.csv main: - SAMPLESHEET_CHECK ( samplesheet ) + parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) + .dump(tag: "split_csv_out") + .branch { + fasta: it['fasta'] != '' + fastq: true + } + + parsed_samplesheet.fastq .map { create_fastq_channels(it) } - .set { reads } + .dump(tag: "fastq_channel_init") + .set { fastq } + + parsed_samplesheet.fasta + .map { create_fasta_channels(it) } + .dump(tag: "fasta_channel_init") + .set { fasta } emit: - reads // channel: [ val(meta), [ reads ] ] + fastq // channel: [ val(meta), [ reads ] ] + fasta // channel: [ val(meta), fasta ] versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] } // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] def create_fastq_channels(LinkedHashMap row) { def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() + meta.id = row.sample + meta.run_accession = row.run_accession + meta.instrument_platform = row.instrument_platform + meta.single_end = row.single_end.toBoolean() def array = [] if (!file(row.fastq_1).exists()) { @@ -40,3 +56,20 @@ def create_fastq_channels(LinkedHashMap row) { } return array } + +// Function to get list of [ meta, fasta ] +def create_fasta_channels(LinkedHashMap row) { + def meta = [:] + meta.id = row.sample + meta.run_accession = row.run_accession + meta.instrument_platform = row.instrument_platform + meta.single_end = true + + def array = [] + if (!file(row.fasta).exists()) { + exit 1, "ERROR: Please check input samplesheet -> FastA file does not exist!\n${row.fasta}" + } + array = [ meta, [ file(row.fasta) ] ] + + return array +} diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 56c532b..3f10a9d 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -11,7 +11,7 @@ WorkflowTaxprofiler.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ] +def checkPathParamList = [ params.input, params.multiqc_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters @@ -50,6 +50,11 @@ include { FASTQC } from '../modules/nf-core/modules/fastqc/ include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' +include { FASTP as FASTP_SINGLE } from '../modules/nf-core/modules/fastp/main' +include { FASTP as FASTP_PAIRED } from '../modules/nf-core/modules/fastp/main' +include { FASTQC as FASTQC_POST } from '../modules/nf-core/modules/fastqc/main' +include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' + /* ======================================================================================== RUN MAIN WORKFLOW @@ -75,7 +80,7 @@ workflow TAXPROFILER { // MODULE: Run FastQC // FASTQC ( - INPUT_CHECK.out.reads + INPUT_CHECK.out.fastq ) ch_versions = 
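// Each nf-core module emits a versions.yml; mixing these into ch_versions lets
// CUSTOM_DUMPSOFTWAREVERSIONS collate them into a single report (.first() suffices,
// as every task of a process reports identical tool versions).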
ch_versions.mix(FASTQC.out.versions.first()) @@ -83,6 +88,71 @@ workflow TAXPROFILER { ch_versions.unique().collectFile(name: 'collated_versions.yml') ) + // + // MODULE: Run Clip/Merge/Complexity + // + // TODO give option to clip only and retain pairs + // TODO give option to retain singletons (probably fastp option likely) + // TODO move to subworkflow + if ( params.fastp_clip_merge ) { + + ch_input_for_fastp = INPUT_CHECK.out.fastq + .dump(tag: "pre-fastp_branch") + .branch{ + single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + ch_input_for_fastp.single.dump(tag: "input_fastp_single") + ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") + + FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + FASTP_PAIRED ( ch_input_for_fastp.paired, false, true ) + + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged + .mix( FASTP_SINGLE.out.reads ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + + FASTQC_POST ( ch_fastp_reads_prepped ) + + ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) + ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) + + ch_processed_reads = ch_fastp_reads_prepped + + } else { + ch_processed_reads = INPUT_CHECK.out.fastq + } + + + // MODULE: Cat merge runs of same sample + ch_processed_for_combine = ch_processed_reads + .dump(tag: "prep_for_combine_grouping") + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['run_accession'] = 'combined' + [ meta_new, reads ] + } + .groupTuple ( by: 0 ) + .branch{ + combine: it[1].size() >= 2 + skip: it[1].size() < 2 + } + + CAT_FASTQ ( ch_processed_for_combine.combine ) + + // Ready for profiling! + ch_reads_for_profiling = ch_processed_for_combine.skip + .dump(tag: "skip_combine") + .mix( CAT_FASTQ.out.reads ) + .dump(tag: "files_for_profiling") + // // MODULE: MultiQC // @@ -95,6 +165,12 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + if (params.fastp_clip_merge) { + ch_multiqc_files = ch_multiqc_files.mix(FASTP_SINGLE.out.json.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTP_PAIRED.out.json.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_POST.out.zip.collect{it[1]}.ifEmpty([])) + } + MULTIQC ( ch_multiqc_files.collect() From 278f5605ca831338384b5045e34f500f0b5ca1a2 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 19 Feb 2022 12:36:08 +0100 Subject: [PATCH 08/34] Added database preparation and final channel for profiling --- modules/local/database_check.nf | 25 ++++++++++ nextflow.config | 3 ++ subworkflows/local/db_check.nf | 40 ++++++++++++++++ subworkflows/local/input_check.nf | 2 +- subworkflows/local/preprocessing.nf | 73 +++++++++++++++++++++++++++++ workflows/taxprofiler.nf | 65 ++++++++----------------- 6 files changed, 161 insertions(+), 47 deletions(-) create mode 100644 modules/local/database_check.nf create mode 100644 subworkflows/local/db_check.nf create mode 100644 subworkflows/local/preprocessing.nf diff --git a/modules/local/database_check.nf b/modules/local/database_check.nf new file mode 100644 index 0000000..4da4313 --- /dev/null +++ b/modules/local/database_check.nf @@ -0,0 +1,25 @@ +process DATABASE_CHECK { + tag "$databasesheet" + + 
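// The databases sheet is currently passed through unvalidated (see the TODO in
// subworkflows/local/db_check.nf). A sketch of the expected CSV, with columns
// inferred from create_db_channels() and purely illustrative values:
//
//   tool,db_name,db_params,db_path
//   malt,malt90,"-id 90",/path/to/malt90/
//   kraken2,k2_standard,,/path/to/kraken2_db/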
conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/python:3.8.3' :
+        'quay.io/biocontainers/python:3.8.3' }"
+
+    input:
+    path databasesheet
+
+    output:
+    path '*.csv'       , emit: csv
+    path "versions.yml", emit: versions
+
+    script: // Placeholder: no bundled script yet; the sheet is passed through as-is
+    """
+    cat $databasesheet >> database_sheet.valid.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+}
diff --git a/nextflow.config b/nextflow.config
index 160dba7..60b5a42 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -50,6 +50,9 @@
     max_cpus   = 16
     max_time   = '240.h'

+    // Databases
+    databases = null
+
     // FASTQ preprocessing
     fastp_clip_merge = false
     fastp_exclude_unmerged = true
diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
new file mode 100644
index 0000000..909d98f
--- /dev/null
+++ b/subworkflows/local/db_check.nf
@@ -0,0 +1,40 @@
+//
+// Check input database sheet and get database channels
+//
+
+include { DATABASE_CHECK } from '../../modules/local/database_check'
+
+workflow DB_CHECK {
+    take:
+    dbsheet // file: /path/to/dbsheet.csv
+
+    main:
+
+    // TODO: make database sheet check
+    parsed_samplesheet = DATABASE_CHECK ( dbsheet )
+        .csv
+        .splitCsv ( header:true, sep:',' )
+        .dump(tag: "db_split_csv_out")
+        .map { create_db_channels(it) }
+        .dump(tag: "db_channel_prepped")
+        .set{ dbs }
+
+    emit:
+    dbs      // channel: [ val(meta), [ db ] ]
+    versions = DATABASE_CHECK.out.versions // channel: [ versions.yml ]
+}
+
+def create_db_channels(LinkedHashMap row) {
+    def meta = [:]
+    meta.tool      = row.tool
+    meta.db_name   = row.db_name
+    meta.db_params = row.db_params
+
+    def array = []
+    if (!file(row.db_path, type: 'dir').exists()) {
+        exit 1, "ERROR: Please check database sheet -> database could not be found!\n${row.db_path}"
+    }
+    array = [ meta, file(row.db_path) ]
+
+    return array
+}
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 241a8a7..8497faa 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -12,7 +12,7 @@ workflow INPUT_CHECK {
     parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet )
         .csv
         .splitCsv ( header:true, sep:',' )
-        .dump(tag: "split_csv_out")
+        .dump(tag: "input_split_csv_out")
         .branch {
             fasta: it['fasta'] != ''
             fastq: true
         }
diff --git a/subworkflows/local/preprocessing.nf b/subworkflows/local/preprocessing.nf
new file mode 100644
index 0000000..5832824
--- /dev/null
+++ b/subworkflows/local/preprocessing.nf
@@ -0,0 +1,73 @@
+//
+// Perform raw-read preprocessing (adapter clipping, merging, etc.)
+//
+
+
+include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main'
+include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main'
+include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'
+
+workflow FASTQ_PREPROCESSING {
+    take:
+    reads // channel: [ val(meta), [ reads ] ]
+
+    main:
+    ch_versions      = Channel.empty()
+    ch_multiqc_files = Channel.empty()
+
+    //
+    // STEP: Read clipping and merging
+    //
+    // TODO give option to clip only and retain pairs
+    // TODO give option to retain singletons (probably fastp option likely)
+    // TODO move to subworkflow
+
+
+    if ( params.fastp_clip_merge ) {
+
+        ch_input_for_fastp = reads
+            .dump(tag: "pre-fastp_branch")
+            .branch{
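// branch() routes each element into the first named sub-channel whose predicate
// matches: reads are split on the meta single_end flag so that single-end and
// paired-end libraries can be sent to separate fastp invocations below.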
single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + ch_input_for_fastp.single.dump(tag: "input_fastp_single") + ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") + + FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) + FASTP_PAIRED ( ch_input_for_fastp.paired, false, true ) + + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged + .mix( FASTP_SINGLE.out.reads ) + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + + FASTQC_POST ( ch_fastp_reads_prepped ) + + ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) + ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) + + ch_processed_reads = ch_fastp_reads_prepped + + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} ) + + ch_multiqc_files.dump(tag: "preprocessing_mqc_final") + + } else { + ch_processed_reads = reads + } + + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 3f10a9d..4a356a5 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -11,11 +11,12 @@ WorkflowTaxprofiler.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config ] +def checkPathParamList = [ params.input, params.databases, params.multiqc_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters -if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } /* ======================================================================================== @@ -35,7 +36,11 @@ ch_multiqc_custom_config = params.multiqc_config ? 
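// Optional user-supplied MultiQC config; the alternative branch of this ternary is
// assumed (per the standard nf-core template) to fall back to an empty channel.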
Channel.fromPath(params.multi // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { INPUT_CHECK } from '../subworkflows/local/input_check' + +include { DB_CHECK } from '../subworkflows/local/db_check' +include { FASTQ_PREPROCESSING } from '../subworkflows/local/preprocessing' + /* ======================================================================================== @@ -50,9 +55,6 @@ include { FASTQC } from '../modules/nf-core/modules/fastqc/ include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' -include { FASTP as FASTP_SINGLE } from '../modules/nf-core/modules/fastp/main' -include { FASTP as FASTP_PAIRED } from '../modules/nf-core/modules/fastp/main' -include { FASTQC as FASTQC_POST } from '../modules/nf-core/modules/fastqc/main' include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' /* @@ -76,6 +78,10 @@ workflow TAXPROFILER { ) ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) + DB_CHECK ( + ch_databases + ) + // // MODULE: Run FastQC // @@ -91,47 +97,12 @@ workflow TAXPROFILER { // // MODULE: Run Clip/Merge/Complexity // - // TODO give option to clip only and retain pairs - // TODO give option to retain singletons (probably fastp option likely) - // TODO move to subworkflow if ( params.fastp_clip_merge ) { - - ch_input_for_fastp = INPUT_CHECK.out.fastq - .dump(tag: "pre-fastp_branch") - .branch{ - single: it[0]['single_end'] == true - paired: it[0]['single_end'] == false - } - - ch_input_for_fastp.single.dump(tag: "input_fastp_single") - ch_input_for_fastp.paired.dump(tag: "input_fastp_paired") - - FASTP_SINGLE ( ch_input_for_fastp.single, false, false ) - FASTP_PAIRED ( ch_input_for_fastp.paired, false, true ) - - ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged - .mix( FASTP_SINGLE.out.reads ) - .map { - meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = 1 - [ meta_new, reads ] - } - - FASTQC_POST ( ch_fastp_reads_prepped ) - - ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) - ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) - - ch_processed_reads = ch_fastp_reads_prepped - - } else { - ch_processed_reads = INPUT_CHECK.out.fastq + FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq ) } - // MODULE: Cat merge runs of same sample - ch_processed_for_combine = ch_processed_reads + ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads .dump(tag: "prep_for_combine_grouping") .map { meta, reads -> @@ -153,6 +124,10 @@ workflow TAXPROFILER { .mix( CAT_FASTQ.out.reads ) .dump(tag: "files_for_profiling") + // Combine reads with possible databases + + ch_reads_for_profiling.combine(DB_CHECK.out.dbs).dump(tag: "reads_plus_db") + // // MODULE: MultiQC // @@ -166,9 +141,7 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) if (params.fastp_clip_merge) { - ch_multiqc_files = ch_multiqc_files.mix(FASTP_SINGLE.out.json.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTP_PAIRED.out.json.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC_POST.out.zip.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc) } From 2c183ed2edc61ab058580cb63441917d516eb564 Mon Sep 17 00:00:00 2001 
From: James Fellows Yates
Date: Thu, 3 Mar 2022 17:42:02 +0100
Subject: [PATCH 09/34] Add Kraken2 and MALT/run as Proof of Concept (currently MQC issue)

---
 conf/modules.config                          | 20 ++++
 modules.json                                 |  6 ++
 .../nf-core/modules/kraken2/kraken2/main.nf  | 49 +++++++++++
 .../nf-core/modules/kraken2/kraken2/meta.yml | 60 ++++++++++++++
 modules/nf-core/modules/malt/run/main.nf     | 49 +++++++++++
 modules/nf-core/modules/malt/run/meta.yml    | 58 ++++++++++++++
 nextflow.config                              | 11 ++-
 workflows/taxprofiler.nf                     | 69 +++++++++++++++---
 8 files changed, 315 insertions(+), 7 deletions(-)
 create mode 100644 modules/nf-core/modules/kraken2/kraken2/main.nf
 create mode 100644 modules/nf-core/modules/kraken2/kraken2/meta.yml
 create mode 100644 modules/nf-core/modules/malt/run/main.nf
 create mode 100644 modules/nf-core/modules/malt/run/meta.yml

diff --git a/conf/modules.config b/conf/modules.config
index 2d533e9..dbc926c 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -68,6 +68,26 @@
         ]
     }

+    withName: MALT_RUN {
+        publishDir = [
+            path: { "${params.outdir}/malt/${meta.db_name}" },
+            mode: 'copy',
+            pattern: '*.{rma6,tab,text,sam,log}'
+        ]
+        ext.args = { "${meta.db_params}" }
+        ext.when = params.run_malt
+    }
+
+    withName: KRAKEN2_KRAKEN2 {
+        publishDir = [
+            path: { "${params.outdir}/kraken2/${meta.db_name}" },
+            mode: 'copy',
+            pattern: '*.{fastq.gz,txt}'
+        ]
+        ext.args = { "${meta.db_params}" }
+        ext.when = params.run_kraken2
+        ext.prefix = { "${meta.id}-${meta.db_name}" }
+    }

    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
        publishDir = [
            path: { "${params.outdir}/pipeline_info" },
diff --git a/modules.json b/modules.json
index 6cf2b3e..844b33a 100644
--- a/modules.json
+++ b/modules.json
@@ -15,6 +15,12 @@
         "fastqc": {
             "git_sha": "9d0cad583b9a71a6509b754fdf589cbfbed08961"
         },
+        "kraken2/kraken2": {
+            "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
+        },
+        "malt/run": {
+            "git_sha": "76cdd46f3f8a77fb5023fb5a39c4ab99925b8b56"
+        },
         "multiqc": {
             "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41"
         }
diff --git a/modules/nf-core/modules/kraken2/kraken2/main.nf b/modules/nf-core/modules/kraken2/kraken2/main.nf
new file mode 100644
index 0000000..3ec5df5
--- /dev/null
+++ b/modules/nf-core/modules/kraken2/kraken2/main.nf
@@ -0,0 +1,49 @@
+process KRAKEN2_KRAKEN2 {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda (params.enable_conda ? 'bioconda::kraken2=2.1.2 conda-forge::pigz=2.6' : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' :
+        'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }"
+
+    input:
+    tuple val(meta), path(reads)
+    path db
+
+    output:
+    tuple val(meta), path('*classified*')  , emit: classified
+    tuple val(meta), path('*unclassified*'), emit: unclassified
+    tuple val(meta), path('*report.txt')   , emit: txt
+    path "versions.yml"                    , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def paired       = meta.single_end ? "" : "--paired"
+    def classified   = meta.single_end ? "${prefix}.classified.fastq"   : "${prefix}.classified#.fastq"
+    def unclassified = meta.single_end ?
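// For paired-end input, kraken2 expands the '#' placeholder in the
// --classified-out/--unclassified-out filenames to the mate number (_1/_2),
// which is why the single-end and paired-end name patterns differ here.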
"${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq" + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --unclassified-out $unclassified \\ + --classified-out $classified \\ + --report ${prefix}.kraken2.report.txt \\ + --gzip-compressed \\ + $paired \\ + $args \\ + $reads + + pigz -p $task.cpus *.fastq + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/kraken2/kraken2/meta.yml b/modules/nf-core/modules/kraken2/kraken2/meta.yml new file mode 100644 index 0000000..9d6a385 --- /dev/null +++ b/modules/nf-core/modules/kraken2/kraken2/meta.yml @@ -0,0 +1,60 @@ +name: kraken2_kraken2 +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - kraken2: + description: | + Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads + homepage: https://ccb.jhu.edu/software/kraken2/ + documentation: https://github.com/DerrickWood/kraken2/wiki/Manual + doi: 10.1186/s13059-019-1891-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Kraken2 database +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified: + type: file + description: | + Reads classified to belong to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - unclassified: + type: file + description: | + Reads not classified to belong to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - txt: + type: file + description: | + Kraken2 report containing stats about classified + and not classifed reads. + pattern: "*.{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/malt/run/main.nf b/modules/nf-core/modules/malt/run/main.nf new file mode 100644 index 0000000..61c02ec --- /dev/null +++ b/modules/nf-core/modules/malt/run/main.nf @@ -0,0 +1,49 @@ +process MALT_RUN { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::malt=0.53" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/malt:0.53--hdfd78af_0' : + 'quay.io/biocontainers/malt:0.53--hdfd78af_0' }" + + input: + tuple val(meta), path(fastqs) + val mode + path index + + output: + tuple val(meta), path("*.rma6") , emit: rma6 + tuple val(meta), path("*.{tab,text,sam}"), optional:true, emit: alignments + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def avail_mem = 6 + if (!task.memory) { + log.info '[MALT_RUN] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.' + } else { + avail_mem = task.memory.giga + } + + """ + malt-run \\ + -J-Xmx${avail_mem}g \\ + -t $task.cpus \\ + -v \\ + -o . 
\\ + $args \\ + --inFile ${fastqs.join(' ')} \\ + -m $mode \\ + --index $index/ |&tee malt-run.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + malt: \$(malt-run --help 2>&1 | grep -o 'version.* ' | cut -f 1 -d ',' | cut -f2 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/malt/run/meta.yml b/modules/nf-core/modules/malt/run/meta.yml new file mode 100644 index 0000000..ae4277a --- /dev/null +++ b/modules/nf-core/modules/malt/run/meta.yml @@ -0,0 +1,58 @@ +name: malt_run +description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics. +keywords: + - malt + - alignment + - metagenomics + - ancient DNA + - aDNA + - palaeogenomics + - archaeogenomics + - microbiome +tools: + - malt: + description: A tool for mapping metagenomic data + homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/ + documentation: https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf + tool_dev_url: None + doi: "10.1038/s41559-017-0446-6" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastqs: + type: file + description: Input FASTQ files + pattern: "*.{fastq.gz,fq.gz}" + - mode: + type: string + description: Program mode + pattern: "Unknown|BlastN|BlastP|BlastX|Classifier" + - index: + type: directory + description: Index/database directory from malt-build + pattern: "*/" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - rma6: + type: file + description: MEGAN6 RMA6 file + pattern: "*.rma6" + - sam: + type: file + description: Alignment files in Tab, Text or MEGAN-compatible SAM format + pattern: "*.{tab,txt,sam}" + - log: + type: file + description: Log of verbose MALT stdout + pattern: "malt-run.log" + +authors: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 60b5a42..7991bdf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,8 +54,15 @@ params { databases = null // FASTQ preprocessing - fastp_clip_merge = false - fastp_exclude_unmerged = true + fastp_clip_merge = false + fastp_exclude_unmerged = true + + // MALT + run_malt = false + malt_mode = 'BlastN' + + // kraken2 + run_kraken2 = false } // Load base.config by default for all pipelines diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 4a356a5..bd25563 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -56,6 +56,9 @@ include { MULTIQC } from '../modules/nf-core/modules/multiqc include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' +include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' +include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' + /* ======================================================================================== @@ -95,13 +98,15 @@ workflow TAXPROFILER { ) // - // MODULE: Run Clip/Merge/Complexity + // PERFORM PREPROCESSING // if ( params.fastp_clip_merge ) { FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq ) } - // MODULE: Cat merge runs of same sample + // + // PERFORM RUN MERGING + // ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads .dump(tag: "prep_for_combine_grouping") .map { @@ -118,15 +123,61 @@ 
workflow TAXPROFILER { CAT_FASTQ ( ch_processed_for_combine.combine ) - // Ready for profiling! ch_reads_for_profiling = ch_processed_for_combine.skip .dump(tag: "skip_combine") .mix( CAT_FASTQ.out.reads ) .dump(tag: "files_for_profiling") - // Combine reads with possible databases + // + // COMBINE READS WITH POSSIBLE DATABASES + // + + // output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] + ch_input_for_profiling = ch_reads_for_profiling + .combine(DB_CHECK.out.dbs) + .dump(tag: "reads_plus_db") + .branch { + malt: it[2]['tool'] == 'malt' + kraken2: it[2]['tool'] == 'kraken2' + unknown: true + } + + // + // PREP PROFILER INPUT CHANNELS ON PER TOOL BASIS + // + + // We groupTuple to have all samples in one channel for MALT as database + // loading takes a long time, so we only want to run it once per database + ch_input_for_malt = ch_input_for_profiling.malt + .map { + it -> + def temp_meta = [ id: it[2]['db_name']] + it[2] + def db = it[3] + [ temp_meta, it[1], db ] + } + .groupTuple(by: [0,2]) + .dump(tag: "input for malt") + .multiMap { + it -> + reads: [ it[0], it[1].flatten() ] + db: it[2] + } + + // We can run Kraken2 one-by-one sample-wise + ch_input_for_kraken2 = ch_input_for_profiling.kraken2 + .dump(tag: "input for kraken") + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + // + // RUN PROFILING + // + MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) - ch_reads_for_profiling.combine(DB_CHECK.out.dbs).dump(tag: "reads_plus_db") // // MODULE: MultiQC @@ -143,6 +194,14 @@ workflow TAXPROFILER { if (params.fastp_clip_merge) { ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc) } + if (params.run_kraken2) { + ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])) + ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first()) + } + if (params.run_malt) { + ch_multiqc_files = ch_multiqc_files.mix(MALT_RUN.out.log.collect{it[1]}.ifEmpty([])) + ch_versions = ch_versions.mix(MALT_RUN.out.versions.first()) + } MULTIQC ( From e39c6a8ccb337b5889271df4f1e57bcbd3dd2c74 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 3 Mar 2022 18:04:03 +0100 Subject: [PATCH 10/34] Fix MALT multiqc report clash --- modules.json | 2 +- modules/nf-core/modules/malt/run/main.nf | 3 ++- modules/nf-core/modules/malt/run/meta.yml | 2 +- workflows/taxprofiler.nf | 3 ++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/modules.json b/modules.json index 844b33a..6a785b8 100644 --- a/modules.json +++ b/modules.json @@ -19,7 +19,7 @@ "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, "malt/run": { - "git_sha": "76cdd46f3f8a77fb5023fb5a39c4ab99925b8b56" + "git_sha": "72b96f4e504eef673f2b5c13560a9d90b669129b" }, "multiqc": { "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" diff --git a/modules/nf-core/modules/malt/run/main.nf b/modules/nf-core/modules/malt/run/main.nf index 61c02ec..4e2e50c 100644 --- a/modules/nf-core/modules/malt/run/main.nf +++ b/modules/nf-core/modules/malt/run/main.nf @@ -23,6 +23,7 @@ process MALT_RUN { script: def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" def avail_mem = 6 if (!task.memory) { log.info '[MALT_RUN] Available memory not known - defaulting to 6GB. 
Specify process memory requirements to change this.'
@@ -39,7 +40,7 @@ process MALT_RUN {
         $args \\
         --inFile ${fastqs.join(' ')} \\
         -m $mode \\
-        --index $index/ |&tee malt-run.log
+        --index $index/ |&tee ${prefix}-malt-run.log

     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/nf-core/modules/malt/run/meta.yml b/modules/nf-core/modules/malt/run/meta.yml
index ae4277a..66f2d7a 100644
--- a/modules/nf-core/modules/malt/run/meta.yml
+++ b/modules/nf-core/modules/malt/run/meta.yml
@@ -52,7 +52,7 @@ output:
   - log:
       type: file
       description: Log of verbose MALT stdout
-      pattern: "malt-run.log"
+      pattern: "*-malt-run.log"

 authors:
   - "@jfy133"
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index bd25563..bf3e6ee 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -203,7 +203,8 @@ workflow TAXPROFILER {
         ch_versions = ch_versions.mix(MALT_RUN.out.versions.first())
     }

-
+    // TODO MALT results overwriting per database?
+    // TODO Versions for Kraken2/MALT not reported?
     MULTIQC (
         ch_multiqc_files.collect()
     )

From 424f11f5ed0898a0e0524a7386b35164f58beccb Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates" 
Date: Thu, 17 Mar 2022 13:16:16 +0100
Subject: [PATCH 11/34] Update README.md

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 60baf67..5d0c74b 100644
--- a/README.md
+++ b/README.md
@@ -61,10 +61,10 @@ On release, automated continuous integration tests run the pipeline on a full-si
    Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string.

-   > * The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`.
-   > * Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile <institute>` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
-   > * If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs.
-   > * If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs.
+   > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`.
+   > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. 
If so, you can simply use `-profile <institute>` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
+   > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs.
+   > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs.

 4. Start running your own analysis!

From 0f0ed6cd4698df3bea9d4168c056085a820eb056 Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 10:45:06 +0100
Subject: [PATCH 12/34] Fix function name

---
 subworkflows/local/input_check.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index d66fb3a..481028f 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -15,6 +15,7 @@ workflow INPUT_CHECK {
         .dump(tag: "input_split_csv_out")
         .branch {
             fasta: it['fasta'] != ''
+            nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE'
             fastq: true
         }

From 41b3d8db822caab916ec82fe4b4f581f17ab1ca5 Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 10:47:41 +0100
Subject: [PATCH 13/34] Add nanopore channel

---
 subworkflows/local/input_check.nf | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 481028f..2e30bcc 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -20,10 +20,15 @@ workflow INPUT_CHECK {
         }

     parsed_samplesheet.fastq
-        .map { create_fastq_channels(it) }
+        .map { create_fastq_channel(it) }
         .dump(tag: "fastq_channel_init")
         .set { fastq }

+    parsed_samplesheet.nanopore
+        .map { create_fastq_channel(it) }
+        .dump(tag: "fastq_nanopore_channel_init")
+        .set { nanopore }
+
     parsed_samplesheet.fasta
         .map { create_fasta_channels(it) }
         .dump(tag: "fasta_channel_init")
         .set { fasta }

     emit:
     fastq // channel: [ val(meta), [ reads ] ]
+    nanopore // channel: [ val(meta), [ reads ] ]
     fasta // channel: [ val(meta), fasta ]
     versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
 }
@@ -52,10 +58,17 @@ def create_fastq_channel(LinkedHashMap row) {
     if (meta.single_end) {
         fastq_meta = [ meta, [ file(row.fastq_1) ] ]
     } else {
-        if (!file(row.fastq_2).exists()) {
-            exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
+        if (meta.instrument_platform == 'OXFORD_NANOPORE') {
+            if (row.fastq_2 != '') {
+                exit 1, "ERROR: Please check input samplesheet -> For Oxford Nanopore reads Read 2 FastQ should be empty!\n${row.fastq_2}"
+            }
+            fastq_meta = [ meta, [ file(row.fastq_1) ] ]
+        } else {
+            if (!file(row.fastq_2).exists()) {
+                exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
+            }
+            fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
         }
-        fastq_meta = [ meta, [ 
file(row.fastq_1), file(row.fastq_2) ] ] } return fastq_meta } From 7f7ddc9f14237f1616918963a002cbc64fea2687 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Fri, 18 Mar 2022 10:48:06 +0100 Subject: [PATCH 14/34] Update comment --- bin/check_samplesheet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 16e668b..d10ee90 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -173,7 +173,7 @@ def check_samplesheet(file_in, file_out): ## Auto-detect paired-end/single-end if sample and fastq_1 and fastq_2: ## Paired-end short reads sample_info.extend(["0", fastq_1, fastq_2, fasta]) - elif sample and fastq_1 and not fastq_2: ## Single-end short reads + elif sample and fastq_1 and not fastq_2: ## Single-end short/long fastq reads sample_info.extend(["1", fastq_1, fastq_2, fasta]) elif ( sample and fasta and not fastq_1 and not fastq_2 From 2e1b6c5d0a3b7c455bba5cb4d20dbbceac43dffe Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Fri, 18 Mar 2022 11:00:36 +0100 Subject: [PATCH 15/34] Add info on Nanopore reads to fastq_1 column --- docs/usage.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index a8b0448..38c063e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -44,11 +44,11 @@ TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, ``` -| Column | Description | -|----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| Column | Description | +| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | +| `fastq_1` | Full path to FastQ file for Illumina short reads 1 or Nanopore reads. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. 
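For illustration, a minimal sketch of a samplesheet satisfying the checks introduced above — sample names, run accessions, and file names here are hypothetical. Per `create_fastq_channel`, an `OXFORD_NANOPORE` row supplies only `fastq_1` and must leave `fastq_2` empty:

```csv
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
SAMPLE_A,run1,ILLUMINA,sample_a_run1_R1.fastq.gz,sample_a_run1_R2.fastq.gz,
SAMPLE_B,run1,OXFORD_NANOPORE,sample_b_run1.fastq.gz,,
```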
From c8e49c56f4f6b26dde209acfd474fc5c4f43caf7 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Fri, 18 Mar 2022 13:47:44 +0100 Subject: [PATCH 16/34] Perform fastqc on nanopore reads before trimming --- workflows/taxprofiler.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index f740324..c3d3eb6 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -89,7 +89,7 @@ workflow TAXPROFILER { // MODULE: Run FastQC // FASTQC ( - INPUT_CHECK.out.fastq + INPUT_CHECK.out.fastq.concat( INPUT_CHECK.out.nanopore ) ) ch_versions = ch_versions.mix(FASTQC.out.versions.first()) From 0936b9b28e52986576d9bb62473373f209565d6d Mon Sep 17 00:00:00 2001 From: Lauri Mesilaakso Date: Fri, 18 Mar 2022 14:18:38 +0100 Subject: [PATCH 17/34] Update workflows/taxprofiler.nf Co-authored-by: James A. Fellows Yates --- workflows/taxprofiler.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index c3d3eb6..f48cff6 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -89,7 +89,7 @@ workflow TAXPROFILER { // MODULE: Run FastQC // FASTQC ( - INPUT_CHECK.out.fastq.concat( INPUT_CHECK.out.nanopore ) + INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ) ) ch_versions = ch_versions.mix(FASTQC.out.versions.first()) From 16be676d72f9964038a52fb0ddc58c9bdafd4f9b Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Fri, 18 Mar 2022 14:34:10 +0100 Subject: [PATCH 18/34] Add Porechop module --- conf/modules.config | 9 ++++ modules.json | 3 ++ modules/nf-core/modules/porechop/main.nf | 35 ++++++++++++++++ modules/nf-core/modules/porechop/meta.yml | 50 +++++++++++++++++++++++ 4 files changed, 97 insertions(+) create mode 100644 modules/nf-core/modules/porechop/main.nf create mode 100644 modules/nf-core/modules/porechop/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 9e334bc..050772e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -50,6 +50,15 @@ process { ] } + withName: PORECHOP { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/porechop" }, + mode: 'copy', + pattern: '*.fastq.gz' + ] + } + withName: FASTQC_POST { ext.args = '--quiet' ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } diff --git a/modules.json b/modules.json index 6a785b8..284cf13 100644 --- a/modules.json +++ b/modules.json @@ -23,6 +23,9 @@ }, "multiqc": { "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" + }, + "porechop": { + "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046" } } } diff --git a/modules/nf-core/modules/porechop/main.nf b/modules/nf-core/modules/porechop/main.nf new file mode 100644 index 0000000..65982b8 --- /dev/null +++ b/modules/nf-core/modules/porechop/main.nf @@ -0,0 +1,35 @@ +process PORECHOP { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? "bioconda::porechop=0.2.4" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/porechop:0.2.4--py39h7cff6ad_2' : + 'quay.io/biocontainers/porechop:0.2.4--py39h7cff6ad_2' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + porechop \\ + -i $reads \\ + -t $task.cpus \\ + $args \\ + -o ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + porechop: \$( porechop --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/porechop/meta.yml b/modules/nf-core/modules/porechop/meta.yml new file mode 100644 index 0000000..81399d2 --- /dev/null +++ b/modules/nf-core/modules/porechop/meta.yml @@ -0,0 +1,50 @@ +name: porechop +description: Adapter removal and demultiplexing of Oxford Nanopore reads +keywords: + - adapter + - nanopore + - demultiplexing +tools: + - porechop: + description: Adapter removal and demultiplexing of Oxford Nanopore reads + homepage: "https://github.com/rrwick/Porechop" + documentation: "https://github.com/rrwick/Porechop" + tool_dev_url: "https://github.com/rrwick/Porechop" + doi: "10.1099/mgen.0.000132" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: fastq/fastq.gz file + pattern: "*.{fastq,fastq.gz,fq,fq.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: Demultiplexed and/or adapter-trimmed fastq.gz file + pattern: "*.{fastq.gz}" + +authors: + - "@ggabernet" + - "@jasmezz" + - "@d4straub" + - "@LaurenceKuhl" + - "@SusiJo" + - "@jonasscheid" + - "@jonoave" + - "@GokceOGUZ" From 1e42f1d9f295e909cc75d10b6306cce2d1f4bf22 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Fri, 18 Mar 2022 15:10:44 +0100 Subject: [PATCH 19/34] Add long read preprocessing subworkflow --- subworkflows/local/longread_preprocessing.nf | 34 ++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 subworkflows/local/longread_preprocessing.nf diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf new file mode 100644 index 0000000..da1049a --- /dev/null +++ b/subworkflows/local/longread_preprocessing.nf @@ -0,0 +1,34 @@ + +include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main' +include { PORECHOP } from '../../modules/nf-core/modules/porechop/main' + +workflow LONGREAD_PREPROCESSING { + take: + reads + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + PORECHOP ( reads ) + + ch_processed_reads = PORECHOP.out.reads + .dump(tag: "pre_fastqc_check") + .map { + meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = 1 + [ meta_new, reads ] + } + + FASTQC_POST ( PORECHOP.out.reads ) + ch_versions = ch_versions.mix(PORECHOP.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} ) + + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + From 
582aaa105fff0328f8df314d88ab84fe6c8e528b Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 15:12:07 +0100
Subject: [PATCH 20/34] Include long reads preprocessing subworkflow

---
 workflows/taxprofiler.nf | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index f48cff6..9e52b59 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -40,7 +40,7 @@
 include { INPUT_CHECK } from '../subworkflows/local/input_check'
 include { DB_CHECK } from '../subworkflows/local/db_check'
 include { FASTQ_PREPROCESSING } from '../subworkflows/local/preprocessing'
-
+include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing'

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -104,6 +104,10 @@ workflow TAXPROFILER {
         FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq )
     }

+    LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore )
+
+    ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first())
+
     //
     // PERFORM RUN MERGING
     //

From d09a3c170edac3afb7e6932cbcca824d6b77e202 Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 15:46:03 +0100
Subject: [PATCH 21/34] Add Porechop

---
 CITATIONS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CITATIONS.md b/CITATIONS.md
index 192b2f4..53c53c3 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -15,6 +15,8 @@
 * [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
     > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.

+* [Porechop](https://github.com/rrwick/Porechop)
+
 ## Software packaging/containerisation tools

 * [Anaconda](https://anaconda.com)

From 24a01529f5053c51a0f548ad04790f9bd5e3df9d Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Fri, 18 Mar 2022 15:47:46 +0100
Subject: [PATCH 22/34] Add mention of Nanopore read pre-processing

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5d0c74b..622b8de 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ On release, automated continuous integration tests run the pipeline on a full-si
    - Low complexity filtering
    - Host read removal
    - Run merging
+   - Adapter and quality trimming of Nanopore reads
 3. 
Performs taxonomic profiling with a choice of:
    - Kraken2
    - MetaPhlAn3

From c7f022008c561e70e2ed4a875c17ea1ece109f43 Mon Sep 17 00:00:00 2001
From: James Fellows Yates 
Date: Mon, 21 Mar 2022 15:07:59 +0100
Subject: [PATCH 23/34] Start getting database prep ready

---
 modules.json | 3 +++
 modules/nf-core/modules/untar/main.nf | 36 ++++++++++++++++++++++++++
 modules/nf-core/modules/untar/meta.yml | 28 ++++++++++++++++++++
 subworkflows/local/db_check.nf | 14 +++++++++-
 subworkflows/local/input_check.nf | 2 +-
 5 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 modules/nf-core/modules/untar/main.nf
 create mode 100644 modules/nf-core/modules/untar/meta.yml

diff --git a/modules.json b/modules.json
index 6a785b8..9a7f1df 100644
--- a/modules.json
+++ b/modules.json
@@ -23,6 +23,9 @@
         },
         "multiqc": {
             "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41"
+        },
+        "untar": {
+            "git_sha": "7ec09d0ef4df89617baacc9b2dafcddb7cd4b05a"
         }
     }
 }

diff --git a/modules/nf-core/modules/untar/main.nf b/modules/nf-core/modules/untar/main.nf
new file mode 100644
index 0000000..bbae948
--- /dev/null
+++ b/modules/nf-core/modules/untar/main.nf
@@ -0,0 +1,36 @@
+process UNTAR {
+    tag "$archive"
+    label 'process_low'
+
+    conda (params.enable_conda ? "conda-forge::tar=1.32" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' :
+        'biocontainers/biocontainers:v1.2.0_cv1' }"
+
+    input:
+    path archive
+
+    output:
+    path "$untar" , emit: untar
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def args2 = task.ext.args2 ?: ''
+    untar = archive.toString() - '.tar.gz'
+    """
+    tar \\
+        -xzvf \\
+        $args \\
+        $archive \\
+        $args2 \\
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//')
+    END_VERSIONS
+    """
+}

diff --git a/modules/nf-core/modules/untar/meta.yml b/modules/nf-core/modules/untar/meta.yml
new file mode 100644
index 0000000..e877a97
--- /dev/null
+++ b/modules/nf-core/modules/untar/meta.yml
@@ -0,0 +1,28 @@
+name: untar
+description: Extract files.
+keywords:
+  - untar
+  - uncompress
+tools:
+  - untar:
+      description: |
+        Extract tar.gz files. 
+      documentation: https://www.gnu.org/software/tar/manual/
+      licence: ["GPL-3.0-or-later"]
+input:
+  - archive:
+      type: file
+      description: File to be untarred
+      pattern: "*.{tar}.{gz}"
+output:
+  - untar:
+      type: file
+      description:
+      pattern: "*.*"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@joseespinosa"
+  - "@drpatelh"

diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 909d98f..6f061d7 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -19,8 +19,20 @@ workflow DB_CHECK {
         .dump(tag: "db_channel_prepped")
         .set{ dbs }

+
+    parsed_samplesheet
+        .branch {
+            untar: it[0]['db_path'].toString().endsWith(".tar.gz")
+            skip: true
+        }
+        .set{ ch_dbs_for_untar }
+
+    UNTAR ( ch_dbs_for_untar.untar )
+
+    ch_final_dbs = ch_dbs_for_untar.skip.mix( ch_dbs_untarred )
+
     emit:
-    dbs // channel: [ val(meta), [ db ] ]
+    dbs = ch_final_dbs // channel: [ val(meta), [ db ] ]
     versions = DATABASE_CHECK.out.versions // channel: [ versions.yml ]
 }

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index d66fb3a..938c87f 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -35,7 +35,7 @@ workflow INPUT_CHECK {
 }

 // Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
-def create_fastq_channel(LinkedHashMap row) {
+def create_fastq_channels(LinkedHashMap row) {
     // create meta map
     def meta = [:]
     meta.id = row.sample

From 631c115e1003fc7cb9b0c893bf7cde761f4287ef Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates" 
Date: Mon, 21 Mar 2022 14:58:19 +0000
Subject: [PATCH 24/34] Adds PoC of untarring system

---
 conf/modules.config | 2 +-
 modules.json | 2 +-
 modules/nf-core/modules/untar/main.nf | 10 +++++-----
 modules/nf-core/modules/untar/meta.yml | 10 ++++++++++
 subworkflows/local/db_check.nf | 9 +++++----
 5 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 9e334bc..620ae1d 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -82,7 +82,7 @@ process {
         publishDir = [
             path: { "${params.outdir}/kraken2/${meta.db_name}" },
             mode: 'copy',
-            pattern: '.{fastq.gz,txt}'
+            pattern: '*.{fastq.gz,txt}'
         ]
         ext.args = { "${meta.db_params}" }
         ext.when = params.run_kraken2

diff --git a/modules.json b/modules.json
index 9a7f1df..96a43d8 100644
--- a/modules.json
+++ b/modules.json
@@ -25,7 +25,7 @@
             "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41"
         },
         "untar": {
-            "git_sha": "7ec09d0ef4df89617baacc9b2dafcddb7cd4b05a"
+            "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918"
         }
     }
 }

diff --git a/modules/nf-core/modules/untar/main.nf b/modules/nf-core/modules/untar/main.nf
index bbae948..dc43fb7 100644
--- a/modules/nf-core/modules/untar/main.nf
+++ b/modules/nf-core/modules/untar/main.nf
@@ -8,19 +8,19 @@ process UNTAR {
         'biocontainers/biocontainers:v1.2.0_cv1' }"

     input:
-    path archive
+    tuple val(meta), path(archive)

     output:
-    path "$untar" , emit: untar
-    path "versions.yml", emit: versions
+    tuple val(meta), path("$untar"), emit: untar
+    path "versions.yml" , emit: versions

     when:
     task.ext.when == null || task.ext.when

     script:
-    def args = task.ext.args ?: ''
+    def args = task.ext.args ?: ''
     def args2 = task.ext.args2 ?: ''
-    untar = archive.toString() - '.tar.gz'
+    untar = archive.toString() - '.tar.gz'
     """
     tar \\
         -xzvf \\
         $args \\
         $archive \\
         $args2 \\

     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//')
     END_VERSIONS
     """
 }

diff --git a/modules/nf-core/modules/untar/meta.yml b/modules/nf-core/modules/untar/meta.yml
index e877a97..d426919 100644
--- 
a/modules/nf-core/modules/untar/meta.yml
+++ b/modules/nf-core/modules/untar/meta.yml
@@ -10,11 +10,21 @@ tools:
       documentation: https://www.gnu.org/software/tar/manual/
       licence: ["GPL-3.0-or-later"]
 input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
   - archive:
       type: file
       description: File to be untarred
       pattern: "*.{tar}.{gz}"
 output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
   - untar:
       type: file
       description:
diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 6f061d7..641108d 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -3,6 +3,7 @@
 //
 include { DATABASE_CHECK } from '../../modules/local/database_check'
+include { UNTAR } from '../../modules/nf-core/modules/untar/main'

 workflow DB_CHECK {
     take:
@@ -17,19 +18,19 @@ workflow DB_CHECK {
         .dump(tag: "db_split_csv_out")
         .map { create_db_channels(it) }
         .dump(tag: "db_channel_prepped")
-        .set{ dbs }

     parsed_samplesheet
         .branch {
-            untar: it[0]['db_path'].toString().endsWith(".tar.gz")
+            untar: it[1].toString().endsWith(".tar.gz")
             skip: true
         }
         .set{ ch_dbs_for_untar }

+    // TODO Filter to only run UNTAR on DBs of tools actually being used?
+    // TODO make optional whether to save
     UNTAR ( ch_dbs_for_untar.untar )

-    ch_final_dbs = ch_dbs_for_untar.skip.mix( ch_dbs_untarred )
+    ch_final_dbs = ch_dbs_for_untar.skip.mix( UNTAR.out.untar )

     emit:
     dbs = ch_final_dbs // channel: [ val(meta), [ db ] ]
     versions = DATABASE_CHECK.out.versions // channel: [ versions.yml ]

From c97de32434d03fdc97c0a5dc9c75cf328b027193 Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Mon, 21 Mar 2022 18:17:08 +0100
Subject: [PATCH 25/34] Make adapter and quality trimming optional

---
 nextflow.config | 1 +
 workflows/taxprofiler.nf | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 7b897ab..4a3a56d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -57,6 +57,7 @@ params {
     // FASTQ preprocessing
     fastp_clip_merge = false
     fastp_exclude_unmerged = true
+    remove_adapters = false

     // MALT
     run_malt = false

diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 9e52b59..0e144f3 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -104,9 +104,16 @@ workflow TAXPROFILER {
         FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq )
     }

-    LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore )
+    ch_multiqc_files = Channel.empty()

+    if ( params.remove_adapters ) {
+        ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads
+            .map { it -> [ it[0], [it[1]] ] }
         ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first())
+        ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc)
+    } else {
+        ch_longreads_preprocessed = INPUT_CHECK.out.nanopore
+    }

     //
     // PERFORM RUN MERGING
     //
@@ -138,6 +145,7 @@ workflow TAXPROFILER {
     // output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90]
     ch_input_for_profiling = ch_reads_for_profiling
+        .mix( ch_longreads_preprocessed )
         .combine(DB_CHECK.out.dbs)
         .dump(tag: "reads_plus_db")
         .branch {
@@ -189,13 +197,11 @@ workflow TAXPROFILER {
     workflow_summary = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params)
     ch_workflow_summary = Channel.value(workflow_summary)

-    
ch_multiqc_files = Channel.empty() ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc) if (params.fastp_clip_merge) { ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc) } From f6fe26de466446379f49ba00dd80eb995bafd0bb Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Mon, 21 Mar 2022 18:25:56 +0100 Subject: [PATCH 26/34] Rename shortread subworkflow to be more consistent --- .../{preprocessing.nf => shortread_preprocessing.nf} | 0 workflows/taxprofiler.nf | 8 ++++---- 2 files changed, 4 insertions(+), 4 deletions(-) rename subworkflows/local/{preprocessing.nf => shortread_preprocessing.nf} (100%) diff --git a/subworkflows/local/preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf similarity index 100% rename from subworkflows/local/preprocessing.nf rename to subworkflows/local/shortread_preprocessing.nf diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 0e144f3..22c7518 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -39,7 +39,7 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi include { INPUT_CHECK } from '../subworkflows/local/input_check' include { DB_CHECK } from '../subworkflows/local/db_check' -include { FASTQ_PREPROCESSING } from '../subworkflows/local/preprocessing' +include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' /* @@ -101,7 +101,7 @@ workflow TAXPROFILER { // PERFORM PREPROCESSING // if ( params.fastp_clip_merge ) { - FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq ) + SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ) } ch_multiqc_files = Channel.empty() @@ -118,7 +118,7 @@ workflow TAXPROFILER { // // PERFORM RUN MERGING // - ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads + ch_processed_for_combine = SHORTREAD_PREPROCESSING.out.reads .dump(tag: "prep_for_combine_grouping") .map { meta, reads -> @@ -203,7 +203,7 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) if (params.fastp_clip_merge) { - ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc) + ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_PREPROCESSING.out.mqc) } if (params.run_kraken2) { ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])) From 4940ec57ffee76378b9bfe8dcff1d73f3097846e Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Mon, 21 Mar 2022 18:26:26 +0100 Subject: [PATCH 27/34] Remove unnecessary extra point --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 622b8de..d454a9b 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,10 @@ On release, automated continuous integration tests run the pipeline on a full-si 1. 
Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
 2. Performs optional read pre-processing
-   - Adapter clipping and merging
+   - Adapter clipping and merging (short and Nanopore reads)
    - Low complexity filtering
    - Host read removal
    - Run merging
-   - Adapter and quality trimming of Nanopore reads
 3. Performs taxonomic profiling with a choice of:
    - Kraken2
    - MetaPhlAn3

From 5b1b48e59e17d271d03450daca083dd8cec502af Mon Sep 17 00:00:00 2001
From: ljmesi <37740329+ljmesi@users.noreply.github.com>
Date: Mon, 21 Mar 2022 18:29:47 +0100
Subject: [PATCH 28/34] Update subworkflow name to be more consistent

---
 subworkflows/local/shortread_preprocessing.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf
index 5832824..406c198 100644
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@@ -7,7 +7,7 @@ include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fast
 include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main'
 include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'

-workflow FASTQ_PREPROCESSING {
+workflow SHORTREAD_PREPROCESSING {
     take:
     reads // file: /path/to/samplesheet.csv

From 80129985424b214ee22213a8db7cc139e2793ff5 Mon Sep 17 00:00:00 2001
From: James Fellows Yates 
Date: Mon, 21 Mar 2022 19:52:50 +0100
Subject: [PATCH 29/34] Make parameter naming more consistent for clipmerge

---
 conf/modules.config | 5 ++---
 nextflow.config | 6 ++---
 subworkflows/local/shortread_preprocessing.nf | 2 +-
 workflows/taxprofiler.nf | 22 ++++++++++++-------
 4 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 050772e..c09a011 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -41,7 +41,7 @@ process {
         // TODO also include option to NOT merge
         ext.args = [
             { ${meta.single_end} } == 0 ? "-m" : '',
-            params.fastp_exclude_unmerged ? '' : "--include_unmerged"
+            params.shortread_excludeunmerged ? 
'' : "--include_unmerged" ].join(' ').trim() publishDir = [ path: { "${params.outdir}/fastp" }, @@ -84,7 +84,7 @@ process { pattern: '*.{rma6,tab,text,sam,log}' ] ext.args = { "${meta.db_params}" } - ext.when = params.run_malt + ext.prefix = { "${meta.id}-${meta.db_name}" } } withName: KRAKEN2_KRAKEN2 { @@ -94,7 +94,6 @@ process { pattern: '.{fastq.gz,txt}' ] ext.args = { "${meta.db_params}" } - ext.when = params.run_kraken2 ext.prefix = { "${meta.id}-${meta.db_name}" } } diff --git a/nextflow.config b/nextflow.config index 4a3a56d..5f7aec6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -55,9 +55,9 @@ params { databases = null // FASTQ preprocessing - fastp_clip_merge = false - fastp_exclude_unmerged = true - remove_adapters = false + shortread_clipmerge = false + shortread_excludeunmerged = true + longread_clip = false // MALT run_malt = false diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index 406c198..d996a76 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -23,7 +23,7 @@ workflow SHORTREAD_PREPROCESSING { // TODO move to subworkflow - if ( params.fastp_clip_merge ) { + if ( params.shortread_clipmerge ) { ch_input_for_fastp = reads .dump(tag: "pre-fastp_branch") diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 22c7518..4aa0684 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -100,17 +100,14 @@ workflow TAXPROFILER { // // PERFORM PREPROCESSING // - if ( params.fastp_clip_merge ) { + if ( params.shortread_clipmerge ) { SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ) } - ch_multiqc_files = Channel.empty() - - if ( params.remove_adapters ) { + if ( params.longread_clip ) { ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads .map { it -> [ it[0], [it[1]] ] } ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first()) - ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc) } else { ch_longreads_preprocessed = INPUT_CHECK.out.nanopore } @@ -187,9 +184,13 @@ workflow TAXPROFILER { // // RUN PROFILING // - MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) - KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) + if ( params.run_malt ) { + MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) + } + if ( params.run_kraken2 ) { + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) + } // // MODULE: MultiQC @@ -197,14 +198,19 @@ workflow TAXPROFILER { workflow_summary = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) + ch_multiqc_files = Channel.empty() ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) - if (params.fastp_clip_merge) { + + if (params.shortread_clipmerge) { ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_PREPROCESSING.out.mqc) } + if (params.longread_clip) { + ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc) + } if (params.run_kraken2) { ch_multiqc_files = 
ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]))
         ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first())

From 07eed435c628ae0518246aa31e87d81fb02c69ad Mon Sep 17 00:00:00 2001
From: James Fellows Yates 
Date: Mon, 21 Mar 2022 19:54:51 +0100
Subject: [PATCH 30/34] Replace set with explicit assignment

---
 subworkflows/local/db_check.nf | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 641108d..890e373 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -19,12 +19,11 @@ workflow DB_CHECK {
         .map { create_db_channels(it) }
         .dump(tag: "db_channel_prepped")

-    parsed_samplesheet
+    ch_dbs_for_untar = parsed_samplesheet
         .branch {
             untar: it[1].toString().endsWith(".tar.gz")
             skip: true
         }
-        .set{ ch_dbs_for_untar }

     // TODO Filter to only run UNTAR on DBs of tools actually being used?
     // TODO make optional whether to save

From 81bfb629cadfa76ac6f639f320f3bd0fb0ae4dfd Mon Sep 17 00:00:00 2001
From: James Fellows Yates 
Date: Mon, 21 Mar 2022 20:28:09 +0100
Subject: [PATCH 31/34] Add working basic test to begin

---
 .github/workflows/ci.yml | 1 +
 conf/modules.config | 16 ++++++++++++++++
 conf/test.config | 8 +++++++-
 subworkflows/local/input_check.nf | 6 +++---
 workflows/taxprofiler.nf | 13 ++++++++-----
 5 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 033eb63..5fe2777 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -48,3 +48,4 @@ jobs:
       # Remember that you can parallelise this by using strategy.matrix
       run: |
         nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
+        # TODO Add test that runs with pre-downloaded and decompressed databases

diff --git a/conf/modules.config b/conf/modules.config
index ab8f021..b9c1008 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -26,6 +26,22 @@ process {
         ]
     }

+    withName: DATABASE_CHECK {
+        publishDir = [
+            path: { "${params.outdir}/pipeline_info" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
+    withName: UNTAR {
+        publishDir = [
+            path: { "${params.outdir}/databases" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: FASTQC { ext.args = '--quiet' ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } diff --git a/conf/test.config b/conf/test.config index 51f3bb6..42d8de6 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,6 +22,12 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + outdir = "./results" + // TODO replace with official once ready + databases = 'https://raw.githubusercontent.com/jfy133/nf-core-test-datasets/taxprofiler/database.csv' + run_kraken2 = true + run_malt = true + shortread_clipmerge = true } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 67dadc2..4501386 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -30,7 +30,7 @@ workflow INPUT_CHECK { .set { nanopore } parsed_samplesheet.fasta - .map { create_fasta_channels(it) } + .map { create_fasta_channel(it) } .dump(tag: "fasta_channel_init") .set { fasta } @@ -42,7 +42,7 @@ workflow INPUT_CHECK { } // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channels(LinkedHashMap row) { +def create_fastq_channel(LinkedHashMap row) { // create meta map def meta = [:] meta.id = row.sample @@ -74,7 +74,7 @@ def create_fastq_channels(LinkedHashMap row) { } // Function to get list of [ meta, fasta ] -def create_fasta_channels(LinkedHashMap row) { +def create_fasta_channel(LinkedHashMap row) { def meta = [:] meta.id = row.sample meta.run_accession = row.run_accession diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 4aa0684..6fc5450 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -101,7 +101,9 @@ workflow TAXPROFILER { // PERFORM PREPROCESSING // if ( params.shortread_clipmerge ) { - SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ) + ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads + } else { + ch_shortreads_preprocessed = INPUT_CHECK.out.fastq } if ( params.longread_clip ) { @@ -113,9 +115,10 @@ workflow TAXPROFILER { } // - // PERFORM RUN MERGING + // PERFORM SHORT READ RUN MERGING + // TODO: Check not necessary for long reads too? // - ch_processed_for_combine = SHORTREAD_PREPROCESSING.out.reads + ch_processed_for_combine = ch_shortreads_preprocessed .dump(tag: "prep_for_combine_grouping") .map { meta, reads -> @@ -140,7 +143,7 @@ workflow TAXPROFILER { // COMBINE READS WITH POSSIBLE DATABASES // - // output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] + // e.g. 
output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] ch_input_for_profiling = ch_reads_for_profiling .mix( ch_longreads_preprocessed ) .combine(DB_CHECK.out.dbs) @@ -152,7 +155,7 @@ workflow TAXPROFILER { } // - // PREP PROFILER INPUT CHANNELS ON PER TOOL BASIS + // PREPARE PROFILER INPUT CHANNELS // // We groupTuple to have all samples in one channel for MALT as database From 358b89a4c6d18f195c7aae3115e28080cfa32b3e Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 21 Mar 2022 20:30:29 +0100 Subject: [PATCH 32/34] Linting --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index b9c1008..29a5135 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,7 +35,7 @@ process { } withName: UNTAR { - publishDir = [ + publishDir = [ path: { "${params.outdir}/databases" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } From 48b6ef508dbb0d0a35b91aa8e670616f640b36ae Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 23 Mar 2022 10:06:01 +0100 Subject: [PATCH 33/34] Update path to samplesheet --- conf/test.config | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conf/test.config b/conf/test.config index 42d8de6..2e08499 100644 --- a/conf/test.config +++ b/conf/test.config @@ -24,8 +24,7 @@ params { // TODO nf-core: Give any required params for the test so that command line flags are not needed input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' outdir = "./results" - // TODO replace with official once ready - databases = 'https://raw.githubusercontent.com/jfy133/nf-core-test-datasets/taxprofiler/database.csv' + databases = 'https://raw.githubusercontent.com/nf-core/nf-core-test-datasets/taxprofiler/database.csv' run_kraken2 = true run_malt = true shortread_clipmerge = true From 038a8d106a3e027d48c781ab15197270ec38e7a5 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 23 Mar 2022 11:25:47 +0100 Subject: [PATCH 34/34] Fix test with correct URL --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index 2e08499..5924d7a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -24,7 +24,7 @@ params { // TODO nf-core: Give any required params for the test so that command line flags are not needed input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' outdir = "./results" - databases = 'https://raw.githubusercontent.com/nf-core/nf-core-test-datasets/taxprofiler/database.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' run_kraken2 = true run_malt = true shortread_clipmerge = true
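With the test profile now pointing at the official nf-core/test-datasets URLs, the series closes with a runnable end-to-end test. As a sketch, the same invocation the CI workflow above uses can be run locally from the repository root (assuming Docker is available; any other supported container profile can be substituted):

```bash
# Mirrors the command in .github/workflows/ci.yml: run the bundled test profile
nextflow run . -profile test,docker --outdir ./results
```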