
Add centrifuge classification

This commit is contained in:
sofstam 2022-04-04 13:51:51 +02:00
commit d897c922b2
21 changed files with 751 additions and 159 deletions
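
For context, a minimal sketch of how the classification and preprocessing steps touched by this commit might be enabled from the command line. Parameter names are taken from the nextflow.config and nextflow_schema.json changes below; the profile, sheet paths and the Centrifuge database name are placeholder assumptions:

    # hypothetical invocation -- sheet paths and database name are placeholders
    nextflow run nf-core/taxprofiler -profile docker \
        --input samplesheet.csv --databases database.csv --outdir ./results \
        --shortread_clipmerge --shortread_clipmerge_tool adapterremoval \
        --run_centrifuge --centrifuge_db_name p_compressed \
        --run_metaphlan3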

View file

@ -28,6 +28,10 @@ jobs:
# Test latest edge release of Nextflow
- NXF_VER: ""
NXF_EDGE: "1"
parameters:
- "--shortread_clipmerge_tool fastp"
- "--shortread_clipmerge_tool adapterremoval"
steps:
- name: Check out pipeline code
uses: actions/checkout@v2
@ -42,11 +46,19 @@ jobs:
wget -qO- get.nextflow.io | bash
sudo mv nextflow /usr/local/bin/
- name: Show current locale
run: locale
- name: Set UTF-8 enabled locale
run: |
sudo locale-gen en_US.UTF-8
sudo update-locale LANG=en_US.UTF-8
- name: Run pipeline with test data
# TODO nf-core: You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results ${{ matrix.parameters }}
#

View file

@ -13,9 +13,30 @@
- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
* [Porechop](https://github.com/rrwick/Porechop)
- [fastp](https://doi.org/10.1093/bioinformatics/bty560)
> Chen, Shifu, Yanqing Zhou, Yaru Chen, and Jia Gu. 2018. “Fastp: An Ultra-Fast All-in-One FASTQ Preprocessor.” Bioinformatics 34 (17): i884-90. doi: 10.1093/bioinformatics/bty560.
- [AdapterRemoval2](https://doi.org/10.1186/s13104-016-1900-2)
> Schubert, Mikkel, Stinus Lindgreen, and Ludovic Orlando. 2016. “AdapterRemoval v2: Rapid Adapter Trimming, Identification, and Read Merging.” BMC Research Notes 9 (February): 88. doi:10.1186/s13104-016-1900-2.
- [Porechop](https://github.com/rrwick/Porechop)
- [Kraken2](https://doi.org/10.1186/s13059-019-1891-0)
> Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. “Improved Metagenomic Analysis with Kraken 2.” Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0.
- [MALT](https://doi.org/10.1038/s41559-017-0446-6)
> Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. “Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico.” Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6.
- [MetaPhlAn3](https://doi.org/10.7554/eLife.65088)
> Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088.
## Software packaging/containerisation tools

View file

@ -52,13 +52,25 @@ process {
]
}
withName: FASTP {
ext.prefix = { "${meta.id}_${meta.run_accession}" }
// TODO also include option to NOT merge
withName: FASTQC_PROCESSED {
ext.args = '--quiet'
ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
publishDir = [
path: { "${params.outdir}/fastqc/processed" },
mode: 'copy',
pattern: '*.html'
]
}
withName: FASTP_SINGLE {
ext.args = [
{ ${meta.single_end} } == 0 ? "-m" : '',
params.shortread_excludeunmerged ? '' : "--include_unmerged"
// trimming options
params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "",
params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
// filtering options
"--length_required ${params.shortread_clipmerge_minlength}"
].join(' ').trim()
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
path: { "${params.outdir}/fastp" },
mode: 'copy',
@ -66,6 +78,61 @@ process {
]
}
withName: FASTP_PAIRED {
ext.args = [
// collapsing options - option to retain singletons
params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged",
// trimming options
params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "",
params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe",
// filtering options
"--length_required ${params.shortread_clipmerge_minlength}"
].join(' ').trim()
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
path: { "${params.outdir}/fastp" },
mode: 'copy',
pattern: '*.fastq.gz'
]
}
withName: ADAPTERREMOVAL_SINGLE {
ext.args = [
// trimming options
params.shortread_clipmerge_skipadaptertrim ? "--adapter1 '' --adapter2 ''" : "",
params.shortread_clipmerge_adapter1 ? "--adapter1 ${params.shortread_clipmerge_adapter1}" : "",
// filtering options
"--minlength ${params.shortread_clipmerge_minlength}"
].join(' ').trim()
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
path: { "${params.outdir}/adapterremoval" },
mode: 'copy',
pattern: '*.fastq.gz'
]
}
withName: ADAPTERREMOVAL_PAIRED {
ext.args = [
// collapsing options
params.shortread_clipmerge_mergepairs ? "--collapse" : "",
// trimming options
params.shortread_clipmerge_skipadaptertrim ? "--adapter1 '' --adapter2 ''" : "",
params.shortread_clipmerge_adapter1 ? "--adapter1 ${params.shortread_clipmerge_adapter1}" : "",
params.shortread_clipmerge_adapter2 ? "--adapter2 ${params.shortread_clipmerge_adapter2}" : "",
// filtering options
"--minlength ${params.shortread_clipmerge_minlength}"
].join(' ').trim()
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
path: { "${params.outdir}/adapterremoval" },
mode: 'copy',
pattern: '*.fastq.gz'
]
}
withName: PORECHOP {
ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [
@ -75,16 +142,6 @@ process {
]
}
withName: FASTQC_POST {
ext.args = '--quiet'
ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
publishDir = [
path: { "${params.outdir}/fastqc/processed" },
mode: 'copy',
pattern: '*.html'
]
}
withName: CAT_FASTQ {
publishDir = [
path: { "${params.outdir}/prepared_sequences" },
@ -94,23 +151,32 @@ process {
}
withName: MALT_RUN {
ext.args = { "${meta.db_params}" }
ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
publishDir = [
path: { "${params.outdir}/malt/${meta.db_name}" },
mode: 'copy',
pattern: '*.{rma6,tab,text,sam,log}'
]
ext.args = { "${meta.db_params}" }
ext.prefix = { "${meta.id}-${meta.db_name}" }
}
withName: KRAKEN2_KRAKEN2 {
ext.args = { "${meta.db_params}" }
ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
publishDir = [
path: { "${params.outdir}/kraken2/${meta.db_name}" },
mode: 'copy',
pattern: '*.{fastq.gz,txt}'
]
ext.args = { "${meta.db_params}" }
ext.prefix = { "${meta.id}-${meta.db_name}" }
}
withName: METAPHLAN3 {
publishDir = [
path: { "${params.outdir}/metaphlan3/${meta.db_name}" },
mode: params.publish_dir_mode,
pattern: '*.{biom,txt}'
]
ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
}
withName: CUSTOM_DUMPSOFTWAREVERSIONS {
@ -128,7 +194,7 @@ process {
pattern: '*.{fastq.gz,txt}'
]
ext.args = { "${meta.db_params}" }
ext.prefix = { "${meta.id}-${meta.db_name}" }
ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
}
}

View file

@ -23,10 +23,10 @@ params {
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
outdir = "./results"
databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
run_kraken2 = true
run_malt = true
run_metaphlan3 = true
shortread_clipmerge = true
run_centrifuge = true

View file

@ -3,6 +3,9 @@
"homePage": "https://github.com/nf-core/taxprofiler",
"repos": {
"nf-core/modules": {
"adapterremoval": {
"git_sha": "f0800157544a82ae222931764483331a81812012"
},
"cat/fastq": {
"git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
},
@ -21,17 +24,20 @@
"malt/run": {
"git_sha": "72b96f4e504eef673f2b5c13560a9d90b669129b"
},
"multiqc": {
"metaphlan3": {
"git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
},
"untar": {
"git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918"
"multiqc": {
"git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
},
"porechop": {
"git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046"
},
"centrifuge": {
"git_sha": "ea41a8a6f761b9993d857570e872abaae3fea555"
},
"untar": {
"git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918"
}
}
}

View file

@ -0,0 +1,31 @@
process ENSURE_FASTQ_EXTENSION {
tag "$meta.id"
label 'process_low'
conda (params.enable_conda ? "conda-forge::bash=5.0" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv2/biocontainers_v1.2.0_cv2.img' :
'biocontainers/biocontainers:v1.2.0_cv2' }"
input:
tuple val(meta), path(reads)
output:
tuple val(meta), path('*.fastq.gz'), emit: reads
script:
if (meta.single_end) {
fastq = "${reads.baseName}.fastq.gz"
"""
ln -s '${reads}' '${fastq}'
"""
} else {
first = "${reads[0].baseName}.fastq.gz"
second = "${reads[1].baseName}.fastq.gz"
"""
ln -s '${reads[0]}' '${first}'
ln -s '${reads[1]}' '${second}'
"""
}
}

View file

@ -0,0 +1,70 @@
process ADAPTERREMOVAL {
tag "$meta.id"
label 'process_medium'
conda (params.enable_conda ? "bioconda::adapterremoval=2.3.2" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/adapterremoval:2.3.2--hb7ba0dd_0' :
'quay.io/biocontainers/adapterremoval:2.3.2--hb7ba0dd_0' }"
input:
tuple val(meta), path(reads)
path(adapterlist)
output:
tuple val(meta), path("${prefix}.truncated.gz") , optional: true, emit: singles_truncated
tuple val(meta), path("${prefix}.discarded.gz") , optional: true, emit: discarded
tuple val(meta), path("${prefix}.pair1.truncated.gz") , optional: true, emit: pair1_truncated
tuple val(meta), path("${prefix}.pair2.truncated.gz") , optional: true, emit: pair2_truncated
tuple val(meta), path("${prefix}.collapsed.gz") , optional: true, emit: collapsed
tuple val(meta), path("${prefix}.collapsed.truncated.gz") , optional: true, emit: collapsed_truncated
tuple val(meta), path("${prefix}.paired.gz") , optional: true, emit: paired_interleaved
tuple val(meta), path('*.log') , emit: log
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def list = adapterlist ? "--adapter-list ${adapterlist}" : ""
prefix = task.ext.prefix ?: "${meta.id}"
if (meta.single_end) {
"""
AdapterRemoval \\
--file1 $reads \\
$args \\
$list \\
--basename ${prefix} \\
--threads ${task.cpus} \\
--settings ${prefix}.log \\
--seed 42 \\
--gzip
cat <<-END_VERSIONS > versions.yml
"${task.process}":
adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g")
END_VERSIONS
"""
} else {
"""
AdapterRemoval \\
--file1 ${reads[0]} \\
--file2 ${reads[1]} \\
$args \\
$list \\
--basename ${prefix} \\
--threads $task.cpus \\
--settings ${prefix}.log \\
--seed 42 \\
--gzip
cat <<-END_VERSIONS > versions.yml
"${task.process}":
adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g")
END_VERSIONS
"""
}
}

View file

@ -0,0 +1,90 @@
name: adapterremoval
description: Trim sequencing adapters and collapse overlapping reads
keywords:
- trimming
- adapters
- merging
- fastq
tools:
- adapterremoval:
description: The AdapterRemoval v2 tool for merging and clipping reads.
homepage: https://github.com/MikkelSchubert/adapterremoval
documentation: https://adapterremoval.readthedocs.io
licence: ["GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input FastQ files of size 1 and 2 for single-end and paired-end data,
respectively.
pattern: "*.{fq,fastq,fq.gz,fastq.gz}"
- adapterlist:
type: file
description: Optional text file containing a list of adapters to search for and
remove, one adapter per line. If not provided, AdapterRemoval falls back to its
default adapters (see the AdapterRemoval man page); user-specified adapters can
also be supplied via ext.args.
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- singles_truncated:
type: file
description: |
Adapter trimmed FastQ files of either single-end reads, or singleton
'orphaned' reads from merging of paired-end data (i.e., one of the pair
was lost due to filtering thresholds).
pattern: "*.truncated.gz"
- discarded:
type: file
description: |
Adapter trimmed FastQ files of reads that did not pass filtering
thresholds.
pattern: "*.discarded.gz"
- pair1_truncated:
type: file
description: |
Adapter trimmed R1 FastQ files of paired-end reads that did not merge
with their respective R2 pair due to long templates. The respective pair
is stored in 'pair2_truncated'.
pattern: "*.pair1.truncated.gz"
- pair2_truncated:
type: file
description: |
Adapter trimmed R2 FastQ files of paired-end reads that did not merge
with their respective R1 pair due to long templates. The respective pair
is stored in 'pair1_truncated'.
pattern: "*.pair2.truncated.gz"
- collapsed:
type: file
description: |
Collapsed FastQ of paired-end reads that successfully merged with their
respective R1 pair but were not trimmed.
pattern: "*.collapsed.gz"
- collapsed_truncated:
type: file
description: |
Collapsed FastQ of paired-end reads that successfully merged with their
respective R1 pair and were trimmed of adapter due to sufficient overlap.
pattern: "*.collapsed.truncated.gz"
- log:
type: file
description: AdapterRemoval log file
pattern: "*.log"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@maxibor"
- "@jfy133"

View file

@ -10,6 +10,7 @@ process CENTRIFUGE {
input:
tuple val(meta), path(reads)
path db
val db_name
val save_unaligned
val save_aligned
val sam_format
@ -42,9 +43,8 @@ process CENTRIFUGE {
}
def sam_output = sam_format ? "--out-fmt 'sam'" : ''
"""
tar -xf $db
centrifuge \\
-x $db_name \\
-x ${db}/${db_name} \\
-p $task.cpus \\
$paired \\
--report-file ${prefix}.report.txt \\

View file

@ -27,6 +27,9 @@ input:
type: directory
description: Centrifuge database in .tar.gz format
pattern: "*.tar.gz"
- db_name:
type: string
description: Base name of the Centrifuge database index files, without the ".cf" suffix
- save_unaligned:
type: value
description: If true unmapped fastq files are saved

View file

@ -0,0 +1,45 @@
process METAPHLAN3 {
tag "$meta.id"
label 'process_high'
conda (params.enable_conda ? 'bioconda::metaphlan=3.0.12' : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/metaphlan:3.0.12--pyhb7b1952_0' :
'quay.io/biocontainers/metaphlan:3.0.12--pyhb7b1952_0' }"
input:
tuple val(meta), path(input)
path metaphlan_db
output:
tuple val(meta), path("*_profile.txt") , emit: profile
tuple val(meta), path("*.biom") , emit: biom
tuple val(meta), path('*.bowtie2out.txt'), optional:true, emit: bt2out
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def input_type = ("$input".endsWith(".fastq.gz")) ? "--input_type fastq" : ("$input".contains(".fasta")) ? "--input_type fasta" : ("$input".endsWith(".bowtie2out.txt")) ? "--input_type bowtie2out" : "--input_type sam"
def input_data = ("$input_type".contains("fastq")) && !meta.single_end ? "${input[0]},${input[1]}" : "$input"
def bowtie2_out = "$input_type" == "--input_type bowtie2out" || "$input_type" == "--input_type sam" ? '' : "--bowtie2out ${prefix}.bowtie2out.txt"
"""
metaphlan \\
--nproc $task.cpus \\
$input_type \\
$input_data \\
$args \\
$bowtie2_out \\
--bowtie2db ${metaphlan_db} \\
--biom ${prefix}.biom \\
--output_file ${prefix}_profile.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
metaphlan3: \$(metaphlan --version 2>&1 | awk '{print \$3}')
END_VERSIONS
"""
}

View file

@ -0,0 +1,52 @@
name: metaphlan3
description: MetaPhlAn is a tool for profiling the composition of microbial communities from metagenomic shotgun sequencing data.
keywords:
- metagenomics
- classification
- fastq
- bam
- fasta
tools:
- metaphlan3:
description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance
homepage: https://huttenhower.sph.harvard.edu/metaphlan/
documentation: https://github.com/biobakery/MetaPhlAn
doi: "10.7554/eLife.65088"
licence: ["MIT License"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- input:
type: file
description: MetaPhlAn 3.0 can classify the metagenome from a variety of input data types, including FASTQ files (single-end and paired-end), FASTA, bowtie2-produced SAM files (produced from alignments to the MetaPhlAn marker database) and intermediate bowtie2 alignment files (bowtie2out)
pattern: "*.{fastq.gz, fasta, fasta.gz, sam, bowtie2out.txt}"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- profile:
type: file
description: Tab-separated output file of the predicted taxon relative abundances
pattern: "*.{txt}"
- biom:
type: file
description: General-use format for representing biological sample-by-observation contingency tables
pattern: "*.{biom}"
- bowtie2out:
type: file
description: Intermediate Bowtie2 output produced from mapping the metagenome against the MetaPhlAn marker database (not compatible with `bowtie2out` files generated with MetaPhlAn versions below 3)
pattern: "*.{bowtie2out.txt}"
authors:
- "@MGordon09"

View file

@ -55,9 +55,15 @@ params {
databases = null
// FASTQ preprocessing
shortread_clipmerge = false
shortread_excludeunmerged = true
longread_clip = false
shortread_clipmerge = false
shortread_clipmerge_tool = 'fastp'
shortread_clipmerge_skipadaptertrim = false
shortread_clipmerge_mergepairs = false
shortread_clipmerge_excludeunmerged = false
shortread_clipmerge_adapter1 = null
shortread_clipmerge_adapter2 = null
shortread_clipmerge_minlength = 15
longread_clip = false
// MALT
run_malt = false
@ -68,9 +74,12 @@ params {
// centrifuge
run_centrifuge = false
centrifuge_db_name = false
centrifuge_save_unaligned = false
centrifuge_save_aligned = false
centrifuge_sam_format = false
// metaphlan3
run_metaphlan3 = false
}
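
As a usage sketch, the new clip/merge defaults above could be overridden from a custom config passed to Nextflow with -c. The parameter names come from the params block in this diff; the adapter sequences (standard Illumina TruSeq) and the length cutoff are illustrative values, not pipeline defaults:

    // custom.config -- hypothetical user override of the new preprocessing params
    params {
        shortread_clipmerge           = true
        shortread_clipmerge_tool      = 'fastp'
        shortread_clipmerge_adapter1  = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCA'
        shortread_clipmerge_adapter2  = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT'
        shortread_clipmerge_minlength = 30
    }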
// Load base.config by default for all pipelines
@ -155,7 +164,7 @@ if (!params.igenomes_ignore) {
// See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable.
env {
PYTHONNOUSERSITE = 1
PYTHONNOUSERSITE = '1'
R_PROFILE_USER = "/.Rprofile"
R_ENVIRON_USER = "/.Renviron"
JULIA_DEPOT_PATH = "/usr/local/share/julia"

View file

@ -10,7 +10,10 @@
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Define where the pipeline should find input data and save output data.",
"required": ["input", "outdir"],
"required": [
"input",
"outdir"
],
"properties": {
"input": {
"type": "string",
@ -173,7 +176,14 @@
"description": "Method used to save pipeline results to output directory.",
"help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
"fa_icon": "fas fa-copy",
"enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
"enum": [
"symlink",
"rellink",
"link",
"copy",
"copyNoFollow",
"move"
],
"hidden": true
},
"email_on_fail": {
@ -265,9 +275,9 @@
"shortread_clipmerge": {
"type": "boolean"
},
"shortread_excludeunmerged": {
"shortread_clipmerge_excludeunmerged": {
"type": "boolean",
"default": true
"default": false
},
"longread_clip": {
"type": "boolean"
@ -293,6 +303,40 @@
},
"centrifuge_sam_format": {
"type": "boolean"
},
"run_metaphlan3": {
"type": "boolean",
"description": "Enable MetaPhlAn for taxonomic profiling"
},
"shortread_clipmerge_tool": {
"type": "string",
"default": "fastp",
"enum": [
"fastp",
"adapterremoval"
]
},
"shortread_clipmerge_skipadaptertrim": {
"type": "boolean"
},
"shortread_clipmerge_mergepairs": {
"type": "boolean"
},
"shortread_clipmerge_adapter1": {
"type": "string",
"default": "None"
},
"shortread_clipmerge_adapter2": {
"type": "string",
"default": "None"
},
"shortread_clipmerge_minlength": {
"type": "integer",
"default": 15
},
"centrifuge_db_name": {
"type": "string",
"default": "false"
}
}
}
}

View file

@ -12,16 +12,17 @@ workflow DB_CHECK {
main:
// TODO: make database sheet check
// Checks:
// 1) no duplicates,
// 2) args do not have quotes, e.g. just `,,` and NOT `,"",`
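// For illustration, a couple of hypothetical database sheet rows (column names are an
// assumption based on create_db_channels() and the meta fields used downstream; paths
// are placeholders). Note db_params is left empty, i.e. `,,` and not `,"",`:
//   tool,db_name,db_params,db_path
//   kraken2,testdb-kraken2,,/path/to/kraken2_db.tar.gz
//   centrifuge,testdb-cf,,/path/to/centrifuge_db.tar.gz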
parsed_samplesheet = DATABASE_CHECK ( dbsheet )
.csv
.splitCsv ( header:true, sep:',' )
.dump(tag: "db_split_csv_out")
.map { create_db_channels(it) }
.dump(tag: "db_channel_prepped")
ch_dbs_for_untar = parsed_samplesheet
.branch {
untar: it[1].toString().endsWith(".tar.gz") && it[0]['tool'] != "centrifuge"
untar: it[1].toString().endsWith(".tar.gz")
skip: true
}

View file

@ -12,7 +12,6 @@ workflow INPUT_CHECK {
parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet )
.csv
.splitCsv ( header:true, sep:',' )
.dump(tag: "input_split_csv_out")
.branch {
fasta: it['fasta'] != ''
nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE'
@ -21,23 +20,20 @@ workflow INPUT_CHECK {
parsed_samplesheet.fastq
.map { create_fastq_channel(it) }
.dump(tag: "fastq_channel_init")
.set { fastq }
parsed_samplesheet.nanopore
.map { create_fastq_channel(it) }
.dump(tag: "fastq_nanopore_channel_init")
.set { nanopore }
parsed_samplesheet.fasta
.map { create_fasta_channel(it) }
.dump(tag: "fasta_channel_init")
.set { fasta }
emit:
fastq // channel: [ val(meta), [ reads ] ]
nanopore // channel: [ val(meta), [ reads ] ]
fasta // channel: [ val(meta), fasta ]
fastq = fastq ?: [] // channel: [ val(meta), [ reads ] ]
nanopore = nanopore ?: [] // channel: [ val(meta), [ reads ] ]
fasta = fasta ?: [] // channel: [ val(meta), fasta ]
versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
}
@ -72,9 +68,7 @@ def create_fastq_channel(LinkedHashMap row) {
}
return fastq_meta
}
// Function to get list of [ meta, fasta ]
}// Function to get list of [ meta, fasta ]
def create_fasta_channel(LinkedHashMap row) {
def meta = [:]
meta.id = row.sample

View file

@ -1,6 +1,9 @@
/*
Process long raw reads with porechop
*/
include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'
include { PORECHOP } from '../../modules/nf-core/modules/porechop/main'
include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main'
include { PORECHOP } from '../../modules/nf-core/modules/porechop/main'
workflow LONGREAD_PREPROCESSING {
take:
@ -13,7 +16,6 @@ workflow LONGREAD_PREPROCESSING {
PORECHOP ( reads )
ch_processed_reads = PORECHOP.out.reads
.dump(tag: "pre_fastqc_check")
.map {
meta, reads ->
def meta_new = meta.clone()
@ -21,9 +23,9 @@ workflow LONGREAD_PREPROCESSING {
[ meta_new, reads ]
}
FASTQC_POST ( PORECHOP.out.reads )
FASTQC_PROCESSED ( PORECHOP.out.reads )
ch_versions = ch_versions.mix(PORECHOP.out.versions.first())
ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} )
ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
emit:

View file

@ -0,0 +1,129 @@
/*
Process short raw reads with AdapterRemoval
*/
include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE } from '../../modules/nf-core/modules/adapterremoval/main'
include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED } from '../../modules/nf-core/modules/adapterremoval/main'
include { CAT_FASTQ } from '../../modules/nf-core/modules/cat/fastq/main'
include {
ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION1;
ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION2;
ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION3;
} from '../../modules/local/ensure_fastq_extension'
workflow SHORTREAD_ADAPTERREMOVAL {
take:
reads // [[meta], [reads]]
main:
ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty()
ch_input_for_adapterremoval = reads
.branch{
single: it[0].single_end
paired: !it[0].single_end
}
ADAPTERREMOVAL_SINGLE ( ch_input_for_adapterremoval.single, [] )
ADAPTERREMOVAL_PAIRED ( ch_input_for_adapterremoval.paired, [] )
/*
* Due to the ~slightly~ very ugly output implementation of the current AdapterRemoval2 version, each file
* has to be exported in a separate channel and we must manually recombine when necessary.
*/
if ( params.shortread_clipmerge_mergepairs && !params.shortread_clipmerge_excludeunmerged ) {
ENSURE_FASTQ_EXTENSION1(
Channel.empty().mix(
ADAPTERREMOVAL_PAIRED.out.collapsed,
ADAPTERREMOVAL_PAIRED.out.collapsed_truncated,
ADAPTERREMOVAL_PAIRED.out.singles_truncated,
ADAPTERREMOVAL_PAIRED.out.pair1_truncated,
ADAPTERREMOVAL_PAIRED.out.pair2_truncated
)
.map { meta, reads ->
meta.single_end = true
[meta, reads]
}
)
CAT_FASTQ(
ENSURE_FASTQ_EXTENSION1.out.reads
.groupTuple()
)
ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads
.mix(ENSURE_FASTQ_EXTENSION2.out.reads)
} else if ( params.shortread_clipmerge_mergepairs && params.shortread_clipmerge_excludeunmerged ) {
ENSURE_FASTQ_EXTENSION1(
Channel.empty().mix(
ADAPTERREMOVAL_PAIRED.out.collapsed,
ADAPTERREMOVAL_PAIRED.out.collapsed_truncated
)
.map { meta, reads ->
meta.single_end = true
[meta, reads]
}
)
CAT_FASTQ(
ENSURE_FASTQ_EXTENSION1.out.reads
.groupTuple()
)
ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads
.mix(ENSURE_FASTQ_EXTENSION2.out.reads)
} else {
ENSURE_FASTQ_EXTENSION1(
ADAPTERREMOVAL_PAIRED.out.pair1_truncated
.map { meta, reads ->
meta.single_end = true
[meta, reads]
}
)
ENSURE_FASTQ_EXTENSION2(
ADAPTERREMOVAL_PAIRED.out.pair2_truncated
.map { meta, reads ->
meta.single_end = true
[meta, reads]
}
)
ENSURE_FASTQ_EXTENSION3(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
ch_adapterremoval_reads_prepped = ENSURE_FASTQ_EXTENSION1.out.reads
.join(ENSURE_FASTQ_EXTENSION2.out.reads)
.groupTuple()
.map { meta, pair1, pair2 ->
meta.single_end = false
[ meta, [ pair1, pair2 ].flatten() ]
}
.mix(ENSURE_FASTQ_EXTENSION3.out.reads)
}
ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() )
ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() )
ch_multiqc_files = ch_multiqc_files.mix(
ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]},
ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]}
)
emit:
reads = ch_adapterremoval_reads_prepped // channel: [ val(meta), [ reads ] ]
versions = ch_versions // channel: [ versions.yml ]
mqc = ch_multiqc_files
}

View file

@ -0,0 +1,55 @@
/*
Process short raw reads with FastP
*/
include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main'
include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main'
workflow SHORTREAD_FASTP {
take:
reads // [[meta], [reads]]
main:
ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty()
ch_input_for_fastp = reads
.branch{
single: it[0]['single_end'] == true
paired: it[0]['single_end'] == false
}
FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
// Last parameter here turns on merging of PE data
FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs )
if ( params.shortread_clipmerge_mergepairs ) {
ch_fastp_reads_prepped_pe = FASTP_PAIRED.out.reads_merged
.map {
meta, reads ->
def meta_new = meta.clone()
meta_new['single_end'] = 1
[ meta_new, reads ]
}
ch_fastp_reads_prepped = ch_fastp_reads_prepped_pe.mix( FASTP_SINGLE.out.reads )
} else {
ch_fastp_reads_prepped = FASTP_PAIRED.out.reads
.mix( FASTP_SINGLE.out.reads )
}
ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
ch_processed_reads = ch_fastp_reads_prepped
ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
emit:
reads = ch_processed_reads // channel: [ val(meta), [ reads ] ]
versions = ch_versions // channel: [ versions.yml ]
mqc = ch_multiqc_files
}

View file

@ -3,67 +3,33 @@
//
include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main'
include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main'
include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'
include { SHORTREAD_FASTP } from './shortread_fastp'
include { SHORTREAD_ADAPTERREMOVAL } from './shortread_adapterremoval'
include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main'
workflow SHORTREAD_PREPROCESSING {
take:
reads // file: /path/to/samplesheet.csv
main:
ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty()
//
// STEP: Read clipping and merging
//
// TODO give option to clip only and retain pairs
// TODO give option to retain singletons (probably fastp option likely)
// TODO move to subworkflow
if ( params.shortread_clipmerge ) {
ch_input_for_fastp = reads
.dump(tag: "pre-fastp_branch")
.branch{
single: it[0]['single_end'] == true
paired: it[0]['single_end'] == false
}
ch_input_for_fastp.single.dump(tag: "input_fastp_single")
ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
FASTP_PAIRED ( ch_input_for_fastp.paired, false, true )
ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
.mix( FASTP_SINGLE.out.reads )
.map {
meta, reads ->
def meta_new = meta.clone()
meta_new['single_end'] = 1
[ meta_new, reads ]
}
FASTQC_POST ( ch_fastp_reads_prepped )
ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
ch_processed_reads = ch_fastp_reads_prepped
ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} )
ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
ch_multiqc_files.dump(tag: "preprocessing_mqc_final")
ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty()
if ( params.shortread_clipmerge_tool == "fastp" ) {
ch_processed_reads = SHORTREAD_FASTP ( reads ).reads
ch_versions = ch_versions.mix( SHORTREAD_FASTP.out.versions )
ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc )
} else if ( params.shortread_clipmerge_tool == "adapterremoval" ) {
ch_processed_reads = SHORTREAD_ADAPTERREMOVAL ( reads ).reads
ch_versions = ch_versions.mix( SHORTREAD_ADAPTERREMOVAL.out.versions )
ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_ADAPTERREMOVAL.out.mqc )
} else {
ch_processed_reads = reads
}
FASTQC_PROCESSED ( ch_processed_reads )
ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
emit:
reads = ch_processed_reads // channel: [ val(meta), [ reads ] ]

View file

@ -17,6 +17,8 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
// Check mandatory parameters
if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
if (params.shortread_clipmerge_mergepairs && params.run_malt) log.warn "[nf-core/taxprofiler] warning: MALT does not accept uncollapsed paired-end reads. Pairs will be profiled as separate files."
if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "[nf-core/taxprofiler] error: cannot exclude unmerged reads when pair merging is not turned on. Please specify --shortread_clipmerge_mergepairs"
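// For clarity, hypothetical flag combinations against the two checks above (parameter
// names come from this diff; outcomes are only what the lines above imply):
//   --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged  -> allowed; only merged reads are kept downstream
//   --shortread_clipmerge_excludeunmerged on its own                        -> exits with the error above
//   --shortread_clipmerge_mergepairs together with --run_malt               -> runs, but prints the MALT warning above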
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -36,11 +38,11 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi
//
// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
//
include { INPUT_CHECK } from '../subworkflows/local/input_check'
include { INPUT_CHECK } from '../subworkflows/local/input_check'
include { DB_CHECK } from '../subworkflows/local/db_check'
include { DB_CHECK } from '../subworkflows/local/db_check'
include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing'
include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing'
include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing'
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -59,6 +61,7 @@ include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fas
include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main'
include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main'
include { CENTRIFUGE } from '../modules/nf-core/modules/centrifuge/main'
include { METAPHLAN3 } from '../modules/nf-core/modules/metaphlan3/main'
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -73,9 +76,9 @@ workflow TAXPROFILER {
ch_versions = Channel.empty()
//
// SUBWORKFLOW: Read in samplesheet, validate and stage input files
//
/*
SUBWORKFLOW: Read in samplesheet, validate and stage input files
*/
INPUT_CHECK (
ch_input
)
@ -85,22 +88,24 @@ workflow TAXPROFILER {
ch_databases
)
//
// MODULE: Run FastQC
//
/*
MODULE: Run FastQC
*/
ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq")
FASTQC (
ch_input_for_fastqc
)
ch_versions = ch_versions.mix(FASTQC.out.versions.first())
CUSTOM_DUMPSOFTWAREVERSIONS (
ch_versions.unique().collectFile(name: 'collated_versions.yml')
)
//
// PERFORM PREPROCESSING
//
/*
SUBWORKFLOW: PERFORM PREPROCESSING
*/
if ( params.shortread_clipmerge ) {
ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads
} else {
@ -115,54 +120,31 @@ workflow TAXPROFILER {
ch_longreads_preprocessed = INPUT_CHECK.out.nanopore
}
//
// PERFORM SHORT READ RUN MERGING
// TODO: Check not necessary for long reads too?
//
ch_processed_for_combine = ch_shortreads_preprocessed
.dump(tag: "prep_for_combine_grouping")
.map {
meta, reads ->
def meta_new = meta.clone()
meta_new['run_accession'] = 'combined'
[ meta_new, reads ]
}
.groupTuple ( by: 0 )
.branch{
combine: it[1].size() >= 2
skip: it[1].size() < 2
}
CAT_FASTQ ( ch_processed_for_combine.combine )
ch_reads_for_profiling = ch_processed_for_combine.skip
.dump(tag: "skip_combine")
.mix( CAT_FASTQ.out.reads )
.dump(tag: "files_for_profiling")
//
// COMBINE READS WITH POSSIBLE DATABASES
//
/*
COMBINE READS WITH POSSIBLE DATABASES
*/
// e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
ch_input_for_profiling = ch_reads_for_profiling
ch_input_for_profiling = ch_shortreads_preprocessed
.mix( ch_longreads_preprocessed )
.combine(DB_CHECK.out.dbs)
.dump(tag: "reads_plus_db")
.branch {
malt: it[2]['tool'] == 'malt'
kraken2: it[2]['tool'] == 'kraken2'
malt: it[2]['tool'] == 'malt'
kraken2: it[2]['tool'] == 'kraken2'
metaphlan3: it[2]['tool'] == 'metaphlan3'
centrifuge: it[2]['tool'] == 'centrifuge'
unknown: true
unknown: true
}
//
// PREPARE PROFILER INPUT CHANNELS
//
/*
PREPARE PROFILER INPUT CHANNELS
*/
// We groupTuple to have all samples in one channel for MALT as database
// loading takes a long time, so we only want to run it once per database
// TODO document somewhere we only accept illumina short reads for MALT?
ch_input_for_malt = ch_input_for_profiling.malt
.filter { it[0]['instrument_platform'] == 'ILLUMINA' }
.map {
it ->
def temp_meta = [ id: it[2]['db_name']] + it[2]
@ -170,7 +152,6 @@ workflow TAXPROFILER {
[ temp_meta, it[1], db ]
}
.groupTuple(by: [0,2])
.dump(tag: "input for malt")
.multiMap {
it ->
reads: [ it[0], it[1].flatten() ]
@ -179,7 +160,6 @@ workflow TAXPROFILER {
// We can run Kraken2 one-by-one sample-wise
ch_input_for_kraken2 = ch_input_for_profiling.kraken2
.dump(tag: "input for kraken")
.multiMap {
it ->
reads: [ it[0] + it[2], it[1] ]
@ -198,6 +178,17 @@ workflow TAXPROFILER {
//
// RUN PROFILING
//
ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3
.dump(tag: "input_metaphlan3")
.multiMap {
it ->
reads: [it[0] + it[2], it[1]]
db: it[3]
}
/*
MODULE: RUN PROFILING
*/
if ( params.run_malt ) {
MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
}
@ -207,12 +198,16 @@ workflow TAXPROFILER {
}
if ( params.run_centrifuge ) {
CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format )
CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_db_name, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format )
}
//
// MODULE: MultiQC
//
if ( params.run_metaphlan3 ) {
METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db )
}
/*
MODULE: MultiQC
*/
workflow_summary = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params)
ch_workflow_summary = Channel.value(workflow_summary)
@ -240,6 +235,7 @@ workflow TAXPROFILER {
// TODO MALT results overwriting per database?
// TODO Versions for Kraken/MALT not reported?
// TODO create multiQC module for metaphlan
MULTIQC (
ch_multiqc_files.collect()
)