Merge branch 'dev' into run-merging

2024-12-22 15:28:16 +00:00 · 2022-04-12 09:28:10 +02:00 · 2022-04-12 09:28:10 +02:00 · 967b1f7d6e
commit 967b1f7d6e
parent a5f4fc42d5 1ac5a129a8
15 changed files with 381 additions and 137 deletions
--- a/README.md
+++ b/README.md
@ -18,7 +18,7 @@

 <!-- TODO nf-core: Write a 1-2 sentence summary of what data the pipeline is for and what it does -->

-**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling against multiple profiling tools and databases and produces standardised output tables.
+**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling with multiple profiling tools against multiple databases, and produces standardised output tables.

 The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!

@ -32,20 +32,20 @@ On release, automated continuous integration tests run the pipeline on a full-si

 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
 2. Performs optional read pre-processing
-   - Adapter clipping and merging (short, and nanopore reads)
-   - Low complexity filtering
-   - Host read removal
+   - Adapter clipping and merging (short read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long read: [porechop](https://github.com/rrwick/Porechop))
+   - Low complexity filtering ([bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus))
+   - Host read removal ([Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/))
   - Run merging
-3. Performs taxonomic profiling a choice of:
-   - Kraken2
-   - MetaPhlAn3
-   - MALT
-   - DIAMOND
-   - Centrifuge
-   - Kaiju
-   - mOTUs
+3. Performs taxonomic profiling using one or more of:
+   - [Kraken2](https://ccb.jhu.edu/software/kraken2/)
+   - [MetaPhlAn3](https://huttenhower.sph.harvard.edu/metaphlan/)
+   - [MALT](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/malt/)
+   - [DIAMOND](https://github.com/bbuchfink/diamond)
+   - [Centrifuge](https://ccb.jhu.edu/software/centrifuge/)
+   - [Kaiju](https://kaiju.binf.ku.dk/)
+   - [mOTUs](https://motu-tool.org/)
 4. Perform optional post-processing with:
-   - bracken
+   - [bracken](https://ccb.jhu.edu/software/bracken/)
 5. Standardises output tables
 6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))

@ -70,10 +70,8 @@ On release, automated continuous integration tests run the pipeline on a full-si

 4. Start running your own analysis!

-   <!-- TODO nf-core: Update the example "typical command" below used to run the pipeline -->
-
   ```console
-   nextflow run nf-core/taxprofiler --input samplesheet.csv --outdir <OUTDIR> --genome GRCh37 -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
+   nextflow run nf-core/taxprofiler --input samplesheet.csv --databases database.csv --outdir <OUTDIR> --run_<TOOL1> --run_<TOOL2> -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
   ```

 ## Documentation
@ -86,7 +84,7 @@ nf-core/taxprofiler was originally written by nf-core community.

 We thank the following people for their extensive assistance in the development of this pipeline:

-<!-- TODO nf-core: If applicable, make list of people who have also contributed -->
+[James A. Fellows Yates](https://github.com/jfy133), [Moritz Beber](https://github.com/Midnighter), [Lauri Mesilaakso](https://github.com/ljmesi), [Sofia Stamouli](https://github.com/sofsam), [Maxime Borry](https://github.com/maxibor).

 ## Contributions and Support

--- a/assets/samplesheet.csv
+++ b/assets/samplesheet.csv
@ -1,3 +1,6 @@
-sample,fastq_1,fastq_2
-SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz
-SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,
+sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
+2611,ERR5766174,ILLUMINA,,,/<path>/<to>/fasta/ERX5474930_ERR5766174_1.fa.gz
+2612,ERR5766176,ILLUMINA,/<path>/<to>/fastq/ERX5474932_ERR5766176_1.fastq.gz,/<path>/<to>/fastq/ERX5474932_ERR5766176_2.fastq.gz,
+2612,ERR5766180,ILLUMINA,/<path>/<to>/fastq/ERX5474936_ERR5766180_1.fastq.gz,,
+2613,ERR5766181,ILLUMINA,/<path>/<to>/fastq/ERX5474937_ERR5766181_1.fastq.gz,/<path>/<to>/fastq/ERX5474937_ERR5766181_2.fastq.gz,
+ERR3201952,ERR3201952,OXFORD_NANOPORE,/<path>/<to>/fastq/ERR3201952.fastq.gz,,
--- a/conf/modules.config
+++ b/conf/modules.config
@ -177,7 +177,7 @@ process {
        publishDir = [
            path: { "${params.outdir}/malt/${meta.db_name}" },
            mode: params.publish_dir_mode,
-            pattern: '*.{rma6,tab,text,sam,log}'
+            pattern: '*.{log}'
        ]
    }

@ -187,7 +187,7 @@ process {
        publishDir = [
            path: { "${params.outdir}/kraken2/${meta.db_name}" },
            mode: params.publish_dir_mode,
-            pattern: '*.{fastq.gz,txt}'
+            pattern: '*.{txt}'
        ]
    }

@ -201,6 +201,16 @@ process {
        ]
    }

+    withName: CENTRIFUGE_CENTRIFUGE {
+        publishDir = [
+            path: { "${params.outdir}/centrifuge/${meta.db_name}" },
+            mode: params.publish_dir_mode,
+            pattern: '*.txt'
+        ]
+        ext.args = { "${meta.db_params}" }
+        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+    }
+
    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
        publishDir = [
            path: { "${params.outdir}/pipeline_info" },
@ -209,4 +219,12 @@ process {
        ]
    }

+    withName: MULTIQC {
+        publishDir = [
+            path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
 }
--- a/conf/test.config
+++ b/conf/test.config
@ -27,6 +27,7 @@ params {
    run_kraken2                   = true
    run_malt                      = true
    run_metaphlan3                = true
+    run_centrifuge                = true
    shortread_clipmerge           = true
    longread_clip                 = false
    shortread_complexityfilter    = true
--- a/docs/usage.md
+++ b/docs/usage.md
@ -8,56 +8,90 @@

 <!-- TODO nf-core: Add documentation about anything specific to running your pipeline. For general topics, please point to (and add to) the main nf-core website. -->

-## Samplesheet input
+## Samplesheet inputs

-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below. Furthermore, nf-core/taxprofiler also requires a second comma-separated file of 4 columns with a header row as in the examples below.
+
+The samplesheet and the database sheet are then specified on the command line as follows:

 ```console
--input '[path to samplesheet file]'
+--input '[path to samplesheet file]' --databases '[path to database sheet file]'
 ```

 ### Multiple runs of the same sample

-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
+The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will process reads before performing profiling. Below is an example for the same sample sequenced across 3 lanes:

 ```console
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
+sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
+2612,run1,ILLUMINA,2612_run1_R1.fq.gz,,
+2612,run2,ILLUMINA,2612_run2_R1.fq.gz,,
+2612,run3,ILLUMINA,2612_run3_R1.fq.gz,2612_run3_R2.fq.gz,
+
 ```

+> ⚠️ Runs of the same sample sequenced on Illumina platforms with a combination of single and paired-end data will **not** be run-wise concatenated, unless pair-merging is specified. In the example above, `run3` will be profiled independently of `run1` and `run2` if pairs are not merged.
+
 ### Full samplesheet

-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
+The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 6 columns to match those defined in the table below.

-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
+A final samplesheet file consisting of both single- and paired-end data, as well as long-read FASTA files, may look something like the one below. This is for 4 samples, where `2612` has been sequenced twice.

 ```console
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
-TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
+2611,ERR5766174,ILLUMINA,,,/<path>/<to>/fasta/ERX5474930_ERR5766174_1.fa.gz
+2612,ERR5766176,ILLUMINA,/<path>/<to>/fastq/ERX5474932_ERR5766176_1.fastq.gz,/<path>/<to>/fastq/ERX5474932_ERR5766176_2.fastq.gz,
+2612,ERR5766180,ILLUMINA,/<path>/<to>/fastq/ERX5474936_ERR5766180_1.fastq.gz,,
+2613,ERR5766181,ILLUMINA,/<path>/<to>/fastq/ERX5474937_ERR5766181_1.fastq.gz,/<path>/<to>/fastq/ERX5474937_ERR5766181_2.fastq.gz,
+ERR3201952,ERR3201952,OXFORD_NANOPORE,/<path>/<to>/fastq/ERR3201952.fastq.gz,,
 ```

-| Column    | Description                                                                                                                                                                            |
-| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample`  | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1 or Nanopore reads. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                           |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
+| Column                | Description                                                                                                                                                                                              |
+| --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample`              | Unique sample name [required].                                                                                                                                                                           |
+| `run_accession`       | Run ID or name unique for each (pairs of) file(s). Can also supply sample name again here, if only a single run was generated [required].                                                                |
+| `instrument_platform` | Sequencing platform reads generated on, selected from the EBI ENA [controlled vocabulary](https://www.ebi.ac.uk/ena/portal/api/controlledVocab?field=instrument_platform) [required].                    |
+| `fastq_1`             | Path or URL to sequencing reads or for Illumina R1 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if FASTA data is specified. Cannot be combined with `fasta`.   |
+| `fastq_2`             | Path or URL to Illumina R2 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if single end data. Cannot be combined with `fasta`.                                   |
+| `fasta`               | Path or URL to long-reads or contigs in FASTA format. GZipped compressed files accepted. Can be left empty if data in FASTQ is specified. Cannot be combined with `fastq_1` or `fastq_2`.                |

 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

+### Full database sheet
+
+nf-core/taxprofiler supports multiple databases being profiled in parallel for each tool. These databases, and specific parameters for each, can be specified in a 4 column comma-separated sheet.
+
+> ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user.
+
+An example database sheet can look as follows, where 4 tools are being used, and `malt` and `kraken2` will be used against two databases each.
+
+```console
+tool,db_name,db_params,db_path
+malt,malt85,-id 85,/<path>/<to>/malt/testdb-malt/
+malt,malt95,-id 95,/<path>/<to>/malt/testdb-malt.tar.gz
+kraken2,db1,,/<path>/<to>/kraken2/testdb-kraken2.tar.gz
+kraken2,db2,--quick,/<path>/<to>/kraken2/testdb-kraken2.tar.gz
+centrifuge,db1,,/<path>/<to>/centrifuge/minigut_cf.tar.gz
+metaphlan3,db1,,/<path>/<to>/metaphlan3/metaphlan_database/
+```
+
+Column specifications are as follows:
+
+| Column      | Description                                                                                                                                                                                                                                             |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tool`      | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required].                                                                                                                                          |
+| `db_name`   | A unique name of the particular database [required].                                                                                                                                                                                                    |
+| `db_params` | Any parameters of the given taxonomic profiler that you wish the tool to use when profiling against this specific database. Can be empty to use taxonomic profiler defaults. Must not be surrounded by quotes [required].                              |
+| `db_path`   | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required].                   |
+
+> 💡 You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during profiling.
+
 ## Running the pipeline

 The typical command for running the pipeline is as follows:

 ```console
-nextflow run nf-core/taxprofiler --input samplesheet.csv --outdir <OUTDIR> --genome GRCh37 -profile docker
+nextflow run nf-core/taxprofiler --input samplesheet.csv --databases databases.csv --outdir <OUTDIR> -profile docker --run_<TOOL1> --run_<TOOL2>
 ```

 This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
@ -66,7 +100,7 @@ Note that the pipeline will create the following files in your working directory

 ```console
 work                # Directory containing the nextflow working files
-<OUTIDR>            # Finished results in specified location (defined with --outdir)
+<OUTDIR>            # Finished results in specified location (defined with --outdir)
 .nextflow_log       # Log file from Nextflow
 # Other nextflow hidden files, eg. history of pipeline runs and old logs.
 ```
--- a/modules.json
+++ b/modules.json
@ -12,6 +12,9 @@
            "cat/fastq": {
                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
            },
+            "centrifuge/centrifuge": {
+                "git_sha": "d2726fcf75063960f06b36d2229a4c0966614108"
+            },
            "custom/dumpsoftwareversions": {
                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
            },
--- a/modules/nf-core/modules/centrifuge/centrifuge/main.nf
+++ b/modules/nf-core/modules/centrifuge/centrifuge/main.nf
@ -0,0 +1,61 @@
+process CENTRIFUGE_CENTRIFUGE {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' :
+        'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }"
+
+    input:
+    tuple val(meta), path(reads)
+    path db
+    val save_unaligned
+    val save_aligned
+    val sam_format
+
+    output:
+    tuple val(meta), path('*report.txt')                 , emit: report
+    tuple val(meta), path('*results.txt')                , emit: results
+    tuple val(meta), path('*.sam')                       , optional: true, emit: sam
+    tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz')   , optional: true, emit: fastq_mapped
+    tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped
+    path "versions.yml"                                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def paired = meta.single_end ? "-U ${reads}" :  "-1 ${reads[0]} -2 ${reads[1]}"
+    def unaligned = ''
+    def aligned = ''
+    if (meta.single_end) {
+        unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : ''
+        aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : ''
+    } else {
+        unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : ''
+        aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : ''
+    }
+    def sam_output = sam_format ? "--out-fmt 'sam'" : ''
+    """
+    ## we add "-no-name ._" to ensure silly Mac OSX metafiles files aren't included
+    db_name=`find -L ${db} -name "*.1.cf" -not -name "._*"  | sed 's/.1.cf//'`
+    centrifuge \\
+        -x \$db_name \\
+        -p $task.cpus \\
+        $paired \\
+        --report-file ${prefix}.report.txt \\
+        -S ${prefix}.results.txt \\
+        $unaligned \\
+        $aligned \\
+        $sam_output \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        centrifuge: \$( centrifuge --version  | sed -n 1p | sed 's/^.*centrifuge-class version //')
+    END_VERSIONS
+    """
+}
--- a/modules/nf-core/modules/centrifuge/centrifuge/meta.yml
+++ b/modules/nf-core/modules/centrifuge/centrifuge/meta.yml
@ -0,0 +1,66 @@
+name: centrifuge_centrifuge
+description: Classifies metagenomic sequence data
+keywords:
+  - classify
+  - metagenomics
+  - fastq
+  - db
+tools:
+  - centrifuge:
+      description: Centrifuge is a classifier for metagenomic sequences.
+      homepage: https://ccb.jhu.edu/software/centrifuge/
+      documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml
+      doi: 10.1101/gr.210641.116
+      licence: ["GPL v3"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+        respectively.
+  - db:
+      type: directory
+      description: Path to directory containing centrifuge database files
+  - save_unaligned:
+      type: value
+      description: If true unmapped fastq files are saved
+  - save_aligned:
+      type: value
+      description: If true mapped fastq files are saved
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - report:
+      type: file
+      description: |
+        File containing a classification summary
+      pattern: "*.{report.txt}"
+  - results:
+      type: file
+      description: |
+        File containing classification results
+      pattern: "*.{results.txt}"
+  - fastq_unmapped:
+      type: file
+      description: Unmapped fastq files
+      pattern: "*.unmapped.fastq.gz"
+  - fastq_mapped:
+      type: file
+      description: Mapped fastq files
+      pattern: "*.mapped.fastq.gz"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@sofstam"
+  - "@jfy133"
+  - "@sateeshperi"
--- a/nextflow.config
+++ b/nextflow.config
@ -87,6 +87,11 @@ params {
    // kraken2
    run_kraken2                = false

+    // centrifuge
+    run_centrifuge             = false
+    centrifuge_save_unaligned  = false
+    centrifuge_save_aligned    = false
+    centrifuge_sam_format      = false
    // metaphlan3
    run_metaphlan3             = false
 }
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -281,6 +281,18 @@
        "run_kraken2": {
            "type": "boolean"
        },
+        "run_centrifuge": {
+            "type": "boolean"
+        },
+        "centrifuge_save_unaligned": {
+            "type": "boolean"
+        },
+        "centrifuge_save_aligned": {
+            "type": "boolean"
+        },
+        "centrifuge_sam_format": {
+            "type": "boolean"
+        },
        "run_metaphlan3": {
            "type": "boolean",
            "description": "Enable MetaPhlAn for taxonomic profiling"
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@ -22,7 +22,7 @@ workflow DB_CHECK {

    ch_dbs_for_untar = parsed_samplesheet
        .branch {
-            untar: it[1].toString().endsWith(".tar.gz") && it[0]['tool'] != 'centrifuge'
+            untar: it[1].toString().endsWith(".tar.gz")
            skip: true
        }

--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@ -63,13 +63,12 @@ def create_fastq_channel(LinkedHashMap row) {
            if (!file(row.fastq_2).exists()) {
                exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
            }
-            fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
+         fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
        }
+
    }
    return fastq_meta
-}
-
-// Function to get list of [ meta, fasta ]
+}// Function to get list of [ meta, fasta ]
 def create_fasta_channel(LinkedHashMap row) {
    def meta = [:]
    meta.id                     = row.sample
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@ -0,0 +1,119 @@
+//
+// Run profiling
+//
+
+include { MALT_RUN                    } from '../../modules/nf-core/modules/malt/run/main'
+include { KRAKEN2_KRAKEN2             } from '../../modules/nf-core/modules/kraken2/kraken2/main'
+include { CENTRIFUGE_CENTRIFUGE       } from '../../modules/nf-core/modules/centrifuge/centrifuge/main'
+include { METAPHLAN3                  } from '../../modules/nf-core/modules/metaphlan3/main'
+
+workflow PROFILING {
+    take:
+    shortreads // [ [ meta ], [ reads ] ]
+    longreads // [ [ meta ], [ reads ] ]
+    databases // [ [ meta ], path ]
+
+    main:
+    ch_versions       = Channel.empty()
+    ch_multiqc_files  = Channel.empty()
+
+/*
+        COMBINE READS WITH POSSIBLE DATABASES
+    */
+
+    // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
+    ch_input_for_profiling = shortreads
+            .mix( longreads )
+            .combine(databases)
+            .branch {
+                malt:    it[2]['tool'] == 'malt'
+                kraken2: it[2]['tool'] == 'kraken2'
+                metaphlan3: it[2]['tool'] == 'metaphlan3'
+                centrifuge: it[2]['tool'] == 'centrifuge'
+                unknown: true
+            }
+
+    /*
+        PREPARE PROFILER INPUT CHANNELS
+    */
+
+    // Each tool has a slightly different input structure and generally separate
+    // input channels for reads vs databases. We restructure the channel tuple
+    // for each tool and make liberal use of multiMap to keep reads/databases
+    // channel element order in sync with each other
+
+    // MALT: We groupTuple to have all samples in one channel for MALT as database
+    // loading takes a long time, so we only want to run it once per database
+    // TODO document somewhere we only accept illumina short reads for MALT?
+    ch_input_for_malt =  ch_input_for_profiling.malt
+                            .filter { it[0]['instrument_platform'] == 'ILLUMINA' }
+                            .map {
+                                it ->
+                                    def temp_meta =  [ id: it[2]['db_name']]  + it[2]
+                                    def db = it[3]
+                                    [ temp_meta, it[1], db ]
+                            }
+                            .groupTuple(by: [0,2])
+                            .multiMap {
+                                it ->
+                                    reads: [ it[0], it[1].flatten() ]
+                                    db: it[2]
+                            }
+
+    // All subsequent tools can easily run on a per-sample basis
+
+    ch_input_for_kraken2 =  ch_input_for_profiling.kraken2
+                            .multiMap {
+                                it ->
+                                    reads: [ it[0] + it[2], it[1] ]
+                                    db: it[3]
+                            }
+
+    ch_input_for_centrifuge =  ch_input_for_profiling.centrifuge
+                                .multiMap {
+                                    it ->
+                                        reads: [ it[0] + it[2], it[1] ]
+                                        db: it[3]
+                                }
+
+    ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3
+                            .multiMap {
+                                it ->
+                                    reads: [it[0] + it[2], it[1]]
+                                    db: it[3]
+                            }
+
+    /*
+        RUN PROFILING
+    */
+
+    if ( params.run_malt ) {
+        MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
+        ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([])  )
+        ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() )
+    }
+
+    if ( params.run_kraken2 ) {
+        KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db  )
+        ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])  )
+        ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() )
+    }
+
+    if ( params.run_centrifuge ) {
+        CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format  )
+        ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() )
+    }
+
+    if ( params.run_metaphlan3 ) {
+        METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db )
+        ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() )
+    }
+
+
+    emit:
+    // TODO work out if there is enough standardisation of output to export as one?
+    //output    = ch_filtered_reads    // channel: [ val(meta), [ reads ] ]
+    versions = ch_versions          // channel: [ versions.yml ]
+    mqc      = ch_multiqc_files
+}
+
--- a/subworkflows/local/shortread_fastp.nf
+++ b/subworkflows/local/shortread_fastp.nf
@ -10,7 +10,7 @@ workflow SHORTREAD_FASTP {
    reads // [[meta], [reads]]

    main:
-    ch_versions = Channel.empty()
+    ch_versions           = Channel.empty()
    ch_multiqc_files      = Channel.empty()

    ch_input_for_fastp = reads
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -44,6 +44,7 @@ include { DB_CHECK                      } from '../subworkflows/local/db_check'
 include { SHORTREAD_PREPROCESSING       } from '../subworkflows/local/shortread_preprocessing'
 include { LONGREAD_PREPROCESSING        } from '../subworkflows/local/longread_preprocessing'
 include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering'
+include { PROFILING                     } from '../subworkflows/local/profiling'

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -59,9 +60,6 @@ include { MULTIQC                     } from '../modules/nf-core/modules/multiqc
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main'

 include { CAT_FASTQ                   } from '../modules/nf-core/modules/cat/fastq/main'
-include { MALT_RUN                    } from '../modules/nf-core/modules/malt/run/main'
-include { KRAKEN2_KRAKEN2             } from '../modules/nf-core/modules/kraken2/kraken2/main'
-include { METAPHLAN3                  } from '../modules/nf-core/modules/metaphlan3/main'

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -87,6 +85,7 @@ workflow TAXPROFILER {
    DB_CHECK (
        ch_databases
    )
+    ch_versions = ch_versions.mix(DB_CHECK.out.versions)

    /*
        MODULE: Run FastQC
@ -103,6 +102,7 @@ workflow TAXPROFILER {
        SUBWORKFLOW: PERFORM PREPROCESSING
    */
    if ( params.shortread_clipmerge ) {
+
        ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads
    } else {
        ch_shortreads_preprocessed = INPUT_CHECK.out.fastq
@ -145,8 +145,8 @@ workflow TAXPROFILER {
                    [ meta, reads.flatten() ]
            }
            .branch {
-                // we can't concate files if there is not a second run, we branch
-                // here to separate them out, and mix after
+                // we can't concatenate files if there is not a second run, so we branch
+                // here to separate them out, and mix them back in afterwards for efficiency
                cat: ( it[0]['single_end'] && it[1].size() > 1 ) || ( !it[0]['single_end'] && it[1].size() > 2 )
                skip: true
            }
@ -164,77 +164,11 @@ workflow TAXPROFILER {
    }

    /*
-        COMBINE READS WITH POSSIBLE DATABASES
+        SUBWORKFLOW: PROFILING
    */

-    // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], [ <reads_path>/2612.merged.fastq.gz ], ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
-    ch_input_for_profiling = ch_reads_runmerged
-            .map {
-                meta, reads ->
-                    def meta_new = meta.clone()
-                    pairtype = meta_new['single_end'] ? '_se' : '_pe'
-                    meta_new['id'] =  meta_new['id'] + pairtype
-                    [meta_new, reads]
-            }
-            .combine(DB_CHECK.out.dbs)
-            .branch {
-                malt:    it[2]['tool'] == 'malt'
-                kraken2: it[2]['tool'] == 'kraken2'
-                metaphlan3: it[2]['tool'] == 'metaphlan3'
-                unknown: true
-            }
-
-    /*
-        PREPARE PROFILER INPUT CHANNELS
-    */
-
-    // We groupTuple to have all samples in one channel for MALT as database
-    // loading takes a long time, so we only want to run it once per database
-    // TODO document somewhere we only accept illumina short reads for MALT?
-    ch_input_for_malt =  ch_input_for_profiling.malt
-                            .filter { it[0]['instrument_platform'] == 'ILLUMINA' }
-                            .map {
-                                it ->
-                                    def temp_meta =  [ id: it[2]['db_name']]  + it[2]
-                                    def db = it[3]
-                                    [ temp_meta, it[1], db ]
-                            }
-                            .groupTuple(by: [0,2])
-                            .multiMap {
-                                it ->
-                                    reads: [ it[0], it[1].flatten() ]
-                                    db: it[2]
-                            }
-
-    // We can run Kraken2 one-by-one sample-wise
-    ch_input_for_kraken2 =  ch_input_for_profiling.kraken2
-                            .multiMap {
-                                it ->
-                                    reads: [ it[0] + it[2], it[1] ]
-                                    db: it[3]
-                            }
-
-    ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3
-                            .multiMap {
-                                it ->
-                                    reads: [it[0] + it[2], it[1]]
-                                    db: it[3]
-                            }
-
-    /*
-        MODULE: RUN PROFILING
-    */
-    if ( params.run_malt ) {
-        MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
-    }
-
-    if ( params.run_kraken2 ) {
-        KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db  )
-    }
-
-    if ( params.run_metaphlan3 ) {
-        METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db )
-    }
+    PROFILING ( ch_reads_runmerged, ch_longreads_preprocessed, DB_CHECK.out.dbs )
+    ch_versions = ch_versions.mix( PROFILING.out.versions )

    /*
        MODULE: MultiQC
@ -274,17 +208,8 @@ workflow TAXPROFILER {
        ch_versions = ch_versions.mix(CAT_FASTQ.out.versions)
    }

-    if (params.run_kraken2) {
-        ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])  )
-        ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() )
-    }
+    ch_multiqc_files = ch_multiqc_files.mix( PROFILING.out.mqc )

-    if (params.run_malt) {
-        ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([])  )
-        ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() )
-    }
-
-    // TODO Versions for Karken/MALT not report?
    // TODO create multiQC module for metaphlan
    MULTIQC (
        ch_multiqc_files.collect()