Merge branch 'dev' into hostremoval

2024-11-26 02:09:54 +00:00 · 2022-04-11 13:36:03 +02:00 · 2022-04-11 13:36:03 +02:00 · 025083061e
commit 025083061e
parent b554aa3e4d fb2f5ed73e
17 changed files with 509 additions and 132 deletions
--- a/README.md
+++ b/README.md
@ -18,7 +18,7 @@
 <!-- TODO nf-core: Write a 1-2 sentence summary of what data the pipeline is for and what it does -->
-**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling against multiple profiling tools and databases and produces standardised output tables.
+**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling with multiple profiling tools against multiple databases, and produces standardised output tables.
 The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
@ -32,20 +32,20 @@ On release, automated continuous integration tests run the pipeline on a full-si
 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
 2. Performs optional read pre-processing
-   - Adapter clipping and merging (short, and nanopore reads)
+   - Adapter clipping and merging (short read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long read: [porechop](https://github.com/rrwick/Porechop))
-   - Low complexity filtering
+   - Low complexity filtering ([bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus))
-   - Host read removal
+   - Host read removal ([BowTie2](http://bowtie-bio.sourceforge.net/bowtie2/))
   - Run merging
-3. Performs taxonomic profiling a choice of:
+3. Performs taxonomic profiling using one or more of:
-   - Kraken2
+   - [Kraken2](https://ccb.jhu.edu/software/kraken2/)
-   - MetaPhlAn3
+   - [MetaPhlAn3](https://huttenhower.sph.harvard.edu/metaphlan/)
-   - MALT
+   - [MALT](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/malt/)
-   - DIAMOND
+   - [DIAMOND](https://github.com/bbuchfink/diamond)
-   - Centrifuge
+   - [Centrifuge](https://ccb.jhu.edu/software/centrifuge/)
-   - Kaiju
+   - [Kaiju](https://kaiju.binf.ku.dk/)
-   - mOTUs
+   - [mOTUs](https://motu-tool.org/)
 4. Perform optional post-processing with:
-   - bracken
+   - [bracken](https://ccb.jhu.edu/software/bracken/)
 5. Standardises output tables
 6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
@ -70,10 +70,8 @@ On release, automated continuous integration tests run the pipeline on a full-si
 4. Start running your own analysis!
   <!-- TODO nf-core: Update the example "typical command" below used to run the pipeline -->
   ```console
-   nextflow run nf-core/taxprofiler --input samplesheet.csv --outdir <OUTDIR> --genome GRCh37 -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
+   nextflow run nf-core/taxprofiler --input samplesheet.csv --databases database.csv --outdir <OUTDIR> --run_<TOOL1> --run_<TOOL2> -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
   ```
 ## Documentation
@ -86,7 +84,7 @@ nf-core/taxprofiler was originally written by nf-core community.
 We thank the following people for their extensive assistance in the development of this pipeline:
-<!-- TODO nf-core: If applicable, make list of people who have also contributed -->
+[James A. Fellows Yates](https://github.com/jfy133), [Moritz Beber](https://github.com/Midnighter), [Lauri Mesilaakso](https://github.com/ljmesi), [Sofia Stamouli](https://github.com/sofsam), [Maxime Borry](https://github.com/maxibor).
 ## Contributions and Support
--- a/assets/samplesheet.csv
+++ b/assets/samplesheet.csv
@ -1,3 +1,6 @@
-sample,fastq_1,fastq_2
+sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
-SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz
+2611,ERR5766174,ILLUMINA,,,/<path>/<to>/fasta/ERX5474930_ERR5766174_1.fa.gz
-SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,
+2612,ERR5766176,ILLUMINA,/<path>/<to>/fastq/ERX5474932_ERR5766176_1.fastq.gz,/<path>/<to>/fastq/ERX5474932_ERR5766176_2.fastq.gz,
 2612,ERR5766180,ILLUMINA,/<path>/<to>/fastq/ERX5474936_ERR5766180_1.fastq.gz,,
 2613,ERR5766181,ILLUMINA,/<path>/<to>/fastq/ERX5474937_ERR5766181_1.fastq.gz,/<path>/<to>/fastq/ERX5474937_ERR5766181_2.fastq.gz,
 ERR3201952,ERR3201952,OXFORD_NANOPORE,/<path>/<to>/fastq/ERR3201952.fastq.gz,,
--- a/conf/modules.config
+++ b/conf/modules.config
@ -185,7 +185,7 @@ process {
        publishDir = [
            path: { "${params.outdir}/malt/${meta.db_name}" },
            mode: params.publish_dir_mode,
-            pattern: '*.{rma6,tab,text,sam,log}'
+            pattern: '*.{log}'
        ]
    }
@ -195,7 +195,7 @@ process {
        publishDir = [
            path: { "${params.outdir}/kraken2/${meta.db_name}" },
            mode: params.publish_dir_mode,
-            pattern: '*.{fastq.gz,txt}'
+            pattern: '*.{txt}'
        ]
    }
@ -208,6 +208,16 @@ process {
        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
    }
    // Centrifuge profiling: only files matching '*.txt' are published, into a
    // per-database sub-directory of the output directory.
    withName: CENTRIFUGE_CENTRIFUGE {
        publishDir = [
            path: { "${params.outdir}/centrifuge/${meta.db_name}" },
            mode: params.publish_dir_mode,
            pattern: '*.txt'
        ]
        // Pass the database entry's `db_params` string through as extra tool arguments.
        ext.args = { "${meta.db_params}" }
        // Output prefix is built from sample id, run accession and database name.
        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
    }
    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
        publishDir = [
            path: { "${params.outdir}/pipeline_info" },
@ -216,4 +226,12 @@ process {
        ]
    }
    // MultiQC: the publish sub-directory name is derived from the process name
    // (last ':'-separated component, text before the first '_', lower-cased).
    withName: MULTIQC {
        publishDir = [
            path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
            mode: params.publish_dir_mode,
            // saveAs yields null for 'versions.yml' and the unchanged name for every other file.
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }
 }
--- a/conf/test.config
+++ b/conf/test.config
@ -22,15 +22,16 @@ params {
    // Input data
    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
    // TODO nf-core: Give any required params for the test so that command line flags are not needed
    input                           = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
    databases                       = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
    run_kraken2                     = true
    run_malt                        = true
    run_metaphlan3                  = true
    run_centrifuge                  = true
    shortread_clipmerge             = true
    longread_clip                   = false
    shortread_complexityfilter      = true
    shortread_clipmerge             = true
    shortread_hostremoval           = true
    shortread_hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
 }
--- a/docs/usage.md
+++ b/docs/usage.md
@ -8,56 +8,90 @@
 <!-- TODO nf-core: Add documentation about anything specific to running your pipeline. For general topics, please point to (and add to) the main nf-core website. -->
-## Samplesheet input
+## Samplesheet inputs
-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below. Furthermore, nf-core/taxprofiler also requires a second comma-separated file of 4 columns with a header row as in the examples below.
 This samplesheet is then specified on the command line as follows:
 ```console
--input '[path to samplesheet file]'
+--input '[path to samplesheet file]' --databases '[path to database sheet file]'
 ```
 ### Multiple runs of the same sample
-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
+The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will process reads before performing profiling. Below is an example for the same sample sequenced across 3 lanes:
 ```console
-sample,fastq_1,fastq_2
+sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
+2612,run1,ILLUMINA,2612_run1_R1.fq.gz,,
-CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
+2612,run2,ILLUMINA,2612_run2_R1.fq.gz,,
-CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
+2612,run3,ILLUMINA,2612_run3_R1.fq.gz,2612_run3_R2.fq.gz,
 ```
 > ⚠️ Runs of the same sample sequenced on Illumina platforms with a combination of single and paired-end data will **not** be run-wise concatenated, unless pair-merging is specified. In the example above, `run3` will be profiled independently of `run1` and `run2` if pairs are not merged.
 ### Full samplesheet
-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
+The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 6 columns to match those defined in the table below.
-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
+A final samplesheet file consisting of both single- and paired-end data, as well as long-read FASTA files, may look something like the one below. This is for 6 samples, where `2612` has been sequenced twice.
 ```console
-sample,fastq_1,fastq_2
+2611,ERR5766174,ILLUMINA,,,/<path>/<to>/fasta/ERX5474930_ERR5766174_1.fa.gz
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
+2612,ERR5766176,ILLUMINA,/<path>/<to>/fastq/ERX5474932_ERR5766176_1.fastq.gz,/<path>/<to>/fastq/ERX5474932_ERR5766176_2.fastq.gz,
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
+2612,ERR5766180,ILLUMINA,/<path>/<to>/fastq/ERX5474936_ERR5766180_1.fastq.gz,,
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
+2613,ERR5766181,ILLUMINA,/<path>/<to>/fastq/ERX5474937_ERR5766181_1.fastq.gz,/<path>/<to>/fastq/ERX5474937_ERR5766181_2.fastq.gz,
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
+ERR3201952,ERR3201952,OXFORD_NANOPORE,/<path>/<to>/fastq/ERR3201952.fastq.gz,,
 TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
 TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
 TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
 ```
 | Column                | Description                                                                                                                                                                                              |
-| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample`  | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
+| `sample`              | Unique sample name [required].                                                                                                                                                                           |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1 or Nanopore reads. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                           |
+| `run_accession`       | Run ID or name unique for each (pairs of) file(s). Can also supply sample name again here, if only a single run was generated [required].                                                                |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
+| `instrument_platform` | Sequencing platform reads generated on, selected from the EBI ENA [controlled vocabulary](https://www.ebi.ac.uk/ena/portal/api/controlledVocab?field=instrument_platform) [required].                    |
 | `fastq_1`             | Path or URL to sequencing reads or Illumina R1 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if data in FASTA is specified. Cannot be combined with `fasta`.    |
 | `fastq_2`             | Path or URL to Illumina R2 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if single end data. Cannot be combined with `fasta`.                                   |
 | `fasta`               | Path or URL to long-reads or contigs in FASTA format. GZipped compressed files accepted. Can be left empty if data in FASTQ is specified. Cannot be combined with `fastq_1` or `fastq_2`.                |
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 ### Full database sheet
 nf-core/taxprofiler supports multiple databases being profiled in parallel for each tool. These databases, and specific parameters for each, can be specified in a 4 column comma-separated sheet.
 > ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user.
 An example database sheet can look as follows, where 4 tools are being used, and `malt` and `kraken2` will be used against two databases each.
 ```console
 tool,db_name,db_params,db_path
 malt,malt85,-id 85,/<path>/<to>/malt/testdb-malt/
 malt,malt95,-id 90,/<path>/<to>/malt/testdb-malt.tar.gz
 kraken2,db1,,/<path>/<to>/kraken2/testdb-kraken2.tar.gz
 kraken2,db2,--quick,/<path>/<to>/kraken2/testdb-kraken2.tar.gz
 centrifuge,db1,,/<path>/<to>/centrifuge/minigut_cf.tar.gz
 metaphlan3,db1,,/<path>/<to>/metaphlan3/metaphlan_database/
 ```
 Column specifications are as follows:
 | Column      | Description                                                                                                                                                                                                                                             |
 | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `tool`      | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required].                                                                                                                                          |
 | `db_name`   | A unique name of the particular database [required].                                                                                                                                                                                                    |
 | `db_params` | Any parameters of the given taxonomic profiler that you wish the taxonomic profiling tool to use when profiling against this specific database. Can be empty to use taxonomic profiler defaults. Must not be surrounded by quotes [required].           |
 | `db_path`   | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required].                   |
 > 💡 You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during profiling.
 ## Running the pipeline
 The typical command for running the pipeline is as follows:
 ```console
-nextflow run nf-core/taxprofiler --input samplesheet.csv --outdir <OUTDIR> --genome GRCh37 -profile docker
+nextflow run nf-core/taxprofiler --input samplesheet.csv --databases databases.csv --outdir <OUTDIR> -profile docker --run_<TOOL1> --run_<TOOL2>
 ```
 This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
@ -66,7 +100,7 @@ Note that the pipeline will create the following files in your working directory
 ```console
 work                # Directory containing the nextflow working files
-<OUTIDR>            # Finished results in specified location (defined with --outdir)
+<OUTDIR>            # Finished results in specified location (defined with --outdir)
 .nextflow_log       # Log file from Nextflow
 # Other nextflow hidden files, eg. history of pipeline runs and old logs.
 ```
--- a/modules.json
+++ b/modules.json
@ -18,6 +18,9 @@
            "cat/fastq": {
                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
            },
            "centrifuge/centrifuge": {
                "git_sha": "d2726fcf75063960f06b36d2229a4c0966614108"
            },
            "custom/dumpsoftwareversions": {
                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
            },
--- a/modules/nf-core/modules/centrifuge/centrifuge/main.nf
+++ b/modules/nf-core/modules/centrifuge/centrifuge/main.nf
@ -0,0 +1,61 @@
 // Classify sequencing reads against a Centrifuge index.
 //
 // Inputs:
 //   meta, reads    - sample metadata map and read file(s); `meta.single_end`
 //                    selects between `-U` and `-1/-2` read arguments
 //   db             - directory searched for the index (found via its `*.1.cf` file)
 //   save_unaligned - if true, unaligned reads are written as gzipped FASTQ
 //   save_aligned   - if true, aligned reads are written as gzipped FASTQ
 //   sam_format     - if true, `--out-fmt 'sam'` is passed to centrifuge
 //
 // Outputs: report and results text files, optional SAM / mapped / unmapped
 // read files, and `versions.yml` with the centrifuge version.
 process CENTRIFUGE_CENTRIFUGE {
    tag "$meta.id"
    label 'process_high'
    conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' :
        'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }"
    input:
    tuple val(meta), path(reads)
    path db
    val save_unaligned
    val save_aligned
    val sam_format
    output:
    tuple val(meta), path('*report.txt')                 , emit: report
    tuple val(meta), path('*results.txt')                , emit: results
    tuple val(meta), path('*.sam')                       , optional: true, emit: sam
    tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz')   , optional: true, emit: fastq_mapped
    tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped
    path "versions.yml"                                  , emit: versions
    when:
    task.ext.when == null || task.ext.when
    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Single-end data is passed with -U, paired-end data with -1/-2.
    def paired = meta.single_end ? "-U ${reads}" :  "-1 ${reads[0]} -2 ${reads[1]}"
    def unaligned = ''
    def aligned = ''
    // Paired-end data uses the "-conc" variants of the un/al options.
    if (meta.single_end) {
        unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : ''
        aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : ''
    } else {
        unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : ''
        aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : ''
    }
    // NOTE(review): even with sam_format the output is still written to
    // <prefix>.results.txt via -S, so nothing here appears to create the
    // optional '*.sam' output declared above - confirm the intended file name.
    def sam_output = sam_format ? "--out-fmt 'sam'" : ''
    """
    ## '-not -name "._*"' ensures macOS "._" metadata files are not picked up as the index
    ## The sed pattern is escaped and anchored so only the trailing ".1.cf" suffix is removed
    db_name=`find -L ${db} -name "*.1.cf" -not -name "._*"  | sed 's/\\.1\\.cf\$//'`
    centrifuge \\
        -x \$db_name \\
        -p $task.cpus \\
        $paired \\
        --report-file ${prefix}.report.txt \\
        -S ${prefix}.results.txt \\
        $unaligned \\
        $aligned \\
        $sam_output \\
        $args
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        centrifuge: \$( centrifuge --version  | sed -n 1p | sed 's/^.*centrifuge-class version //')
    END_VERSIONS
    """
 }
--- a/modules/nf-core/modules/centrifuge/centrifuge/meta.yml
+++ b/modules/nf-core/modules/centrifuge/centrifuge/meta.yml
@ -0,0 +1,66 @@
 name: centrifuge_centrifuge
 description: Classifies metagenomic sequence data
 keywords:
  - classify
  - metagenomics
  - fastq
  - db
 tools:
  - centrifuge:
      description: Centrifuge is a classifier for metagenomic sequences.
      homepage: https://ccb.jhu.edu/software/centrifuge/
      documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml
      doi: 10.1101/gr.210641.116
      licence: ["GPL v3"]
 input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: |
        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
        respectively.
  - db:
      type: directory
      description: Path to directory containing centrifuge database files
  - save_unaligned:
      type: value
      description: If true unmapped fastq files are saved
  - save_aligned:
      type: value
      description: If true mapped fastq files are saved
  # Documents the `val sam_format` input declared by the process in main.nf
  - sam_format:
      type: value
      description: If true centrifuge output is requested in SAM format
 output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - report:
      type: file
      description: |
        File containing a classification summary
      pattern: "*.{report.txt}"
  - results:
      type: file
      description: |
        File containing classification results
      pattern: "*.{results.txt}"
  # Documents the optional `sam` output channel declared by the process in main.nf
  - sam:
      type: file
      description: |
        Optional output file in SAM format
      pattern: "*.sam"
  # Patterns below mirror the output globs used in main.nf (single- and paired-end names)
  - fastq_unmapped:
      type: file
      description: Unmapped fastq files
      pattern: "*.unmapped.fastq{,.1,.2}.gz"
  - fastq_mapped:
      type: file
      description: Mapped fastq files
      pattern: "*.mapped.fastq{,.1,.2}.gz"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
 authors:
  - "@sofstam"
  - "@jfy133"
  - "@sateeshperi"
--- a/nextflow.config
+++ b/nextflow.config
@ -89,6 +89,11 @@ params {
    // kraken2
    run_kraken2                = false
    // centrifuge
    run_centrifuge             = false
    centrifuge_save_unaligned  = false
    centrifuge_save_aligned    = false
    centrifuge_sam_format      = false
    // metaphlan3
    run_metaphlan3             = false
 }
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -281,6 +281,18 @@
        "run_kraken2": {
            "type": "boolean"
        },
        "run_centrifuge": {
            "type": "boolean"
        },
        "centrifuge_save_unaligned": {
            "type": "boolean"
        },
        "centrifuge_save_aligned": {
            "type": "boolean"
        },
        "centrifuge_sam_format": {
            "type": "boolean"
        },
        "run_metaphlan3": {
            "type": "boolean",
            "description": "Enable MetaPhlAn for taxonomic profiling"
--- a/nf-core/modules/centrifuge/centrifuge/main.nf
+++ b/nf-core/modules/centrifuge/centrifuge/main.nf
@ -0,0 +1,61 @@
 // Classify sequencing reads against a Centrifuge index.
 //
 // Inputs:
 //   meta, reads    - sample metadata map and read file(s); `meta.single_end`
 //                    selects between `-U` and `-1/-2` read arguments
 //   db             - directory searched for the index (found via its `*.1.cf` file)
 //   save_unaligned - if true, unaligned reads are written as gzipped FASTQ
 //   save_aligned   - if true, aligned reads are written as gzipped FASTQ
 //   sam_format     - if true, `--out-fmt 'sam'` is passed to centrifuge
 //
 // Outputs: report and results text files, optional SAM / mapped / unmapped
 // read files, and `versions.yml` with the centrifuge version.
 process CENTRIFUGE_CENTRIFUGE {
    tag "$meta.id"
    label 'process_high'
    conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' :
        'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }"
    input:
    tuple val(meta), path(reads)
    path db
    val save_unaligned
    val save_aligned
    val sam_format
    output:
    tuple val(meta), path('*report.txt')                 , emit: report
    tuple val(meta), path('*results.txt')                , emit: results
    tuple val(meta), path('*.sam')                       , optional: true, emit: sam
    tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz')   , optional: true, emit: fastq_mapped
    tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped
    path "versions.yml"                                  , emit: versions
    when:
    task.ext.when == null || task.ext.when
    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Single-end data is passed with -U, paired-end data with -1/-2.
    def paired = meta.single_end ? "-U ${reads}" :  "-1 ${reads[0]} -2 ${reads[1]}"
    def unaligned = ''
    def aligned = ''
    // Paired-end data uses the "-conc" variants of the un/al options.
    if (meta.single_end) {
        unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : ''
        aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : ''
    } else {
        unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : ''
        aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : ''
    }
    // NOTE(review): even with sam_format the output is still written to
    // <prefix>.results.txt via -S, so nothing here appears to create the
    // optional '*.sam' output declared above - confirm the intended file name.
    def sam_output = sam_format ? "--out-fmt 'sam'" : ''
    """
    ## '-not -name "._*"' ensures macOS "._" metadata files are not picked up as the index
    ## The sed pattern is escaped and anchored so only the trailing ".1.cf" suffix is removed
    db_name=`find -L ${db} -name "*.1.cf" -not -name "._*"  | sed 's/\\.1\\.cf\$//'`
    centrifuge \\
        -x \$db_name \\
        -p $task.cpus \\
        $paired \\
        --report-file ${prefix}.report.txt \\
        -S ${prefix}.results.txt \\
        $unaligned \\
        $aligned \\
        $sam_output \\
        $args
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        centrifuge: \$( centrifuge --version  | sed -n 1p | sed 's/^.*centrifuge-class version //')
    END_VERSIONS
    """
 }
--- a/nf-core/modules/centrifuge/centrifuge/meta.yml
+++ b/nf-core/modules/centrifuge/centrifuge/meta.yml
@ -0,0 +1,66 @@
 name: centrifuge_centrifuge
 description: Classifies metagenomic sequence data
 keywords:
  - classify
  - metagenomics
  - fastq
  - db
 tools:
  - centrifuge:
      description: Centrifuge is a classifier for metagenomic sequences.
      homepage: https://ccb.jhu.edu/software/centrifuge/
      documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml
      doi: 10.1101/gr.210641.116
      licence: ["GPL v3"]
 input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: |
        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
        respectively.
  - db:
      type: directory
      description: Path to directory containing centrifuge database files
  - save_unaligned:
      type: value
      description: If true unmapped fastq files are saved
  - save_aligned:
      type: value
      description: If true mapped fastq files are saved
  # Documents the `val sam_format` input declared by the process in main.nf
  - sam_format:
      type: value
      description: If true centrifuge output is requested in SAM format
 output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - report:
      type: file
      description: |
        File containing a classification summary
      pattern: "*.{report.txt}"
  - results:
      type: file
      description: |
        File containing classification results
      pattern: "*.{results.txt}"
  # Documents the optional `sam` output channel declared by the process in main.nf
  - sam:
      type: file
      description: |
        Optional output file in SAM format
      pattern: "*.sam"
  # Patterns below mirror the output globs used in main.nf (single- and paired-end names)
  - fastq_unmapped:
      type: file
      description: Unmapped fastq files
      pattern: "*.unmapped.fastq{,.1,.2}.gz"
  - fastq_mapped:
      type: file
      description: Mapped fastq files
      pattern: "*.mapped.fastq{,.1,.2}.gz"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
 authors:
  - "@sofstam"
  - "@jfy133"
  - "@sateeshperi"
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@ -22,7 +22,7 @@ workflow DB_CHECK {
    ch_dbs_for_untar = parsed_samplesheet
        .branch {
-            untar: it[1].toString().endsWith(".tar.gz") && it[0]['tool'] != 'centrifuge'
+            untar: it[1].toString().endsWith(".tar.gz")
            skip: true
        }
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@ -65,11 +65,10 @@ def create_fastq_channel(LinkedHashMap row) {
            }
         fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
        }
    }
    return fastq_meta
-}
+}// Function to get list of [ meta, fasta ]
 // Function to get list of [ meta, fasta ]
 def create_fasta_channel(LinkedHashMap row) {
    def meta = [:]
    meta.id                     = row.sample
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@ -0,0 +1,119 @@
 //
 // Run profiling
 //
 include { MALT_RUN                    } from '../../modules/nf-core/modules/malt/run/main'
 include { KRAKEN2_KRAKEN2             } from '../../modules/nf-core/modules/kraken2/kraken2/main'
 include { CENTRIFUGE_CENTRIFUGE       } from '../../modules/nf-core/modules/centrifuge/centrifuge/main'
 include { METAPHLAN3                  } from '../../modules/nf-core/modules/metaphlan3/main'
 // Subworkflow that pairs every read set with every database, routes each
 // pairing to the profiler named in the database meta ('tool' key) and runs
 // the profilers that are enabled through the params.run_* switches.
 //
 // take:
 //   shortreads, longreads - read channels; they are mixed into one channel
 //                           before being combined with the databases
 //   databases             - database channel; combined with every read element
 // emit:
 //   versions - first versions output of each profiler that was run
 //   mqc      - files destined for MultiQC (only MALT and Kraken2 add to it)
 workflow PROFILING {
    take:
    shortreads // [ [ meta ], [ reads ] ]
    longreads // [ [ meta ], [ reads ] ]
    databases // [ [ meta ], path ]
    main:
    // Accumulators that every enabled profiler below mixes its outputs into
    ch_versions       = Channel.empty()
    ch_multiqc_files  = Channel.empty()
 /*
        COMBINE READS WITH POSSIBLE DATABASES
    */
    // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
    // After combine each element is [ sample meta, reads, db meta, db path ],
    // so it[2]['tool'] is the tool requested for that database.
    // NOTE: the 'unknown' catch-all branch is not consumed anywhere in this
    // workflow, so elements with an unrecognised tool are silently dropped.
    ch_input_for_profiling = shortreads
            .mix( longreads )
            .combine(databases)
            .branch {
                malt:    it[2]['tool'] == 'malt'
                kraken2: it[2]['tool'] == 'kraken2'
                metaphlan3: it[2]['tool'] == 'metaphlan3'
                centrifuge: it[2]['tool'] == 'centrifuge'
                unknown: true
            }
    /*
        PREPARE PROFILER INPUT CHANNELS
    */
    // Each tool has a slightly different input structure and generally separate
    // input channels for reads vs databases. We restructure the channel tuple
    // for each tool and make liberal use of multiMap to keep reads/databases
    // channel element order in sync with each other
    // Note that these channels are built unconditionally; only the process
    // calls further down are gated by the params.run_* switches.
    // MALT: We groupTuple to have all samples in one channel for MALT as database
    // loading takes a long time, so we only want to run it once per database
    // TODO document somewhere we only accept illumina short reads for MALT?
    // The per-sample meta (it[0]) is dropped here: the grouping key is a new
    // meta whose id is the database name plus the database meta entries, so
    // all reads for the same [ meta, db ] pair end up in one flattened list.
    ch_input_for_malt =  ch_input_for_profiling.malt
                            .filter { it[0]['instrument_platform'] == 'ILLUMINA' }
                            .map {
                                it ->
                                    def temp_meta =  [ id: it[2]['db_name']]  + it[2]
                                    def db = it[3]
                                    [ temp_meta, it[1], db ]
                            }
                            .groupTuple(by: [0,2])
                            .multiMap {
                                it ->
                                    reads: [ it[0], it[1].flatten() ]
                                    db: it[2]
                            }
    // All subsequent tools can easily run on a per-sample basis
    // For each of them the sample meta and the database meta are merged into
    // a single meta (it[0] + it[2]) that travels with the reads, while the
    // database path is emitted on a parallel 'db' channel.
    ch_input_for_kraken2 =  ch_input_for_profiling.kraken2
                            .multiMap {
                                it ->
                                    reads: [ it[0] + it[2], it[1] ]
                                    db: it[3]
                            }
    ch_input_for_centrifuge =  ch_input_for_profiling.centrifuge
                                .multiMap {
                                    it ->
                                        reads: [ it[0] + it[2], it[1] ]
                                        db: it[3]
                                }
    ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3
                            .multiMap {
                                it ->
                                    reads: [it[0] + it[2], it[1]]
                                    db: it[3]
                            }
    /*
        RUN PROFILING
    */
    // Each profiler only runs when its params.run_* flag is set; its versions
    // output (first element only) is mixed into ch_versions.
    if ( params.run_malt ) {
        MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
        ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([])  )
        ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() )
    }
    if ( params.run_kraken2 ) {
        KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db  )
        ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])  )
        ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() )
    }
    // Centrifuge and MetaPhlAn3 do not contribute anything to ch_multiqc_files here
    if ( params.run_centrifuge ) {
        CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format  )
        ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() )
    }
    if ( params.run_metaphlan3 ) {
        METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db )
        ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() )
    }
    emit:
    // TODO work out if there is enough standardisation of output to export as one?
    //output    = ch_filtered_reads    // channel: [ val(meta), [ reads ] ]
    versions = ch_versions          // channel: [ versions.yml ]
    mqc      = ch_multiqc_files
 }
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -54,6 +54,7 @@ include { SHORTREAD_PREPROCESSING       } from '../subworkflows/local/shortread_
 include { LONGREAD_PREPROCESSING        } from '../subworkflows/local/longread_preprocessing'
 include { SHORTREAD_HOSTREMOVAL         } from '../subworkflows/local/shortread_hostremoval'
 include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering'
 include { PROFILING                     } from '../subworkflows/local/profiling'
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -69,9 +70,6 @@ include { MULTIQC                     } from '../modules/nf-core/modules/multiqc
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main'
 include { CAT_FASTQ                   } from '../modules/nf-core/modules/cat/fastq/main'
 include { MALT_RUN                    } from '../modules/nf-core/modules/malt/run/main'
 include { KRAKEN2_KRAKEN2             } from '../modules/nf-core/modules/kraken2/kraken2/main'
 include { METAPHLAN3                  } from '../modules/nf-core/modules/metaphlan3/main'
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -97,6 +95,7 @@ workflow TAXPROFILER {
    DB_CHECK (
        ch_databases
    )
    ch_versions = ch_versions.mix(DB_CHECK.out.versions)
    /*
        MODULE: Run FastQC
@ -113,6 +112,7 @@ workflow TAXPROFILER {
        SUBWORKFLOW: PERFORM PREPROCESSING
    */
    if ( params.shortread_clipmerge ) {
        ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads
    } else {
        ch_shortreads_preprocessed = INPUT_CHECK.out.fastq
@ -148,71 +148,11 @@ workflow TAXPROFILER {
    }
    /*
-        COMBINE READS WITH POSSIBLE DATABASES
+        SUBWORKFLOW: PROFILING
    */
-    // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
+    PROFILING ( ch_shortreads_hostremoved, ch_longreads_preprocessed, DB_CHECK.out.dbs )
-    ch_input_for_profiling = ch_shortreads_hostremoved
+    ch_versions = ch_versions.mix( PROFILING.out.versions )
            .mix( ch_longreads_preprocessed )
            .combine(DB_CHECK.out.dbs)
            .branch {
                malt:    it[2]['tool'] == 'malt'
                kraken2: it[2]['tool'] == 'kraken2'
                metaphlan3: it[2]['tool'] == 'metaphlan3'
                unknown: true
            }
    /*
        PREPARE PROFILER INPUT CHANNELS
    */
    // We groupTuple to have all samples in one channel for MALT as database
    // loading takes a long time, so we only want to run it once per database
    // TODO document somewhere we only accept illumina short reads for MALT?
    ch_input_for_malt =  ch_input_for_profiling.malt
                            .filter { it[0]['instrument_platform'] == 'ILLUMINA' }
                            .map {
                                it ->
                                    def temp_meta =  [ id: it[2]['db_name']]  + it[2]
                                    def db = it[3]
                                    [ temp_meta, it[1], db ]
                            }
                            .groupTuple(by: [0,2])
                            .multiMap {
                                it ->
                                    reads: [ it[0], it[1].flatten() ]
                                    db: it[2]
                            }
    // We can run Kraken2 one-by-one sample-wise
    ch_input_for_kraken2 =  ch_input_for_profiling.kraken2
                            .multiMap {
                                it ->
                                    reads: [ it[0] + it[2], it[1] ]
                                    db: it[3]
                            }
    ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3
                            .multiMap {
                                it ->
                                    reads: [it[0] + it[2], it[1]]
                                    db: it[3]
                            }
    /*
        MODULE: RUN PROFILING
    */
    if ( params.run_malt ) {
        MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
    }
    if ( params.run_kraken2 ) {
        KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db  )
    }
    if ( params.run_metaphlan3 ) {
        METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db )
    }
    /*
        MODULE: MultiQC
@ -252,17 +192,8 @@ workflow TAXPROFILER {
        ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_HOSTREMOVAL.out.mqc.collect{it[1]}.ifEmpty([]))
    }
-    if (params.run_kraken2) {
+    ch_multiqc_files = ch_multiqc_files.mix( PROFILING.out.mqc )
        ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])  )
        ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() )
    }
    if (params.run_malt) {
        ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([])  )
        ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() )
    }
    // TODO Versions for Karken/MALT not report?
    // TODO create multiQC module for metaphlan
    MULTIQC (
        ch_multiqc_files.collect()