
Merge pull request #241 from genomic-medicine-sweden/add_taxpasta_merge

Add taxpasta_merge to taxprofiler
Sofia Stamouli 2023-02-20 13:15:30 +01:00 committed by GitHub
commit 599c8d3592
9 changed files with 180 additions and 6 deletions


@@ -534,7 +534,7 @@ process {
}
withName: MOTUS_MERGE {
ext.args = { params.generate_biom_output ? "-B" : "" }
ext.args = { params.standardisation_motus_generatebiom ? "-B" : "" }
ext.prefix = { "motus_${meta.id}_combined_reports" }
publishDir = [
path: { "${params.outdir}/motus/" },
@@ -542,6 +542,15 @@
]
}
withName: TAXPASTA_MERGE {
ext.args = { "-p ${meta.tool} -o ${meta.tool}_${meta.id}.${params.standardisation_taxpasta_format}" }
publishDir = [
path: { "${params.outdir}/taxpasta/" },
mode: params.publish_dir_mode,
pattern: '*.{tsv,csv,arrow,parquet,biom}'
]
}
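// Editorial note (illustrative, not part of the commit): for a hypothetical kraken2 group with
// meta.tool == 'kraken2', meta.id == 'db1' and the default params.standardisation_taxpasta_format = 'tsv',
// the ext.args closure above resolves to:
//   -p kraken2 -o kraken2_db1.tsv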
withName: CUSTOM_DUMPSOFTWAREVERSIONS {
publishDir = [
path: { "${params.outdir}/pipeline_info" },


@@ -44,7 +44,7 @@ params {
malt_save_reads = true
kraken2_save_reads = true
centrifuge_save_reads = true
diamond_save_reads = true
run_profile_standardisation = true
}
process {


@@ -33,6 +33,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [MALT](#malt) - Sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics
- [MetaPhlAn3](#metaphlan3) - Genome-level marker gene based taxonomic classifier
- [mOTUs](#motus) - Tool for marker gene-based OTU (mOTU) profiling.
- [TAXPASTA](#taxpasta) - Tool to standardise taxonomic profiles as well as merge profiles across samples from the same database and classifier/profiler.
- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
@@ -435,6 +436,35 @@ Krona charts will be generated by the pipeline for supported tools (Kraken2, Cen
The resulting HTML files can be loaded into your web browser for exploration. Each file will have a dropdown to allow you to switch between each sample aligned against the given database of the tool.
### TAXPASTA
[TAXPASTA](https://github.com/taxprofiler/taxpasta) standardises and merges two or more taxonomic profiles across samples into a single table. It supports multiple taxonomic classifiers, simplifying the comparison of classification results between tools and databases.
<details markdown="1">
<summary>Output files</summary>
- `taxpasta`
- `<tool>_<database>*.{tsv,csv,arrow,parquet,biom}`: Standardised taxon table containing multiple samples. The default output format is `tsv`. The first column contains the taxonomy ID and the remaining columns contain the read counts for each sample.
</details>
These files will likely be the most useful for comparing differences in classification between tools or for building consensuses, with the caveat that they contain slightly less information than the original output of each tool (which may include non-standard information, e.g. taxonomic rank, percentage of hits, abundance estimations).
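For orientation, a merged profile (illustrative values only; hypothetical samples classified with kraken2 against a single database) follows the layout below, with one row per taxon and one count column per sample:

| taxonomy_id | sample1 | sample2 |
| ----------- | ------- | ------- |
| 2           | 9235    | 10262   |
| 1773        | 12      | 0       |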
The following report files are used for the taxpasta step:
- Bracken: `<sample>_<db_name>.tsv` Taxpasta uses the `new_est_reads` column for the standardised profile.
- Centrifuge: `<sample_id>.centrifuge.txt` Taxpasta uses the `direct_assigned_reads` column for the standardised profile.
- Diamond: `<sample_id>` Taxpasta summarises the number of reads per NCBI taxonomy ID for the standardised profile.
- Kaiju: `<sample_id>_<db_name>.kaijutable.txt` Taxpasta uses the `reads` column from kaiju2table for the standardised profile.
- KrakenUniq: `<sample_id>_<db_name>.report.txt` Taxpasta uses the `reads` column for the standardised profile.
- Kraken2: `<sample_id>_<db_name>.report.txt` Taxpasta uses the `direct_assigned_reads` column for the standardised profile.
- MALT: `<sample_id>.txt.gz` Taxpasta uses the `count` (second) column from the output of MEGAN6's rma2info for the standardised profile.
- MetaPhlAn3: `<sample_id>_profile.txt` Taxpasta uses the `relative_abundance` column multiplied by a fixed number to yield an integer for the standardised profile.
- mOTUs: `<sample_id>.out` Taxpasta uses the `read_count` column for the standardised profile.
> ⚠️ Please be aware that the values in each tool's standardised profile _may not_ be directly comparable between tools. Some report raw read counts, whereas others report abundance information. Always refer to the list above to see which information is used for each tool.
### MultiQC
<details markdown="1">


@@ -207,6 +207,11 @@
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"taxpasta/merge": {
"branch": "master",
"git_sha": "74ab450ed05e034d049c00f6e2853de2c31594b4",
"installed_by": ["modules"]
},
"untar": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",

modules/nf-core/taxpasta/merge/main.nf (new file, 47 lines)

@@ -0,0 +1,47 @@
process TAXPASTA_MERGE {
tag "$meta.id"
label 'process_single'
conda "bioconda::taxpasta=0.1.1"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/taxpasta:0.1.1--pyhdfd78af_0':
'quay.io/biocontainers/taxpasta:0.1.1--pyhdfd78af_0' }"
input:
tuple val(meta), path(profiles)
path taxonomy
path samplesheet
output:
tuple val(meta), path("*.{tsv,csv,arrow,parquet,biom}"), emit: merged_profiles
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
// N.B.: Taxpasta requires a --profiler option and will fail without it.
// This must be specified via a `nextflow.config` or `modules.config`, for
// example, as "--profiler kraken2". Additionally, it requires a --output
// option with the output file name. The desired format will be parsed from
// the name and should correspond to the output pattern specified above,
// e.g., "--output ${task.ext.prefix}.tsv".
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def taxonomy_option = taxonomy ? "--taxonomy ${taxonomy}" : ''
def samplesheet_input = samplesheet ? "-s ${samplesheet}" : ''
"""
taxpasta merge \\
$args \\
$taxonomy_option \\
$samplesheet_input \\
$profiles
cat <<-END_VERSIONS > versions.yml
"${task.process}":
taxpasta: \$(taxpasta --version)
END_VERSIONS
"""
}
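An illustrative expansion of the script block above (hypothetical profile file names; taxonomy and samplesheet left empty, as in the subworkflow call further down): with ext.args set to "-p kraken2 -o kraken2_db1.tsv" as in the modules.config hunk above, the command run for a kraken2/db1 group would be roughly

taxpasta merge \
    -p kraken2 -o kraken2_db1.tsv \
    sample1_db1.kraken2.report.txt sample2_db1.kraken2.report.txt

followed by writing the taxpasta version into versions.yml.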

modules/nf-core/taxpasta/merge/meta.yml (new file, 58 lines)

@@ -0,0 +1,58 @@
name: "taxpasta_merge"
description: Standardise and merge two or more taxonomic profiles into a single table
keywords:
- taxonomic profile
- standardise
- standardisation
- metagenomics
- taxonomic profiling
- otu tables
- taxon tables
tools:
- "taxpasta":
description: "TAXonomic Profile Aggregation and STAndardisation"
homepage: "https://taxpasta.readthedocs.io/"
documentation: "https://taxpasta.readthedocs.io/"
tool_dev_url: "https://github.com/taxprofiler/taxpasta"
doi: ""
licence: "['Apache-2.0']"
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- profiles:
type: file
description: A list of taxonomic profiler output files (typically in text format, mandatory)
pattern: "*.{tsv,csv,arrow,parquet,biom}"
- samplesheet:
type: file
description:
A samplesheet describing the sample name and a filepath to a taxonomic abundance profile; the filepath must be
relative to the work environment. The profiles must still be provided even if a samplesheet is given as an argument (optional)
pattern: "*.{tsv,csv,ods,xlsx,arrow,parquet}"
- taxonomy:
type: directory
description: Directory containing at a minimum nodes.dmp and names.dmp files (optional)
pattern: "*/"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- merged_profiles:
type: file
description: Output file containing the standardised profiles of multiple samples merged into a single table.
pattern: "*.{tsv,csv,ods,xlsx,arrow,parquet,biom}"
authors:
- "@sofstam"
- "@jfy133"


@@ -155,8 +155,9 @@ params {
krona_taxonomy_directory = null
// profile standardisation
run_profile_standardisation = false
generate_biom_output = false
run_profile_standardisation = false
standardisation_taxpasta_format = 'tsv'
standardisation_motus_generatebiom = false
}
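// Editorial sketch (not part of this commit): with the defaults above, taxpasta output can be enabled
// at runtime, e.g. with --run_profile_standardisation --standardisation_taxpasta_format csv on the
// command line, or via a custom config passed to Nextflow with -c:
//   params {
//       run_profile_standardisation    = true
//       standardisation_taxpasta_format = 'csv'
//   }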
// Load base.config by default for all pipelines


@@ -491,11 +491,11 @@
"description": "Turn on standardisation of taxon tables across profilers",
"help_text": "Turns on standardisation of output OTU tables across all tools; each into a TSV format following the following scheme:\n\n|TAXON | SAMPLE_A | SAMPLE_B |\n|-------------|----------------|-----------------|\n| taxon_a | 32 | 123 |\n| taxon_b | 1 | 5 |\n\nThis currently only is generated for mOTUs."
},
"generate_biom_output": {
"standardisation_motus_generatebiom": {
"type": "boolean",
"fa_icon": "fas fa-toggle-on",
"description": "Turn on generation of BIOM output (currently only applies to mOTUs)",
"help_text": "Turn on the saving of the taxonomic output in BIOM format (`.biom`) in the results directory of your pipeline run, instead of the default TSV format.\n\nNote this file is from the output of the `motus merge` command.\n\n> Modifies tool parameter(s):\n> - `-B -o`"
"help_text": "Turn on the saving of the taxonomic output in BIOM format (`.biom`) in the results directory of your pipeline run, instead of the default TSV format.\\n\\nNote this file is from the output of the `motus merge` command.\\n\\n> Modifies tool parameter(s):\\n> - `-B -o`"
},
"run_krona": {
"type": "boolean",
@@ -509,6 +509,13 @@
"fa_icon": "fas fa-folder-open",
"description": "Specify path to krona taxonomy directories (required for MALT krona plots)",
"help_text": "Specify a path to a Krona taxonomy database directory (i.e. a directory containing a krona generated `.tab` file).\n\nThis is only required for generating Krona plots of MALT output.\n\nNote this taxonomy database must be downloaded and generated with the `updateTaxonomy.sh` script from the krona-tools package."
},
"standardisation_taxpasta_format": {
"type": "string",
"default": "tsv",
"fa_icon": "fas fa-file",
"description": "The desired output format.",
"enum": ["tsv", "csv", "arrow", "parquet", "biom"]
}
},
"fa_icon": "fas fa-chart-line"


@@ -8,6 +8,7 @@ include { KRAKENTOOLS_COMBINEKREPORTS as KRAKENTOOLS_COMBINEKREPORTS_KRAKEN
include { KRAKENTOOLS_COMBINEKREPORTS as KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE } from '../../modules/nf-core/krakentools/combinekreports/main'
include { METAPHLAN3_MERGEMETAPHLANTABLES } from '../../modules/nf-core/metaphlan3/mergemetaphlantables/main'
include { MOTUS_MERGE } from '../../modules/nf-core/motus/merge/main'
include { TAXPASTA_MERGE } from '../../modules/nf-core/taxpasta/merge/main'
workflow STANDARDISATION_PROFILES {
take:
@@ -21,6 +22,20 @@ workflow STANDARDISATION_PROFILES {
ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty()
//Taxpasta standardisation
ch_input_for_taxpasta = profiles
.map {
meta, profile ->
def meta_new = [:]
meta_new.id = meta.db_name
meta_new.tool = meta.tool == 'metaphlan3' ? 'metaphlan' : meta.tool == 'malt' ? 'megan6' : meta.tool
[meta_new, profile]
}
.groupTuple ()
TAXPASTA_MERGE (ch_input_for_taxpasta, [], [])
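// Editorial sketch of the remap above, with hypothetical metas: profiles from the same database/profiler
// pair are grouped into one taxpasta input, and 'metaphlan3'/'malt' are renamed to the profiler names
// taxpasta expects ('metaphlan'/'megan6'). For example,
//   [ [id:'sample1', db_name:'db1', tool:'kraken2'], sample1.kraken2.report.txt ]
//   [ [id:'sample2', db_name:'db1', tool:'kraken2'], sample2.kraken2.report.txt ]
// becomes, after map + groupTuple:
//   [ [id:'db1', tool:'kraken2'], [sample1.kraken2.report.txt, sample2.kraken2.report.txt] ]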
/*
Split profile results based on tool they come from
*/
@@ -74,6 +89,7 @@ workflow STANDARDISATION_PROFILES {
[[id:it[0]], it[1]]
}
KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE ( ch_profiles_for_centrifuge )
ch_standardised_tables = ch_standardised_tables.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.txt )
ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.txt )
@@ -149,6 +165,7 @@ workflow STANDARDISATION_PROFILES {
emit:
tables = ch_standardised_tables
taxpasta = TAXPASTA_MERGE.out.merged_profiles
versions = ch_versions
mqc = ch_multiqc_files
}