
Merge pull request #241 from genomic-medicine-sweden/add_taxpasta_merge

Add taxpasta_merge to taxprofiler
Sofia Stamouli 2023-02-20 13:15:30 +01:00 committed by GitHub
commit 599c8d3592
9 changed files with 180 additions and 6 deletions


@@ -534,7 +534,7 @@ process {
}
withName: MOTUS_MERGE {
ext.args = { params.generate_biom_output ? "-B" : "" }
ext.args = { params.standardisation_motus_generatebiom ? "-B" : "" }
ext.prefix = { "motus_${meta.id}_combined_reports" }
publishDir = [
path: { "${params.outdir}/motus/" },
@@ -542,6 +542,15 @@
]
}
withName: TAXPASTA_MERGE {
ext.args = { "-p ${meta.tool} -o ${meta.tool}_${meta.id}.${params.standardisation_taxpasta_format}" }
publishDir = [
path: { "${params.outdir}/taxpasta/" },
mode: params.publish_dir_mode,
pattern: '*.{tsv,csv,arrow,parquet,biom}'
]
}
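Note: the `ext.args` closure above supplies taxpasta's mandatory profiler (`-p`) and output (`-o`) options from the grouped metadata. A minimal Groovy sketch of how it resolves, assuming an illustrative kraken2 group from a database named `testdb` and the default `tsv` format (the names are not taken from the PR):

```groovy
// Illustrative only: evaluate the same closure outside Nextflow.
def params = [standardisation_taxpasta_format: 'tsv']  // default format
def meta   = [id: 'testdb', tool: 'kraken2']            // hypothetical grouped metadata
def args   = { "-p ${meta.tool} -o ${meta.tool}_${meta.id}.${params.standardisation_taxpasta_format}" }
assert args() == '-p kraken2 -o kraken2_testdb.tsv'
```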
withName: CUSTOM_DUMPSOFTWAREVERSIONS {
publishDir = [
path: { "${params.outdir}/pipeline_info" },


@@ -44,7 +44,7 @@ params {
malt_save_reads = true
kraken2_save_reads = true
centrifuge_save_reads = true
diamond_save_reads = true
run_profile_standardisation = true
}
process {


@@ -33,6 +33,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [MALT](#malt) - Sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics
- [MetaPhlAn3](#metaphlan3) - Genome-level marker gene based taxonomic classifier
- [mOTUs](#motus) - Tool for marker gene-based OTU (mOTU) profiling.
- [TAXPASTA](#taxpasta) - Tool to standardise taxonomic profiles as well as merge profiles across samples from the same database and classifier/profiler.
- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
@@ -435,6 +436,35 @@ Krona charts will be generated by the pipeline for supported tools (Kraken2, Cen
The resulting HTML files can be loaded into your web browser for exploration. Each file will have a dropdown to allow you to switch between each sample aligned against the given database of the tool.
### TAXPASTA
[TAXPASTA](https://github.com/taxprofiler/taxpasta) standardises and merges two or more taxonomic profiles across samples into a single table. It supports multiple classifiers, simplifying the comparison of taxonomic classification results between tools and databases.
<details markdown="1">
<summary>Output files</summary>
- `taxpasta`
- `<tool>_<database>*.{tsv,csv,arrow,parquet,biom}`: Standardised taxon table containing multiple samples. The default output format is `tsv`. The first column contains the taxonomy ID and the remaining columns contain the read counts for each sample.
</details>
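For illustration only (the taxon IDs and counts are made up, and the exact column headers may differ), a merged `tsv` for two samples classified against the same database follows the scheme described above:

| taxonomy_id | sample_A | sample_B |
| ----------- | -------- | -------- |
| 2           | 123      | 45       |
| 9606        | 67       | 890      |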
These files will likely be the most useful for comparing classification differences between tools or for building consensuses, with the caveat that they contain slightly less information than the original output of each tool (which may include non-standard information, e.g. taxonomic rank, percentage of hits, abundance estimations).
The following report files are used for the taxpasta step:
- Bracken: `<sample>_<db_name>.tsv` Taxpasta uses the `new_est_reads` column for the standardised profile.
- Centrifuge: `<sample_id>.centrifuge.txt` Taxpasta uses the `direct_assigned_reads` column for the standardised profile.
- Diamond: `<sample_id>` Taxpasta summarises the number of reads per NCBI taxonomy ID for the standardised profile.
- Kaiju: `<sample_id>_<db_name>.kaijutable.txt` Taxpasta uses the `reads` column from the kaiju2table output for the standardised profile.
- KrakenUniq: `<sample_id>_<db_name>.report.txt` Taxpasta uses the `reads` column for the standardised profile.
- Kraken2: `<sample_id>_<db_name>.report.txt` Taxpasta uses the `direct_assigned_reads` column for the standardised profile.
- MALT: `<sample_id>.txt.gz` Taxpasta uses the `count` (second) column from the output of MEGAN6's rma2info for the standardised profile.
- MetaPhlAn3: `<sample_id>_profile.txt` Taxpasta uses the `relative_abundance` column, multiplied by a fixed factor to yield an integer, for the standardised profile.
- mOTUs: `<sample_id>.out` Taxpasta uses the `read_count` column for the standardised profile.
> ⚠️ Please be aware that the outputs of each tool's standardised profile _may not_ be directly comparable between tools. Some may report raw read counts, whereas others may report abundance information. Please always refer to the list above for which information is used for each tool.
### MultiQC
<details markdown="1">


@@ -207,6 +207,11 @@
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
},
"taxpasta/merge": {
"branch": "master",
"git_sha": "74ab450ed05e034d049c00f6e2853de2c31594b4",
"installed_by": ["modules"]
},
"untar": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",

modules/nf-core/taxpasta/merge/main.nf (new file, 47 lines)

@@ -0,0 +1,47 @@
process TAXPASTA_MERGE {
tag "$meta.id"
label 'process_single'
conda "bioconda::taxpasta=0.1.1"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/taxpasta:0.1.1--pyhdfd78af_0':
'quay.io/biocontainers/taxpasta:0.1.1--pyhdfd78af_0' }"
input:
tuple val(meta), path(profiles)
path taxonomy
path samplesheet
output:
tuple val(meta), path("*.{tsv,csv,arrow,parquet,biom}"), emit: merged_profiles
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
// N.B.: Taxpasta requires a --profiler option and will fail without it.
// This must be specified via a `nextflow.config` or `modules.config`, for
// example, as "--profiler kraken2". Additionally, it requires a --output
// option with the output file name. The desired format will be parsed from
// the name and should correspond to the output pattern specified above,
// e.g., "--output ${task.ext.prefix}.tsv".
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def taxonomy_option = taxonomy ? "--taxonomy ${taxonomy}" : ''
def samplesheet_input = samplesheet ? "-s ${samplesheet}" : ''
"""
taxpasta merge \\
$args \\
$taxonomy_option \\
$samplesheet_input \\
$profiles
cat <<-END_VERSIONS > versions.yml
"${task.process}":
taxpasta: \$(taxpasta --version)
END_VERSIONS
"""
}
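As a rough walk-through (not part of the module), the Groovy sketch below shows approximately what the script block interpolates for one grouped input when `ext.args` is set as in `conf/modules.config` above and neither a taxonomy directory nor a samplesheet is provided (matching the `[]` inputs used in the subworkflow further down); the file and database names are illustrative:

```groovy
// Illustrative only: reproduce the command string the script block would render.
def args              = '-p kraken2 -o kraken2_testdb.tsv'   // from ext.args
def taxonomy_option   = ''                                    // no --taxonomy passed
def samplesheet_input = ''                                    // no -s passed
def profiles          = 'sample1_testdb.report.txt sample2_testdb.report.txt'
println "taxpasta merge ${args} ${taxonomy_option} ${samplesheet_input} ${profiles}".replaceAll(/ +/, ' ')
// -> taxpasta merge -p kraken2 -o kraken2_testdb.tsv sample1_testdb.report.txt sample2_testdb.report.txt
```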

modules/nf-core/taxpasta/merge/meta.yml (new file, 58 lines)

@@ -0,0 +1,58 @@
name: "taxpasta_merge"
description: Standardise and merge two or more taxonomic profiles into a single table
keywords:
- taxonomic profile
- standardise
- standardisation
- metagenomics
- taxonomic profiling
- otu tables
- taxon tables
tools:
- "taxpasta":
description: "TAXonomic Profile Aggregation and STAndardisation"
homepage: "https://taxpasta.readthedocs.io/"
documentation: "https://taxpasta.readthedocs.io/"
tool_dev_url: "https://github.com/taxprofiler/taxpasta"
doi: ""
licence: "['Apache-2.0']"
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- profiles:
type: file
description: A list of taxonomic profiler output files (typically in text format, mandatory)
pattern: "*.{tsv,csv,arrow,parquet,biom}"
- samplesheet:
type: file
description:
A samplesheet describing the sample names and file paths to taxonomic abundance profiles; the paths must be
relative to the working environment. The profiles must still be provided as input even when a samplesheet is given (optional)
pattern: "*.{tsv,csv,ods,xlsx,arrow,parquet}"
- taxonomy:
type: directory
description: Directory containing at a minimum nodes.dmp and names.dmp files (optional)
pattern: "*/"
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- merged_profiles:
type: file
description: Output file containing multiple standardised taxonomic profiles combined into a single table
pattern: "*.{tsv,csv,ods,xlsx,arrow,parquet,biom}"
authors:
- "@sofstam"
- "@jfy133"


@@ -155,8 +155,9 @@ params {
krona_taxonomy_directory = null
// profile standardisation
run_profile_standardisation = false
generate_biom_output = false
run_profile_standardisation = false
standardisation_taxpasta_format = 'tsv'
standardisation_motus_generatebiom = false
}
// Load base.config by default for all pipelines


@@ -491,11 +491,11 @@
"description": "Turn on standardisation of taxon tables across profilers",
"help_text": "Turns on standardisation of output OTU tables across all tools; each into a TSV format following the following scheme:\n\n|TAXON | SAMPLE_A | SAMPLE_B |\n|-------------|----------------|-----------------|\n| taxon_a | 32 | 123 |\n| taxon_b | 1 | 5 |\n\nThis currently only is generated for mOTUs."
},
"generate_biom_output": {
"standardisation_motus_generatebiom": {
"type": "boolean",
"fa_icon": "fas fa-toggle-on",
"description": "Turn on generation of BIOM output (currently only applies to mOTUs)",
"help_text": "Turn on the saving of the taxonomic output in BIOM format (`.biom`) in the results directory of your pipeline run, instead of the default TSV format.\n\nNote this file is from the output of the `motus merge` command.\n\n> Modifies tool parameter(s):\n> - `-B -o`"
"help_text": "Turn on the saving of the taxonomic output in BIOM format (`.biom`) in the results directory of your pipeline run, instead of the default TSV format.\\n\\nNote this file is from the output of the `motus merge` command.\\n\\n> Modifies tool parameter(s):\\n> - `-B -o`"
},
"run_krona": {
"type": "boolean",
@@ -509,6 +509,13 @@
"fa_icon": "fas fa-folder-open",
"description": "Specify path to krona taxonomy directories (required for MALT krona plots)",
"help_text": "Specify a path to a Krona taxonomy database directory (i.e. a directory containing a krona generated `.tab` file).\n\nThis is only required for generating Krona plots of MALT output.\n\nNote this taxonomy database must be downloaded and generated with the `updateTaxonomy.sh` script from the krona-tools package."
},
"standardisation_taxpasta_format": {
"type": "string",
"default": "tsv",
"fa_icon": "fas fa-file",
"description": "The desired output format.",
"enum": ["tsv", "csv", "arrow", "parquet", "biom"]
}
},
"fa_icon": "fas fa-chart-line"


@@ -8,6 +8,7 @@ include { KRAKENTOOLS_COMBINEKREPORTS as KRAKENTOOLS_COMBINEKREPORTS_KRAKEN
include { KRAKENTOOLS_COMBINEKREPORTS as KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE } from '../../modules/nf-core/krakentools/combinekreports/main'
include { METAPHLAN3_MERGEMETAPHLANTABLES } from '../../modules/nf-core/metaphlan3/mergemetaphlantables/main'
include { MOTUS_MERGE } from '../../modules/nf-core/motus/merge/main'
include { TAXPASTA_MERGE } from '../../modules/nf-core/taxpasta/merge/main'
workflow STANDARDISATION_PROFILES {
take:
@@ -21,6 +22,20 @@ workflow STANDARDISATION_PROFILES {
ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty()
// Taxpasta standardisation
ch_input_for_taxpasta = profiles
.map {
meta, profile ->
def meta_new = [:]
meta_new.id = meta.db_name
meta_new.tool = meta.tool == 'metaphlan3' ? 'metaphlan' : meta.tool == 'malt' ? 'megan6' : meta.tool
[meta_new, profile]
}
.groupTuple ()
TAXPASTA_MERGE (ch_input_for_taxpasta, [], [])
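An illustrative plain-Groovy sketch (not part of the subworkflow) of the remapping performed by the `.map` above before `groupTuple()`: each profile is re-keyed by its database name, and tool names are translated to the profiler names taxpasta expects (`metaphlan3` becomes `metaphlan`, `malt` becomes `megan6`); the sample and database names below are made up:

```groovy
// Illustrative only: the same meta remapping as a plain closure.
def remap = { Map meta, String profile ->
    def meta_new  = [:]
    meta_new.id   = meta.db_name
    meta_new.tool = meta.tool == 'metaphlan3' ? 'metaphlan' : meta.tool == 'malt' ? 'megan6' : meta.tool
    [meta_new, profile]
}
assert remap([id: 'sample1', db_name: 'db1', tool: 'metaphlan3'], 'sample1_profile.txt') ==
       [[id: 'db1', tool: 'metaphlan'], 'sample1_profile.txt']
assert remap([id: 'sample2', db_name: 'db1', tool: 'kraken2'], 'sample2_db1.report.txt') ==
       [[id: 'db1', tool: 'kraken2'], 'sample2_db1.report.txt']
// groupTuple() then bundles all profiles sharing the same [db_name, tool] key into one TAXPASTA_MERGE call.
```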
/*
Split profile results based on tool they come from
*/
@@ -74,6 +89,7 @@ workflow STANDARDISATION_PROFILES {
[[id:it[0]], it[1]]
}
KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE ( ch_profiles_for_centrifuge )
ch_standardised_tables = ch_standardised_tables.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.txt )
ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.txt )
@@ -149,6 +165,7 @@ workflow STANDARDISATION_PROFILES {
emit:
tables = ch_standardised_tables
taxpasta = TAXPASTA_MERGE.out.merged_profiles
versions = ch_versions
mqc = ch_multiqc_files
}