From a0ee82bf43ad79ef94b5b3a8e7f9dc72ae30e18a Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Tue, 12 Jul 2022 11:39:26 +0200
Subject: [PATCH] Add motus/merge and biom support

---
 conf/modules.config                           |  8 +++
 conf/test_motus.config                        |  1 +
 modules.json                                  |  5 +-
 modules/nf-core/modules/motus/merge/main.nf   | 47 +++++++++++++++
 modules/nf-core/modules/motus/merge/meta.yml  | 57 ++++++++++++++++++
 modules/nf-core/modules/motus/profile/main.nf |  2 +-
 nextflow.config                               |  4 ++
 nextflow_schema.json                          | 60 ++++++++++++++++---
 subworkflows/local/profiling.nf               |  1 +
 .../local/standardisation_profiles.nf         | 56 +++++++++++++++++
 subworkflows/local/visualization_krona.nf     |  4 +-
 workflows/taxprofiler.nf                      |  9 +++
 12 files changed, 241 insertions(+), 13 deletions(-)
 create mode 100644 modules/nf-core/modules/motus/merge/main.nf
 create mode 100644 modules/nf-core/modules/motus/merge/meta.yml
 create mode 100644 subworkflows/local/standardisation_profiles.nf

diff --git a/conf/modules.config b/conf/modules.config
index b858ec3..1558a98 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -391,12 +391,20 @@ process {
     }
 
     withName: MOTUS_PROFILE {
+        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
         publishDir = [
             path: { "${params.outdir}/motus/${meta.db_name}" },
             mode: params.publish_dir_mode
         ]
     }
 
+    withName: MOTUS_MERGE {
+        publishDir = [
+            path: { "${params.outdir}/motus/" },
+            mode: params.publish_dir_mode
+        ]
+    }
+
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
         publishDir = [
             path: { "${params.outdir}/pipeline_info" },
diff --git a/conf/test_motus.config b/conf/test_motus.config
index 9d39ad4..1405447 100644
--- a/conf/test_motus.config
+++ b/conf/test_motus.config
@@ -38,4 +38,5 @@ params {
     run_centrifuge                        = false
     run_diamond                           = false
     run_motus                             = true
+    run_profile_standardisation           = true
 }
diff --git a/modules.json b/modules.json
index 1d40748..f98cd62 100644
--- a/modules.json
+++ b/modules.json
@@ -78,8 +78,11 @@
             "minimap2/index": {
                 "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
             },
+            "motus/merge": {
+                "git_sha": "b02e648c221e1da17cb589eefe297e61ec9e9c49"
+            },
             "motus/profile": {
-                "git_sha": "6b960f0e75bbb4d5bd301cd3875fa078d0eab4d1"
+                "git_sha": "b02e648c221e1da17cb589eefe297e61ec9e9c49"
             },
             "multiqc": {
                 "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
diff --git a/modules/nf-core/modules/motus/merge/main.nf b/modules/nf-core/modules/motus/merge/main.nf
new file mode 100644
index 0000000..01ca5a2
--- /dev/null
+++ b/modules/nf-core/modules/motus/merge/main.nf
@@ -0,0 +1,47 @@
+VERSION = '3.0.1'
+
+process MOTUS_MERGE { // Runs `motus merge` to combine motus profile outputs into one table (text or BIOM)
+    label 'process_low'
+
+    conda (params.enable_conda ? "bioconda::motus=3.0.1" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/motus:3.0.1--pyhdfd78af_0':
+        'quay.io/biocontainers/motus:3.0.1--pyhdfd78af_0' }"
+
+    input:
+    path input // motus profile output files (more than one), or a single directory containing them
+    path db // staged so the containerised tool can find the database; upstream motus steps already need it anyway
+    path profile_version_yml, stageAs: 'profile_version.yml' // versions.yml emitted by motus/profile; the reported version is read from it in the script
+    val biom_format // truthy -> write a BIOM table (*.biom); otherwise a text table (*.txt)
+
+    output:
+    path("*.txt") , optional: true, emit: txt
+    path("*.biom"), optional: true, emit: biom
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = 'motus_merged' // fixed basename of the merged table; not derived from any sample meta
+    def cmd_input = input.size() > 1 ? "-i ${input.join(',')}" : input.isDirectory() ? "-d ${input}" : "-i ${input}" // size() > 1 -> comma-joined -i list; a directory -> -d; otherwise a single -i
+    def output = biom_format ? "-B -o ${prefix}.biom" : "-o ${prefix}.txt" // -B is only added when BIOM output is requested
+    """
+    motus \\
+        merge \\
+        -db $db \\
+        ${cmd_input} \\
+        $args \\
+        ${output}
+
+    ## Take version from the mOTUs/profile module output, as cannot reconstruct
+    ## version without having database staged in this directory.
+    VERSION=\$(cat ${profile_version_yml} | grep '/*motus:.*' | sed 's/.*otus: //g')
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        motus: \$VERSION
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/modules/motus/merge/meta.yml b/modules/nf-core/modules/motus/merge/meta.yml
new file mode 100644
index 0000000..c9c7711
--- /dev/null
+++ b/modules/nf-core/modules/motus/merge/meta.yml
@@ -0,0 +1,57 @@
+name: "motus_merge"
+description: Merges multiple mOTUs profiles into a single OTU table
+keywords:
+  - classify
+  - metagenomics
+  - fastq
+  - taxonomic profiling
+  - merging
+  - merge
+  - otu table
+tools:
+  - "motus":
+      description: "Marker gene-based OTU (mOTU) profiling"
+      homepage: "https://motu-tool.org/"
+      documentation: "https://github.com/motu-tool/mOTUs/wiki"
+      tool_dev_url: "https://github.com/motu-tool/mOTUs"
+      doi: "10.1038/s41467-019-08844-4"
+      licence: "['GPL v3']"
+
+input:
+  - input:
+      type: file
+      description: |
+        List of output files (more than one) from motus profile,
+        or a single directory containing motus output files.
+  - db:
+      type: directory
+      description: |
+        mOTUs database downloaded by `motus downloadDB`
+      pattern: "db_mOTU/"
+  - profile_version_yml:
+      type: file
+      description: |
+        A single versions.yml file output from motus/profile. motus/merge cannot reconstruct
+        this itself without having the motus database present and configured with the tool
+        so here we take it from what is already reported by the upstream module.
+      pattern: "versions.yml"
+  - biom_format:
+      type: boolean
+      description: Whether to save output OTU table in biom format
+
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - txt:
+      type: file
+      description: OTU table in txt format, if BIOM format not requested
+      pattern: "*.txt"
+  - biom:
+      type: file
+      description: OTU table in biom format, if BIOM format requested
+      pattern: "*.biom"
+
+authors:
+  - "@jfy133"
diff --git a/modules/nf-core/modules/motus/profile/main.nf b/modules/nf-core/modules/motus/profile/main.nf
index 6a1acd3..2747984 100644
--- a/modules/nf-core/modules/motus/profile/main.nf
+++ b/modules/nf-core/modules/motus/profile/main.nf
@@ -48,7 +48,7 @@ process MOTUS_PROFILE {
     fi
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        mOTUs: \$VERSION
+        motus: \$VERSION
     END_VERSIONS
     """
 }
diff --git a/nextflow.config b/nextflow.config
index 7160d0f..58c9254 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -132,6 +132,10 @@ params {
     // krona
     run_krona                  = false
     krona_taxonomy_directory   = null
+
+    // profile standardisation
+    run_profile_standardisation = false
+    generate_biom_output                 = false
 }
 
 // Load base.config by default for all pipelines
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 4eec889..28050ba 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,7 +10,10 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir"],
+            "required": [
+                "input",
+                "outdir"
+            ],
             "properties": {
                 "input": {
                     "type": "string",
@@ -52,7 +55,8 @@
                     "type": "string",
                     "description": "Name of iGenomes reference.",
                     "fa_icon": "fas fa-book",
-                    "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details."
+                    "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details.",
+                    "hidden": true
                 },
                 "igenomes_base": {
                     "type": "string",
@@ -173,7 +177,14 @@
                     "description": "Method used to save pipeline results to output directory.",
                     "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                     "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
+                        "symlink",
+                        "rellink",
+                        "link",
+                        "copy",
+                        "copyNoFollow",
+                        "move"
+                    ],
                     "hidden": true
                 },
                 "email_on_fail": {
@@ -287,7 +298,10 @@
         "shortread_qc_tool": {
             "type": "string",
             "default": "fastp",
-            "enum": ["fastp", "adapterremoval"]
+            "enum": [
+                "fastp",
+                "adapterremoval"
+            ]
         },
         "shortread_qc_skipadaptertrim": {
             "type": "boolean"
@@ -313,7 +327,11 @@
         "shortread_complexityfilter_tool": {
             "type": "string",
             "default": "bbduk",
-            "enum": ["bbduk", "prinseqplusplus", "fastp"]
+            "enum": [
+                "bbduk",
+                "prinseqplusplus",
+                "fastp"
+            ]
         },
         "shortread_complexityfilter_bbduk_windowsize": {
             "type": "integer",
@@ -329,7 +347,10 @@
         "shortread_complexityfilter_prinseqplusplus_mode": {
             "type": "string",
             "default": "entropy",
-            "enum": ["entropy", "dust"]
+            "enum": [
+                "entropy",
+                "dust"
+            ]
         },
         "shortread_complexityfilter_prinseqplusplus_dustscore": {
             "type": "number",
@@ -385,7 +406,14 @@
         "kaiju_taxon_name": {
             "type": "string",
             "default": "species",
-            "enum": ["phylum", "class", "order", "family", "genus", "species"]
+            "enum": [
+                "phylum",
+                "class",
+                "order",
+                "family",
+                "genus",
+                "species"
+            ]
         },
         "run_diamond": {
             "type": "boolean"
@@ -393,7 +421,15 @@
         "diamond_output_format": {
             "type": "string",
             "default": "tsv",
-            "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"]
+            "enum": [
+                "blast",
+                "xml",
+                "txt",
+                "daa",
+                "sam",
+                "tsv",
+                "paf"
+            ]
         },
         "longread_hostremoval_index": {
             "type": "string",
@@ -444,7 +480,13 @@
         },
         "krona_taxonomy_directory": {
             "type": "string",
-            "default": null
+            "default": "None"
+        },
+        "run_profile_standardisation": {
+            "type": "boolean"
+        },
+        "generate_biom_output": {
+            "type": "boolean"
         }
     }
 }
diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index 60963c9..68f8dcc 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -234,5 +234,6 @@ workflow PROFILING {
     classifications = ch_raw_classifications
     profiles        = ch_raw_profiles    // channel: [ val(meta), [ reads ] ] - should be text files or biom
     versions        = ch_versions          // channel: [ versions.yml ]
+    motu_version    = MOTUS_PROFILE.out.versions.first()
     mqc             = ch_multiqc_files
 }
diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf
new file mode 100644
index 0000000..92ceb16
--- /dev/null
+++ b/subworkflows/local/standardisation_profiles.nf
@@ -0,0 +1,56 @@
+//
+// Standardise and aggregate taxonomic profiles across samples
+//
+
+include { MOTUS_MERGE } from '../../modules/nf-core/modules/motus/merge/main'
+
+workflow STANDARDISATION_PROFILES {
+    take:
+    classifications   // classification outputs; branched below but not consumed by any step yet
+    profiles          // profile outputs; element [0] is a meta map carrying a 'tool' key
+    databases         // database entries; element [0] is a meta map carrying a 'tool' key
+    motu_version      // versions.yml from motus/profile, handed through to MOTUS_MERGE
+
+    main:
+    ch_versions            = Channel.empty()
+    ch_standardised_tables = Channel.empty()
+
+    /*
+        Route every incoming item by the tool recorded in its meta map
+    */
+    ch_input_profiles = profiles
+        .branch { entry ->
+            motus: entry[0]['tool'] == 'motus'
+            unknown: true
+        }
+
+    ch_input_classifications = classifications
+        .branch { entry ->
+            unknown: true
+        }
+
+    ch_input_databases = databases
+        .branch { entry ->
+            motus: entry[0]['tool'] == 'motus'
+            unknown: true
+        }
+
+    /*
+        Standardise and aggregate
+    */
+
+    // mOTUs ships a 'single' database and custom ones cannot be created, so the db
+    // information is dropped and the merged table goes to the root mOTUs results directory.
+    ch_motus_profiles = ch_input_profiles.motus.map{it[1]}.collect()
+    ch_motus_database = ch_input_databases.motus.map{it[1]}
+    MOTUS_MERGE ( ch_motus_profiles, ch_motus_database, motu_version, params.generate_biom_output )
+
+    // Forward only the output channel that matches the requested format
+    ch_motus_merged        = params.generate_biom_output ? MOTUS_MERGE.out.biom : MOTUS_MERGE.out.txt
+    ch_standardised_tables = ch_standardised_tables.mix ( ch_motus_merged )
+    ch_versions            = ch_versions.mix( MOTUS_MERGE.out.versions )
+
+    emit:
+    tables   = ch_standardised_tables
+    versions = ch_versions
+}
diff --git a/subworkflows/local/visualization_krona.nf b/subworkflows/local/visualization_krona.nf
index 7a94fc6..397251f 100644
--- a/subworkflows/local/visualization_krona.nf
+++ b/subworkflows/local/visualization_krona.nf
@@ -78,7 +78,7 @@ workflow VISUALIZATION_KRONA {
     ch_krona_text_for_import = ch_cleaned_krona_text
         .map{[[id: it[0]['db_name'], tool: it[0]['tool']], it[1]]}
         .groupTuple()
-        .dump(tag: "text")
+
     KRONA_KTIMPORTTEXT( ch_krona_text_for_import )
     ch_krona_html = ch_krona_html.mix( KRONA_KTIMPORTTEXT.out.html )
     ch_versions = ch_versions.mix( KRONA_KTIMPORTTEXT.out.versions.first() )
@@ -92,7 +92,7 @@ workflow VISUALIZATION_KRONA {
         ch_krona_taxonomy_for_input = GUNZIP.out.gunzip
             .map{[[id: it[0]['db_name'], tool: it[0]['tool']], it[1]]}
             .groupTuple()
-            .dump(tag: "taxonomy")
+
         KRONA_KTIMPORTTAXONOMY ( ch_krona_taxonomy_for_input, file(params.krona_taxonomy_directory, checkExists: true) )
         ch_krona_html.mix( KRONA_KTIMPORTTAXONOMY.out.html )
         ch_versions = ch_versions.mix( MEGAN_RMA2INFO.out.versions.first() )
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 2037649..e382e05 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -65,6 +65,7 @@ include { LONGREAD_HOSTREMOVAL          } from '../subworkflows/local/longread_h
 include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering'
 include { PROFILING                     } from '../subworkflows/local/profiling'
 include { VISUALIZATION_KRONA           } from '../subworkflows/local/visualization_krona'
+include { STANDARDISATION_PROFILES      } from '../subworkflows/local/standardisation_profiles'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -220,6 +221,14 @@ workflow TAXPROFILER {
         ch_versions = ch_versions.mix( VISUALIZATION_KRONA.out.versions )
     }
 
+    /*
+        SUBWORKFLOW: PROFILING STANDARDISATION
+    */
+    if ( params.run_profile_standardisation ) {
+        STANDARDISATION_PROFILES ( PROFILING.out.classifications, PROFILING.out.profiles, DB_CHECK.out.dbs, PROFILING.out.motu_version )
+        ch_versions = ch_versions.mix( STANDARDISATION_PROFILES.out.versions )
+    }
+
     /*
         MODULE: MultiQC
     */