Merge pull request #29 from nf-core/add-metaphlan

Add metaphlan3
2024-11-25 18:19:55 +00:00 · 2022-04-03 09:08:01 +02:00 · 2022-04-03 09:08:01 +02:00 · fe628b3578
commit fe628b3578
parent 1dfbcacf68 f5baf910be
13 changed files with 263 additions and 46 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -46,6 +46,14 @@ jobs:
          wget -qO- get.nextflow.io | bash
          sudo mv nextflow /usr/local/bin/

+      - name: Show current locale
+        run: locale
+
+      - name: Set UTF-8 enabled locale
+        run: |
+          sudo locale-gen en_US.UTF-8
+          sudo update-locale LANG=en_US.UTF-8
+
      - name: Run pipeline with test data
        # TODO nf-core: You can customise CI pipeline run tests as required
        # For example: adding multiple test runs with different parameters
--- a/CITATIONS.md
+++ b/CITATIONS.md
@ -34,6 +34,10 @@

  > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. “Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico.” Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6.

+- [MetaPhlAn3](https://doi.org/10.7554/eLife.65088)
+
+  > Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. eLife 10 (May): e65088. doi: 10.7554/eLife.65088.
+
 ## Software packaging/containerisation tools

 - [Anaconda](https://anaconda.com)
--- a/conf/modules.config
+++ b/conf/modules.config
@ -170,6 +170,15 @@ process {
        ]
    }

+    // Settings applied to the METAPHLAN3 process: where its results are published
+    // and how its output files are named.
+    withName: METAPHLAN3 {
+        publishDir = [
+            // One sub-directory per database name taken from the sample's meta map.
+            path: { "${params.outdir}/metaphlan3/${meta.db_name}" },
+            mode: params.publish_dir_mode,
+            // Only files ending in .biom or .txt are published.
+            // NOTE(review): '*.txt' also matches the module's '*.bowtie2out.txt'
+            // intermediate file - confirm that publishing it is intended.
+            pattern: '*.{biom,txt}'
+        ]
+        // Output file prefix: sample id, run accession and database name joined by '-'.
+        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+    }
+
    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
        publishDir = [
            path: { "${params.outdir}/pipeline_info" },
--- a/conf/test.config
+++ b/conf/test.config
@ -26,6 +26,7 @@ params {
    databases           = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
    run_kraken2         = true
    run_malt            = true
+    run_metaphlan3      = true
    shortread_clipmerge = true

 }
--- a/modules.json
+++ b/modules.json
@ -24,6 +24,9 @@
            "malt/run": {
                "git_sha": "72b96f4e504eef673f2b5c13560a9d90b669129b"
            },
+            "metaphlan3": {
+                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
+            },
            "multiqc": {
                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
            },
--- a/modules/local/ensure_fastq_extension.nf
+++ b/modules/local/ensure_fastq_extension.nf
@ -0,0 +1,31 @@
+// Creates symbolic links to the staged read files so that every emitted file name
+// ends in `.fastq.gz`. Each link is named after the original file's `baseName`
+// (the file name without its last extension) followed by `.fastq.gz`.
+process ENSURE_FASTQ_EXTENSION {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda (params.enable_conda ? "conda-forge::bash=5.0" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv2/biocontainers_v1.2.0_cv2.img' :
+        'biocontainers/biocontainers:v1.2.0_cv2' }"
+
+    input:
+    // reads: a single file when meta.single_end is true, otherwise two files
+    // addressed as reads[0] and reads[1].
+    tuple val(meta), path(reads)
+
+    output:
+    // The symlinks created by the script, paired with the unchanged meta map.
+    tuple val(meta), path('*.fastq.gz'), emit: reads
+
+    script:
+    // Link names are declared with `def` so they are explicit locals of this script
+    // block instead of implicit, undeclared variables.
+    if (meta.single_end) {
+        def fastq = "${reads.baseName}.fastq.gz"
+        """
+        ln -s '${reads}' '${fastq}'
+        """
+    } else {
+        def first = "${reads[0].baseName}.fastq.gz"
+        def second = "${reads[1].baseName}.fastq.gz"
+        """
+        ln -s '${reads[0]}' '${first}'
+        ln -s '${reads[1]}' '${second}'
+        """
+    }
+}
--- a/modules/nf-core/modules/metaphlan3/main.nf
+++ b/modules/nf-core/modules/metaphlan3/main.nf
@ -0,0 +1,45 @@
+// Runs `metaphlan` on one sample's input and emits the abundance profile, a BIOM
+// table, the intermediate bowtie2 mapping file (when one is written) and versions.yml.
+process METAPHLAN3 {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda (params.enable_conda ? 'bioconda::metaphlan=3.0.12' : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/metaphlan:3.0.12--pyhb7b1952_0' :
+        'quay.io/biocontainers/metaphlan:3.0.12--pyhb7b1952_0' }"
+
+    input:
+    // input: the file(s) to profile; the input type is derived from the file name in the script below.
+    tuple val(meta), path(input)
+    // Passed unchanged to `--bowtie2db`.
+    path metaphlan_db
+
+    output:
+    tuple val(meta), path("*_profile.txt")   ,                emit: profile
+    tuple val(meta), path("*.biom")          ,                emit: biom
+    // Optional because `--bowtie2out` is only requested for some input types (see `bowtie2_out`).
+    tuple val(meta), path('*.bowtie2out.txt'), optional:true, emit: bt2out
+    path "versions.yml"                      ,                emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Extra command-line arguments and the output file prefix can be overridden via
+    // `ext.args` / `ext.prefix`; the prefix falls back to the sample id.
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // Input type is chosen from the file name, checked in this order: ends with `.fastq.gz` -> fastq,
+    // contains `.fasta` -> fasta, ends with `.bowtie2out.txt` -> bowtie2out, anything else -> sam.
+    def input_type  = ("$input".endsWith(".fastq.gz")) ? "--input_type fastq" :  ("$input".contains(".fasta")) ? "--input_type fasta" : ("$input".endsWith(".bowtie2out.txt")) ? "--input_type bowtie2out" : "--input_type sam"
+    // Paired-end FASTQ input is passed as "first,second"; every other case passes `input` as interpolated.
+    def input_data  = ("$input_type".contains("fastq")) && !meta.single_end ? "${input[0]},${input[1]}" : "$input"
+    // No `--bowtie2out` file is requested when the input is itself bowtie2out or SAM.
+    def bowtie2_out = "$input_type" == "--input_type bowtie2out" || "$input_type" == "--input_type sam" ? '' : "--bowtie2out ${prefix}.bowtie2out.txt"
+
+    // The heredoc at the end writes versions.yml, recording the third whitespace-separated
+    // field of the `metaphlan --version` output under the process name.
+    """
+    metaphlan \\
+        --nproc $task.cpus \\
+        $input_type \\
+        $input_data \\
+        $args \\
+        $bowtie2_out \\
+        --bowtie2db ${metaphlan_db} \\
+        --biom ${prefix}.biom \\
+        --output_file ${prefix}_profile.txt
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        metaphlan3: \$(metaphlan --version 2>&1 | awk '{print \$3}')
+    END_VERSIONS
+    """
+}
--- a/modules/nf-core/modules/metaphlan3/meta.yml
+++ b/modules/nf-core/modules/metaphlan3/meta.yml
@ -0,0 +1,52 @@
+name: metaphlan3
+description: MetaPhlAn is a tool for profiling the composition of microbial communities from metagenomic shotgun sequencing data.
+keywords:
+  - metagenomics
+  - classification
+  - fastq
+  - bam
+  - fasta
+tools:
+  - metaphlan3:
+      description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance
+      homepage: https://huttenhower.sph.harvard.edu/metaphlan/
+      documentation: https://github.com/biobakery/MetaPhlAn
+      doi: "10.7554/eLife.65088"
+      licence: ["MIT License"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input:
+      type: file
+      description: MetaPhlAn 3.0 can classify the metagenome from a variety of input data types, including FASTQ files (single-end and paired-end), FASTA, bowtie2-produced SAM files (produced from alignments to the MetaPhlAn marker database) and intermediate bowtie2 alignment files (bowtie2out)
+      pattern: "*.{fastq.gz,fasta,fasta.gz,sam,bowtie2out.txt}"
+  # Second process input; main.nf hands it to `--bowtie2db` unchanged.
+  - metaphlan_db:
+      type: directory
+      description: Location of the MetaPhlAn database, passed to `--bowtie2db`
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - profile:
+      type: file
+      description: Tab-separated output file of the predicted taxon relative abundances
+      pattern: "*_profile.txt"
+  - biom:
+      type: file
+      description: General-use format for representing biological sample by observation contingency tables
+      pattern: "*.biom"
+  # Key matches the `emit: bt2out` channel name declared in main.nf.
+  - bt2out:
+      type: file
+      description: Intermediate Bowtie2 output produced from mapping the metagenome against the MetaPhlAn marker database (not compatible with `bowtie2out` files generated with MetaPhlAn versions below 3)
+      pattern: "*.bowtie2out.txt"
+
+authors:
+  - "@MGordon09"
--- a/nextflow.config
+++ b/nextflow.config
@ -71,6 +71,9 @@ params {

    // kraken2
    run_kraken2                = false
+
+    // metaphlan3
+    run_metaphlan3             = false
 }

 // Load base.config by default for all pipelines
@ -155,7 +158,7 @@ if (!params.igenomes_ignore) {
 // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable.

 env {
-    PYTHONNOUSERSITE = 1
+    PYTHONNOUSERSITE = '1'
    R_PROFILE_USER   = "/.Rprofile"
    R_ENVIRON_USER   = "/.Renviron"
    JULIA_DEPOT_PATH = "/usr/local/share/julia"
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -282,6 +282,10 @@
        "run_kraken2": {
            "type": "boolean"
        },
+        "run_metaphlan3": {
+            "type": "boolean",
+            "description": "Enable MetaPhlAn for taxonomic profiling"
+        },
        "shortread_clipmerge_tool": {
            "type": "string",
            "default": "fastp",
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@ -31,9 +31,9 @@ workflow INPUT_CHECK {
        .set { fasta }

    emit:
-    fastq                                     // channel: [ val(meta), [ reads ] ]
-    nanopore                                  // channel: [ val(meta), [ reads ] ]
-    fasta                                     // channel: [ val(meta), fasta ]
+    fastq = fastq ?: []                       // channel: [ val(meta), [ reads ] ]
+    nanopore = nanopore ?: []                 // channel: [ val(meta), [ reads ] ]
+    fasta = fasta ?: []                       // channel: [ val(meta), fasta ]
    versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
 }

--- a/subworkflows/local/shortread_adapterremoval.nf
+++ b/subworkflows/local/shortread_adapterremoval.nf
@ -5,6 +5,11 @@ Process short raw reads with AdapterRemoval
 include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE       } from '../../modules/nf-core/modules/adapterremoval/main'
 include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED       } from '../../modules/nf-core/modules/adapterremoval/main'
 include { CAT_FASTQ                                     } from '../../modules/nf-core/modules/cat/fastq/main'
+include {
+    ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION1;
+    ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION2;
+    ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION3;
+} from '../../modules/local/ensure_fastq_extension'

 workflow SHORTREAD_ADAPTERREMOVAL {

@ -24,63 +29,101 @@ workflow SHORTREAD_ADAPTERREMOVAL {
    ADAPTERREMOVAL_SINGLE ( ch_input_for_adapterremoval.single, [] )
    ADAPTERREMOVAL_PAIRED ( ch_input_for_adapterremoval.paired, [] )

-    // due to the slightly ugly output implementation of the current AdapterRemoval2 version, each file
-    // has to be exported in a separate channel, and we must manually recombine when necessary
+    /*
+     * Due to the ~slightly~ very ugly output implementation of the current AdapterRemoval2 version, each file
+     * has to be exported in a separate channel and we must manually recombine when necessary.
+     */

    if ( params.shortread_clipmerge_mergepairs && !params.shortread_clipmerge_excludeunmerged ) {
-        ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed
-                                                .mix(
+
+        ENSURE_FASTQ_EXTENSION1(
+            Channel.empty().mix(
+                ADAPTERREMOVAL_PAIRED.out.collapsed,
                ADAPTERREMOVAL_PAIRED.out.collapsed_truncated,
                ADAPTERREMOVAL_PAIRED.out.singles_truncated,
                ADAPTERREMOVAL_PAIRED.out.pair1_truncated,
                ADAPTERREMOVAL_PAIRED.out.pair2_truncated
            )
-                                                .map {
-                                                    meta, reads ->
-                                                        def meta_new = meta.clone()
-                                                        meta_new.single_end = true
-
-                                                        [ meta_new, reads ]
+            .map { meta, reads ->
+                // Mark the reads as single-end on a copy of the meta map, so the map
+                // emitted by ADAPTERREMOVAL_PAIRED is not edited in place.
+                def meta_new = meta.clone()
+                meta_new.single_end = true
+                [meta_new, reads]
            }
-                                                    .groupTuple()
+        )

-        ch_adapterremoval_reads_prepped = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads
-                                            .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated )
+        CAT_FASTQ(
+            ENSURE_FASTQ_EXTENSION1.out.reads
+                .groupTuple()
+        )
+
+        ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
+
+        ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads
+            .mix(ENSURE_FASTQ_EXTENSION2.out.reads)

    } else if ( params.shortread_clipmerge_mergepairs && params.shortread_clipmerge_excludeunmerged ) {
-        ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed
-                                                .mix( ADAPTERREMOVAL_PAIRED.out.collapsed_truncated )
-                                                .map {
-                                                    meta, reads ->
-                                                        def meta_new = meta.clone()
-                                                        meta_new['single_end'] = true

-                                                        [ meta_new, reads ]
+        ENSURE_FASTQ_EXTENSION1(
+            Channel.empty().mix(
+                ADAPTERREMOVAL_PAIRED.out.collapsed,
+                ADAPTERREMOVAL_PAIRED.out.collapsed_truncated
+            )
+            .map { meta, reads ->
+                // Mark the reads as single-end on a copy of the meta map, so the map
+                // emitted by ADAPTERREMOVAL_PAIRED is not edited in place.
+                def meta_new = meta.clone()
+                meta_new.single_end = true
+                [meta_new, reads]
            }
-                                                    .groupTuple(by: 0)
+        )

-        ch_adapterremoval_reads_prepped = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads
-                                            .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated )
+        CAT_FASTQ(
+            ENSURE_FASTQ_EXTENSION1.out.reads
+                .groupTuple()
+        )
+
+        ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
+
+        ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads
+            .mix(ENSURE_FASTQ_EXTENSION2.out.reads)

    } else {

-        ch_adapterremoval_reads_prepped = ADAPTERREMOVAL_PAIRED.out.pair1_truncated
-                                                .join( ADAPTERREMOVAL_PAIRED.out.pair2_truncated )
+        ENSURE_FASTQ_EXTENSION1(
+            ADAPTERREMOVAL_PAIRED.out.pair1_truncated
+            .map { meta, reads ->
+                // Each mate file is handled on its own, so it is flagged single-end here.
+                // The flag is set on a copy so the map emitted by ADAPTERREMOVAL_PAIRED
+                // is not edited in place.
+                def meta_new = meta.clone()
+                meta_new.single_end = true
+                [meta_new, reads]
+            }
+        )
+
+        ENSURE_FASTQ_EXTENSION2(
+            ADAPTERREMOVAL_PAIRED.out.pair2_truncated
+            .map { meta, reads ->
+                // Each mate file is handled on its own, so it is flagged single-end here.
+                // The flag is set on a copy so the map emitted by ADAPTERREMOVAL_PAIRED
+                // is not edited in place.
+                def meta_new = meta.clone()
+                meta_new.single_end = true
+                [meta_new, reads]
+            }
+        )
+
+        ENSURE_FASTQ_EXTENSION3(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
+
+        ch_adapterremoval_reads_prepped = ENSURE_FASTQ_EXTENSION1.out.reads
+            .join(ENSURE_FASTQ_EXTENSION2.out.reads)
            .groupTuple()
            .map { meta, pair1, pair2 ->
+                // The two mates are back together, so the sample is flagged paired-end again.
+                // The flag is set on a copy so the map used as the join/group key is not
+                // edited in place.
+                def meta_new = meta.clone()
+                meta_new.single_end = false
-                [ meta, [ pair1, pair2 ].flatten() ]
+                [ meta_new, [ pair1, pair2 ].flatten() ]
            }
-                                            .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated )
-    }
+            .mix(ENSURE_FASTQ_EXTENSION3.out.reads)

-    ch_processed_reads = ch_adapterremoval_reads_prepped
+    }

    ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() )
    ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() )
-    ch_multiqc_files = ch_multiqc_files.mix( ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]}, ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix(
+        ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]},
+        ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]}
+    )

    emit:
-    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
+    reads    = ch_adapterremoval_reads_prepped  // channel: [ val(meta), [ reads ] ]
    versions = ch_versions  // channel: [ versions.yml ]
    mqc      = ch_multiqc_files
 }
+
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -60,7 +60,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/
 include { CAT_FASTQ                   } from '../modules/nf-core/modules/cat/fastq/main'
 include { MALT_RUN                    } from '../modules/nf-core/modules/malt/run/main'
 include { KRAKEN2_KRAKEN2             } from '../modules/nf-core/modules/kraken2/kraken2/main'
-
+include { METAPHLAN3                  } from '../modules/nf-core/modules/metaphlan3/main'

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -130,6 +130,7 @@ workflow TAXPROFILER {
            .branch {
                malt:    it[2]['tool'] == 'malt'
                kraken2: it[2]['tool'] == 'kraken2'
+                metaphlan3: it[2]['tool'] == 'metaphlan3'
                unknown: true
            }

@ -163,6 +164,14 @@ workflow TAXPROFILER {
                                    db: it[3]
                            }

+    ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3
+                            .dump(tag: "input_metaphlan3")
+                            .multiMap {
+                                it ->
+                                    reads: [it[0] + it[2], it[1]]
+                                    db: it[3]
+                            }
+
    /*
        MODULE: RUN PROFILING
    */
@ -174,6 +183,10 @@ workflow TAXPROFILER {
        KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db  )
    }

+    if ( params.run_metaphlan3 ) {
+        METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db )
+    }
+
    /*
        MODULE: MultiQC
    */
@ -204,6 +217,7 @@ workflow TAXPROFILER {

    // TODO MALT results overwriting per database?
    // TODO Versions for Kraken/MALT not reported?
+    // TODO create multiQC module for metaphlan
    MULTIQC (
        ch_multiqc_files.collect()
    )