Merge pull request #153 from mjamy/add-krakenuniq-module

Add krakenuniq module
2024-11-25 18:49:55 +00:00 · 2022-12-01 09:03:29 +01:00 · 2022-12-01 09:03:29 +01:00 · 4f6eb6fd75
commit 4f6eb6fd75
parent 3d949a803f d7fbe55849
14 changed files with 390 additions and 3 deletions
--- a/CITATIONS.md
+++ b/CITATIONS.md
@ -36,6 +36,10 @@
  > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0.
 - [KrakenUniq](https://doi.org/10.1186/s13059-018-1568-0)
  > Breitwieser, Florian P., Daniel N. Baker, and Steven L. Salzberg. 2018. KrakenUniq: confident and fast metagenomics classification using unique k-mer counts. Genome Biology 19 (1): 198. doi: 10.1186/s13059-018-1568-0.
 - [Bracken](https://doi.org/10.7717/peerj-cs.104)
  > Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: Estimating species abundance in metagenomics data. PeerJ Computer Science, 3, e104. doi: 10.7717/peerj-cs.104
--- a/conf/modules.config
+++ b/conf/modules.config
@ -330,6 +330,17 @@ process {
        ]
    }
    // Settings for the KrakenUniq profiling process.
    withName: KRAKENUNIQ_PRELOADEDKRAKENUNIQ {
        // Forward the per-database parameters stored in the meta map (meta.db_params) as ext.args
        ext.args = { "${meta.db_params}" }
        // one run with multiple samples, so fix ID to just db name to ensure clean log name
        ext.prefix = { "${meta.db_name}" }
        // Publish only text outputs, reports and gzipped FASTQ files, in a per-database directory
        publishDir = [
            path: { "${params.outdir}/krakenuniq/${meta.db_name}/" },
            mode: params.publish_dir_mode,
            pattern: '*.{txt,report,fastq.gz}'
        ]
    }
    withName: KRONA_CLEANUP {
        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
        publishDir = [
--- a/conf/test.config
+++ b/conf/test.config
@ -39,6 +39,7 @@ params {
    run_metaphlan3                        = true
    run_centrifuge                        = true
    run_diamond                           = true
    run_krakenuniq                        = true
    run_motus                             = false
    run_krona                             = true
    krona_taxonomy_directory              = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab'
--- a/conf/test_motus.config
+++ b/conf/test_motus.config
@ -38,6 +38,7 @@ params {
    run_metaphlan3                        = false
    run_centrifuge                        = false
    run_diamond                           = false
    run_krakenuniq                        = false
    run_motus                             = true
    run_profile_standardisation           = true
 }
--- a/conf/test_nopreprocessing.config
+++ b/conf/test_nopreprocessing.config
@ -38,6 +38,7 @@ params {
    run_metaphlan3                        = true
    run_centrifuge                        = true
    run_diamond                           = true
    run_krakenuniq                        = true
    run_motus                             = false
    run_krona                             = true
 }
--- a/conf/test_noprofiling.config
+++ b/conf/test_noprofiling.config
@ -39,6 +39,7 @@ params {
    run_metaphlan3                        = false
    run_centrifuge                        = false
    run_diamond                           = false
    run_krakenuniq                        = false
    run_motus                             = false
 }
--- a/conf/test_nothing.config
+++ b/conf/test_nothing.config
@ -38,6 +38,7 @@ params {
    run_metaphlan3                        = false
    run_centrifuge                        = false
    run_diamond                           = false
    run_krakenuniq                        = false
    run_motus                             = false
 }
--- a/docs/usage.md
+++ b/docs/usage.md
@ -74,7 +74,7 @@ The pipeline takes the locations and specific profiling parameters of the tool o
 > ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user. See below for more information of the expected database files.
-An example database sheet can look as follows, where 4 tools are being used, and `malt` and `kraken2` will be used against two databases each. This is because specifying `bracken` implies first running `kraken2` on the same database.
+An example database sheet can look as follows, where 5 tools are being used, and `malt` and `kraken2` will be used against two databases each. This is because specifying `bracken` implies first running `kraken2` on the same database.
 ```console
 tool,db_name,db_params,db_path
@ -82,6 +82,7 @@ malt,malt85,-id 85,/<path>/<to>/malt/testdb-malt/
 malt,malt95,-id 90,/<path>/<to>/malt/testdb-malt.tar.gz
 bracken,db1,,/<path>/<to>/bracken/testdb-bracken.tar.gz
 kraken2,db2,--quick,/<path>/<to>/kraken2/testdb-kraken2.tar.gz
 krakenuniq,db3,,/<path>/<to>/krakenuniq/testdb-krakenuniq.tar.gz
 centrifuge,db1,,/<path>/<to>/centrifuge/minigut_cf.tar.gz
 metaphlan3,db1,,/<path>/<to>/metaphlan3/metaphlan_database/
 motus,db_mOTU,,/<path>/<to>/motus/motus_database/
@ -125,6 +126,12 @@ Expected (uncompressed) database files for each tool are as follows:
  - `database100mers.kraken`
  - `database150mers.kmer_distrib`
  - `database150mers.kraken`
 - **KrakenUniq** output of `krakenuniq-build` command(s). A directory containing:
  - `opts.k2d`
  - `hash.k2d`
  - `taxo.k2d`
  - `database.idx`
  - `taxDB`
 - **Centrifuge** output of `centrifuge-build`. A directory containing:
  - `<database_name>.<number>.cf`
  - `<database_name>.<number>.cf`
@ -177,7 +184,7 @@ work                # Directory containing the nextflow working files
 ### Sequencing quality control
-nf-core taxprofiler offers [`falco`](https://github.com/smithlabcode/falco] as an alternative option to [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
+nf-core taxprofiler offers [`falco`](https://github.com/smithlabcode/falco) as an alternative option to [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
 ### Preprocessing Steps
@ -442,7 +449,7 @@ NXF_OPTS='-Xms1g -Xmx4g'
 ## Troubleshooting and FAQs
-### I get a warning during centrifuge_kreport process with exit status 255.
+### I get a warning during centrifuge_kreport process with exit status 255
 When a sample has insufficient hits for abundance estimation, the resulting `report.txt` file will be empty.
--- a/modules.json
+++ b/modules.json
@ -97,6 +97,10 @@
                        "branch": "master",
                        "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
                    },
                    "krakenuniq/preloadedkrakenuniq": {
                        "branch": "master",
                        "git_sha": "05649975c6611c6e007537a7984e186e12ae03af"
                    },
                    "krona/ktimporttaxonomy": {
                        "branch": "master",
                        "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
--- a/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf
+++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf
@ -0,0 +1,224 @@
 // Classify reads with KrakenUniq against a database that is preloaded into memory once.
 //
 // The database is first loaded with `--preload`/`--preload-size`; afterwards every input
 // FASTQ (single-end) or FASTQ pair (paired-end) is classified in a shell loop, with the
 // output prefix derived from the file name(s).
 //
 // Inputs:
 //   meta, fastqs       - meta map (meta.single_end selects the branch) and the list of FASTQ files
 //   db                 - KrakenUniq database
 //   ram_chunk_size     - value handed to `--preload-size`
 //   save_output_fastqs - if true, write and gzip classified/unclassified reads
 //   report_file        - if true, write `<prefix>.krakenuniq.report.txt` (the `report` output is not optional)
 //   save_output        - if true, write `<prefix>.krakenuniq.classified.txt`
 //
 // Options:
 //   task.ext.args  - appended to the preload call
 //   task.ext.args2 - appended to each classification call; falls back to task.ext.args when unset
 process KRAKENUNIQ_PRELOADEDKRAKENUNIQ {
    tag "$meta.id"
    label 'process_high'
    conda (params.enable_conda ? "bioconda::krakenuniq=1.0.0" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/krakenuniq:1.0.0--pl5321h19e8d03_0':
        'quay.io/biocontainers/krakenuniq:1.0.0--pl5321h19e8d03_0' }"
    input:
    tuple val(meta), path(fastqs)
    path  db
    val ram_chunk_size
    val save_output_fastqs
    val report_file
    val save_output
    output:
    // The read globs end in 'fastq.gz' so that '<prefix>.krakenuniq.classified.txt' is not
    // captured as a classified-reads file.
    tuple val(meta), path('*.classified{.,_}*fastq.gz')  , optional:true, emit: classified_reads_fastq
    tuple val(meta), path('*.unclassified{.,_}*fastq.gz'), optional:true, emit: unclassified_reads_fastq
    tuple val(meta), path('*classified.txt')             , optional:true, emit: classified_assignment
    tuple val(meta), path('*report.txt')                                , emit: report
    path "versions.yml"                                                 , emit: versions
    when:
    task.ext.when == null || task.ext.when
    script:
    def args = task.ext.args ?: ''
    // Allow the classification calls to be configured separately; default to args as before.
    def args2 = task.ext.args2 ?: args
    // '#' in the paired-end file names is the placeholder for the mate number.
    def classified   = meta.single_end ? '"\${PREFIX}.classified.fastq"'   : '"\${PREFIX}.classified#.fastq"'
    def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fastq"' : '"\${PREFIX}.unclassified#.fastq"'
    def classified_option = save_output_fastqs ? "--classified-out ${classified}" : ''
    def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : ''
    def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : ''
    def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : ''
    def compress_reads_command = save_output_fastqs ? 'gzip --no-name *.fastq' : ''
    if (meta.single_end) {
        """
        krakenuniq \\
            --db $db \\
            --preload \\
            --preload-size $ram_chunk_size \\
            --threads $task.cpus \\
            $args
        strip_suffix() {
            local result=\$1
            # Strip any file extensions.
            echo "\${result%%.*}"
        }
        printf "%s\\n" ${fastqs} | while read FASTQ; do \\
            PREFIX="\$(strip_suffix "\${FASTQ}")"
            krakenuniq \\
                --db $db \\
                --threads $task.cpus \\
                $report \\
                $output_option \\
                $unclassified_option \\
                $classified_option \\
                $args2 \\
                "\${FASTQ}"
        done
        $compress_reads_command
        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//')
        END_VERSIONS
        """
    } else {
        """
        krakenuniq \\
            --db $db \\
            --preload \\
            --preload-size $ram_chunk_size \\
            --threads $task.cpus \\
            $args
        strip_suffix() {
            local result
            read result
            # Strip any trailing dot or underscore.
            result="\${result%_}"
            echo "\${result%.}"
        }
        printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\
            read -r -a FASTQ <<< "\${FASTQ}"
            PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" |  sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)"
            krakenuniq \\
                --db $db \\
                --threads $task.cpus \\
                $report \\
                $output_option \\
                $unclassified_option \\
                $classified_option \\
                --paired \\
                $args2 \\
                "\${FASTQ[@]}"
        done
        $compress_reads_command
        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//')
        END_VERSIONS
        """
    }
    stub:
    def args = task.ext.args ?: ''
    // Mirror the script section: args2 is configurable on its own and defaults to args.
    def args2 = task.ext.args2 ?: args
    def classified   = meta.single_end ? '"\${PREFIX}.classified.fastq"'   : '"\${PREFIX}.classified#.fastq"'
    def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fastq"' : '"\${PREFIX}.unclassified#.fastq"'
    def classified_option = save_output_fastqs ? "--classified-out ${classified}" : ''
    def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : ''
    def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : ''
    def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : ''
    def compress_reads_command = save_output_fastqs ? 'gzip --no-name *.fastq' : ''
    if (meta.single_end) {
        """
        echo krakenuniq \\
            --db $db \\
            --preload \\
            --preload-size $ram_chunk_size \\
            --threads $task.cpus \\
            $args
        strip_suffix() {
            local result=\$1
            # Strip any file extensions.
            echo "\${result%%.*}"
        }
        printf "%s\\n" ${fastqs} | while read FASTQ; do \\
            echo "\${FASTQ}"
            PREFIX="\$(strip_suffix "\${FASTQ}")"
            echo "\${PREFIX}"
            echo krakenuniq \\
                --db $db \\
                --threads $task.cpus \\
                $report \\
                $output_option \\
                $unclassified_option \\
                $classified_option \\
                $args2 \\
                "\${FASTQ}"
            touch "\${PREFIX}.classified.fastq.gz"
            touch "\${PREFIX}.krakenuniq.classified.txt"
            touch "\${PREFIX}.krakenuniq.report.txt"
            touch "\${PREFIX}.unclassified.fastq.gz"
        done
        echo $compress_reads_command
        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//')
        END_VERSIONS
        """
    } else {
        """
        echo krakenuniq \\
            --db $db \\
            --preload \\
            --preload-size $ram_chunk_size \\
            --threads $task.cpus \\
            $args
        strip_suffix() {
            local result
            read result
            # Strip any trailing dot or underscore.
            result="\${result%_}"
            echo "\${result%.}"
        }
        printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\
            read -r -a FASTQ <<< "\${FASTQ}"
            echo "\${FASTQ[@]}"
            PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" |  sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)"
            echo "\${PREFIX}"
            echo krakenuniq \\
                --db $db \\
                --threads $task.cpus \\
                $report \\
                $output_option \\
                $unclassified_option \\
                $classified_option \\
                --paired \\
                $args2 \\
                "\${FASTQ[@]}"
            touch "\${PREFIX}.classified_1.fastq.gz" "\${PREFIX}.classified_2.fastq.gz"
            touch "\${PREFIX}.krakenuniq.classified.txt"
            touch "\${PREFIX}.krakenuniq.report.txt"
            touch "\${PREFIX}.unclassified_1.fastq.gz" "\${PREFIX}.unclassified_2.fastq.gz"
        done
        echo $compress_reads_command
        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//')
        END_VERSIONS
        """
    }
 }
--- a/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml
+++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml
@ -0,0 +1,78 @@
 name: "krakenuniq_preloadedkrakenuniq"
 description: Classifies metagenomic sequence data using unique k-mer counts
 keywords:
  - classify
  - metagenomics
  - kmers
  - fastq
  - db
 tools:
  - "krakenuniq":
      description: "Metagenomics classifier with unique k-mer counting for more specific results"
      homepage: https://github.com/fbreitwieser/krakenuniq
      documentation: https://github.com/fbreitwieser/krakenuniq
      doi: 10.1186/s13059-018-1568-0
      licence: ["MIT"]
 # Inputs are listed in the order in which the process declares them in main.nf.
 input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - fastqs:
      type: file
      description: List of input FastQ files
  - db:
      type: directory
      description: KrakenUniq database
  - ram_chunk_size:
      type: val
      description: Maximum amount of RAM to use for each chunk of the database that is loaded at any one time
      pattern: "*GB"
  - save_output_fastqs:
      type: boolean
      description: |
        If true, optional commands are added to save classified and unclassified reads
        as fastq files
  - report_file:
      type: boolean
      description: |
        If true, an optional command is added to save the KrakenUniq report
        of each input (`--report-file`)
  - save_output:
      type: boolean
      description: |
        If true, an optional command is added to save a file reporting the taxonomic
        classification of each input read
 output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - classified_reads_fastq:
      type: file
      description: |
        Reads classified as belonging to any of the taxa
        on the KrakenUniq database.
      pattern: "*.fastq.gz"
  - unclassified_reads_fastq:
      type: file
      description: |
        Reads not classified to any of the taxa
        on the KrakenUniq database.
      pattern: "*.fastq.gz"
  - classified_assignment:
      type: file
      description: |
        KrakenUniq output file indicating the taxonomic assignment of
        each input read
  - report:
      type: file
      description: |
        KrakenUniq report containing stats about classified
        and not classified reads.
      pattern: "*.report.txt"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
 authors:
  - "@mjamy"
  - "@Midnighter"
--- a/nextflow.config
+++ b/nextflow.config
@ -119,6 +119,12 @@ params {
    kraken2_save_readclassification = false // added directly to module in profiling.nf
    kraken2_save_minimizers         = false
    //krakenuniq
    run_krakenuniq                      = false
    krakenuniq_ram_chunk_size           = '16G'
    krakenuniq_save_reads               = false // added directly to module in profiling.nf
    krakenuniq_save_readclassifications = false // added directly to module in profiling.nf
    // Bracken
    run_bracken = false
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -402,6 +402,30 @@
                    "fa_icon": "fas fa-save",
                    "help_text": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.\n\nAdds `--report-minimizer-data` to the kraken2 command."
                },
                "run_krakenuniq": {
                    "type": "boolean",
                    "fa_icon": "fas fa-toggle-on",
                    "description": "Turn on profiling with KrakenUniq. Requires database to be present in the CSV file passed to --databases"
                },
                "krakenuniq_save_reads": {
                    "type": "boolean",
                    "fa_icon": "fas fa-save",
                    "description": "Turn on saving of KrakenUniq-aligned reads",
                    "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--classified-out` and `--unclassified-out`"
                },
                "krakenuniq_ram_chunk_size": {
                    "type": "string",
                    "default": "16G",
                    "description": "Specify how large to chunk database when loading into memory for KrakenUniq",
                    "fa_icon": "fas fa-database",
                    "help_text": "nf-core/taxprofiler utilises a 'low memory' option for KrakenUniq that can reduce the amount of RAM the process requires using the `--preload` option.\n\nA further extension to this option is that you can specify how large each chunk of the database should be that gets loaded into memory at any one time. You can specify the amount of RAM to chunk the database to with this parameter, which is particularly useful for people with limited computational resources.\n\nMore information about this parameter can be seen [here](https://github.com/fbreitwieser/krakenuniq/blob/master/README.md#new-release-v07).\n\n> Modifies KrakenUniq parameter: `--preload-size`"
                },
                "krakenuniq_save_readclassifications": {
                    "type": "boolean",
                    "fa_icon": "fas fa-save",
                    "description": "Turn on saving of KrakenUniq per-read taxonomic assignment file",
                    "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on the specific taxonomic assignment that each read received.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`"
                },
                "run_bracken": {
                    "type": "boolean",
                    "description": "Post-process kraken2 reports with Bracken.",
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@ -13,6 +13,7 @@ include { METAPHLAN3_METAPHLAN3                 } from '../../modules/nf-core/me
 include { KAIJU_KAIJU                           } from '../../modules/nf-core/kaiju/kaiju/main'
 include { DIAMOND_BLASTX                        } from '../../modules/nf-core/diamond/blastx/main'
 include { MOTUS_PROFILE                         } from '../../modules/nf-core/motus/profile/main'
 include { KRAKENUNIQ_PRELOADEDKRAKENUNIQ        } from '../../modules/nf-core/krakenuniq/preloadedkrakenuniq/main'
 workflow PROFILING {
    take:
@ -47,6 +48,7 @@ workflow PROFILING {
                kaiju: it[2]['tool'] == 'kaiju'
                diamond: it[2]['tool'] == 'diamond'
                motus: it[2]['tool'] == 'motus'
                krakenuniq: it[2]['tool'] == 'krakenuniq'
                unknown: true
            }
@ -265,6 +267,28 @@ workflow PROFILING {
        ch_multiqc_files   = ch_multiqc_files.mix( MOTUS_PROFILE.out.log )
    }
    // KrakenUniq: run once per database (and per single_end value) over all matching samples.
    if ( params.run_krakenuniq ) {
        // Replace the sample meta with [id: <db_name>, single_end], then group on that meta,
        // the database meta and the database (tuple indices 0, 2 and 3), so the reads of all
        // samples sharing them are collected together. multiMap then splits each group into
        // a `reads` channel (merged meta + flattened read files) and a matching `db` channel.
        ch_input_for_krakenuniq =  ch_input_for_profiling.krakenuniq
                                    .map {
                                        meta, reads, db_meta, db ->
                                            [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db]
                                    }
                                    .groupTuple(by: [0,2,3])
                                    .dump(tag: "krakenuniq_premultimap")
                                    .multiMap {
                                        single_meta, reads, db_meta, db ->
                                            reads: [ single_meta + db_meta, reads.flatten() ]
                                            db: db
                                }
        // Hardcode the report-file argument to `true` so the report is _always_ produced: it is
        // our basic output and goes into both the MultiQC files and the raw profiles below.
        KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads.dump(tag: "krakenuniq_input"), ch_input_for_krakenuniq.db.dump(tag: "krakenuniq_db"), params.krakenuniq_ram_chunk_size, params.krakenuniq_save_reads, true, params.krakenuniq_save_readclassifications )
        ch_multiqc_files       = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report )
        ch_versions            = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() )
        ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment )
        ch_raw_profiles        = ch_raw_profiles.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report )
    }
    emit:
    classifications = ch_raw_classifications
    profiles        = ch_raw_profiles    // channel: [ val(meta), [ reads ] ] - should be text files or biom