Merge branch 'dev' into metaphlan3/mergemetaphlantables

2024-12-22 15:28:16 +00:00 · 2022-09-15 13:08:57 +02:00 · 2022-09-15 13:08:57 +02:00 · cdb21d17b3
commit cdb21d17b3
parent 19a195ed14 d9795a4fdc
12 changed files with 203 additions and 55 deletions
--- a/README.md
+++ b/README.md
@ -12,6 +12,8 @@

 ## Introduction

+> ⚠️ This pipeline is still under development! While the pipeline is usable, not all functionality is available yet.
+
 <!-- TODO nf-core: Write a 1-2 sentence summary of what data the pipeline is for and what it does -->

 **nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling with multiple profiling tools against multiple databases, and produces standardised output tables.
--- a/conf/modules.config
+++ b/conf/modules.config
@ -294,6 +294,15 @@ process {
        ]
    }

+    withName: KRAKENTOOLS_COMBINEKREPORTS { // settings for the Kraken2 combined-report step
+        ext.prefix = { "kraken2_${meta.id}_combined_reports" } // output basename; the calling subworkflow sets meta.id to the database name
+        publishDir = [
+            path: { "${params.outdir}/kraken2/" }, // published under the kraken2 results directory
+            mode: params.publish_dir_mode,
+            pattern: '*.{txt}' // only .txt outputs are published
+        ]
+    }
+
    withName: KRONA_CLEANUP {
        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
        publishDir = [
@ -367,6 +376,15 @@ process {
        ]
    }

+    withName: KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE { // settings for the Centrifuge alias of the combined-report step
+        ext.prefix = { "centrifuge_${meta.id}_combined_reports" } // output basename; the calling subworkflow sets meta.id to the database name
+        publishDir = [
+            path: { "${params.outdir}/centrifuge/" }, // published under the centrifuge results directory
+            mode: params.publish_dir_mode,
+            pattern: '*.{txt}' // only .txt outputs are published
+        ]
+    }
+
    withName: KAIJU_KAIJU {
        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
        publishDir = [
@ -378,7 +396,7 @@ process {
    }

    withName: KAIJU_KAIJU2TABLE {
-        ext.prefix = { "${meta.id}_combined_reports" }
+        ext.prefix = { "kaiju_${meta.id}_combined_reports" }
        publishDir = [
            path: { "${params.outdir}/kaiju/" },
            mode: params.publish_dir_mode,
--- a/docs/usage.md
+++ b/docs/usage.md
@ -410,3 +410,13 @@ We recommend adding the following line to your environment to limit this (typica
 ```bash
 NXF_OPTS='-Xms1g -Xmx4g'
 ```
+
+## Troubleshooting and FAQs
+
+### I get a warning during the centrifuge_kreport process with exit status 255
+
+When a sample has insufficient hits for abundance estimation, the resulting `report.txt` file will be empty.
+
+When trying to convert this to a kraken-style report, the conversion tool will exit with a status code `255`, and provide a `WARN`.
+
+This is **not** an error or a failure of the pipeline; it just means your sample has no hits against the provided database when using centrifuge.
--- a/modules.json
+++ b/modules.json
@ -43,8 +43,7 @@
                },
                "fastp": {
                    "branch": "master",
-                    "git_sha": "7e8ad566883449e7939062b5e2bcf53fc1e0002f",
-                    "patch": "modules/nf-core/modules/fastp/fastp.diff"
+                    "git_sha": "2c70c1c1951aaf884d2e8d8d9c871db79f7b35aa"
                },
                "fastqc": {
                    "branch": "master",
@ -74,6 +73,10 @@
                    "branch": "master",
                    "git_sha": "409a308ba46284d8ebb48c2c1befd6f6433db3f7"
                },
+                "krakentools/combinekreports": {
+                    "branch": "master",
+                    "git_sha": "ee0346b4d14ffdc15ce7e093ca1363cd07c9bd78"
+                },
                "krakentools/kreport2krona": {
                    "branch": "master",
                    "git_sha": "233fa70811a03a4cecb2ece483b5c8396e2cee1d"
@ -140,7 +143,7 @@
                },
                "untar": {
                    "branch": "master",
-                    "git_sha": "5e7b1ef9a5a2d9258635bcbf70fcf37dacd1b247"
+                    "git_sha": "393dbd6ddafe3f18eac02893dd4a21e4d45de679"
                }
            }
        }
--- a/modules/nf-core/modules/fastp/fastp.diff
+++ b/modules/nf-core/modules/fastp/fastp.diff
@ -1,33 +0,0 @@
-Changes in module 'nf-core/modules/fastp'
--- modules/nf-core/modules/fastp/main.nf
-+++ modules/nf-core/modules/fastp/main.nf
-@@ -33,9 +33,8 @@
-         def fail_fastq = save_trimmed_fail ? "--failed_out ${prefix}.fail.fastq.gz" : ''
-         """
-         [ ! -f  ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz
-        cat ${prefix}.fastq.gz \\
-        | fastp \\
-            --stdin \\
-+        
-+        fastp \\
-             --stdout \\
-             --in1 ${prefix}.fastq.gz \\
-             --thread $task.cpus \\
-@@ -45,6 +44,7 @@
-             $args \\
-             2> ${prefix}.fastp.log \\
-         | gzip -c > ${prefix}.fastp.fastq.gz
-+
-         cat <<-END_VERSIONS > versions.yml
-         "${task.process}":
-             fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
-@@ -69,6 +69,7 @@
-             --detect_adapter_for_pe \\
-             $args \\
-             2> ${prefix}.fastp.log
-+            
- 
-         cat <<-END_VERSIONS > versions.yml
-         "${task.process}":
-
-************************************************************
--- a/modules/nf-core/modules/fastp/main.nf
+++ b/modules/nf-core/modules/fastp/main.nf
@ -26,14 +26,14 @@ process FASTP {

    script:
    def args = task.ext.args ?: ''
-    // Added soft-links to original fastqs for consistent naming in MultiQC
    def prefix = task.ext.prefix ?: "${meta.id}"
+    def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : ''
+    // Added soft-links to original fastqs for consistent naming in MultiQC
    // Use single ended for interleaved. Add --interleaved_in in config.
-    if (meta.single_end) {
-        def fail_fastq = save_trimmed_fail ? "--failed_out ${prefix}.fail.fastq.gz" : ''
+    if ( task.ext.args?.contains('--interleaved_in') ) {
        """
        [ ! -f  ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz
-        
+
        fastp \\
            --stdout \\
            --in1 ${prefix}.fastq.gz \\
@ -45,13 +45,32 @@ process FASTP {
            2> ${prefix}.fastp.log \\
        | gzip -c > ${prefix}.fastp.fastq.gz

+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
+        END_VERSIONS
+        """
+    } else if (meta.single_end) {
+        """
+        [ ! -f  ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz
+
+        fastp \\
+            --stdout \\
+            --in1 ${prefix}.fastq.gz \\
+            --out1  ${prefix}.fastp.fastq.gz \\
+            --thread $task.cpus \\
+            --json ${prefix}.fastp.json \\
+            --html ${prefix}.fastp.html \\
+            $fail_fastq \\
+            $args \\
+            2> ${prefix}.fastp.log
+
        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
        END_VERSIONS
        """
    } else {
-        def fail_fastq  = save_trimmed_fail ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : ''
        def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : ''
        """
        [ ! -f  ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz
@ -69,7 +88,6 @@ process FASTP {
            --detect_adapter_for_pe \\
            $args \\
            2> ${prefix}.fastp.log
-            

        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
--- a/modules/nf-core/modules/fastp/meta.yml
+++ b/modules/nf-core/modules/fastp/meta.yml
@ -21,7 +21,8 @@ input:
      type: file
      description: |
        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
-        respectively.
+        respectively. If you wish to run interleaved paired-end data, supply as single-end data
+        but with `--interleaved_in` in your `modules.config`'s `ext.args` for the module.
  - save_trimmed_fail:
      type: boolean
      description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz`
--- a/modules/nf-core/modules/krakentools/combinekreports/main.nf
+++ b/modules/nf-core/modules/krakentools/combinekreports/main.nf
@ -0,0 +1,34 @@
+process KRAKENTOOLS_COMBINEKREPORTS { // runs combine_kreports.py over a set of kraken-style reports to produce one combined .txt
+    label 'process_low'
+
+    conda (params.enable_conda ? "bioconda::krakentools=1.2" : null) // conda environment is requested only when params.enable_conda is set
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/krakentools:1.2--pyh5e36f6f_0':
+        'quay.io/biocontainers/krakentools:1.2--pyh5e36f6f_0' }"
+
+    input:
+    tuple val(meta), path(kreports) // meta: map providing at least `id`; kreports: the report file(s) handed to -r
+
+    output:
+    tuple val(meta), path("*.txt"), emit: txt // combined report, written as ${prefix}.txt by the script below
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when // runs unless ext.when is explicitly set to a falsy value
+
+    script:
+    def args = task.ext.args ?: '' // extra command-line options from config, appended to the command
+    prefix = task.ext.prefix ?: "${meta.id}" // output basename; falls back to meta.id when ext.prefix is unset
+    def VERSION = '1.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+    """
+    combine_kreports.py \\
+        -r ${kreports} \\
+        -o ${prefix}.txt \\
+        ${args}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        combine_kreports.py: ${VERSION}
+    END_VERSIONS
+    """
+}
--- a/modules/nf-core/modules/krakentools/combinekreports/meta.yml
+++ b/modules/nf-core/modules/krakentools/combinekreports/meta.yml
@ -0,0 +1,43 @@
+name: krakentools_combinekreports
+description: Combines multiple Kraken-style report files into a single combined report file
+keywords:
+  - kraken
+  - krakentools
+  - metagenomics
+  - table
+  - combining
+  - merging
+tools:
+  - krakentools:
+      description: KrakenTools is a suite of scripts to be used for post-analysis of Kraken/KrakenUniq/Kraken2/Bracken results. Please cite the relevant paper if using KrakenTools with any of the listed programs.
+      homepage: https://github.com/jenniferlu717/KrakenTools
+      licence: ["GPL v3"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - kreports:
+      type: file
+      description: List of kraken-style report files
+      pattern: "*.{txt,kreport}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - txt:
+      type: file
+      description: Combined kreport file of all input files
+      pattern: "*.txt"
+
+authors:
+  - "@jfy133"
--- a/modules/nf-core/modules/untar/main.nf
+++ b/modules/nf-core/modules/untar/main.nf
@ -25,12 +25,23 @@ process UNTAR {
    """
    mkdir output

-    tar \\
-        -C output --strip-components 1 \\
-        -xzvf \\
-        $args \\
-        $archive \\
-        $args2
+    ## Ensures --strip-components only applied when top level of tar contents is a directory
+    ## If just files or multiple directories, place all in output
+    if [[ \$(tar -tzf ${archive} | grep "/\$" | wc -l) -eq 1 ]]; then
+        tar \\
+            -C output --strip-components 1 \\
+            -xzvf \\
+            $args \\
+            $archive \\
+            $args2
+    else
+        tar \\
+            -C output \\
+            -xzvf \\
+            $args \\
+            $archive \\
+            $args2
+    fi

    mv output ${untar}

--- a/modules/nf-core/modules/untar/meta.yml
+++ b/modules/nf-core/modules/untar/meta.yml
@ -26,9 +26,9 @@ output:
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - untar:
-      type: file
-      description:
-      pattern: "*.*"
+      type: directory
+      description: Directory containing contents of archive
+      pattern: "*/"
  - versions:
      type: file
      description: File containing software versions
@ -36,3 +36,5 @@ output:
 authors:
  - "@joseespinosa"
  - "@drpatelh"
+  - "@matthdsm"
+  - "@jfy133"
--- a/subworkflows/local/standardisation_profiles.nf
+++ b/subworkflows/local/standardisation_profiles.nf
@ -2,9 +2,11 @@
 // Standardise output files e.g. aggregation
 //

-include { KAIJU_KAIJU2TABLE               } from '../../modules/nf-core/modules/kaiju/kaiju2table/main'
-include { MOTUS_MERGE                     } from '../../modules/nf-core/modules/motus/merge/main'
+include { KAIJU_KAIJU2TABLE                                                     } from '../../modules/nf-core/modules/kaiju/kaiju2table/main'
+include { KRAKENTOOLS_COMBINEKREPORTS                                           } from '../../modules/nf-core/modules/krakentools/combinekreports/main'
+include { KRAKENTOOLS_COMBINEKREPORTS as KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE } from '../../modules/nf-core/modules/krakentools/combinekreports/main'
 include { METAPHLAN3_MERGEMETAPHLANTABLES } from '../../modules/nf-core/modules/metaphlan3/mergemetaphlantables/main'
+include { MOTUS_MERGE                                                           } from '../../modules/nf-core/modules/motus/merge/main'

 workflow STANDARDISATION_PROFILES {
    take:
@ -24,6 +26,8 @@ workflow STANDARDISATION_PROFILES {
    ch_input_profiles = profiles
        .branch {
            motus: it[0]['tool'] == 'motus'
+            kraken2: it[0]['tool'] == 'kraken2'
+            centrifuge: it[0]['tool'] == 'centrifuge'
            metaphlan3: it[0]['tool'] == 'metaphlan3'
            unknown: true
        }
@ -45,6 +49,23 @@ workflow STANDARDISATION_PROFILES {
        Standardise and aggregate
    */

+    // Centrifuge
+
+    // Collect and replace id with db_name for prefix
+    // Sort files by size (largest first) so that the first file actually has hits;
+    // otherwise the script fails
+    ch_profiles_for_centrifuge = ch_input_profiles.centrifuge
+                                .map { [it[0]['db_name'], it[1]] }
+                                .groupTuple(sort: {-it.size()} )
+                                .map {
+                                    [[id:it[0]], it[1]]
+                                }
+
+    KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE ( ch_profiles_for_centrifuge )
+    ch_standardised_tables = ch_standardised_tables.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.txt )
+    ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.txt )
+    ch_versions = ch_versions.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.versions )
+
    // Kaiju

    // Collect and replace id for db_name for prefix
@ -60,7 +81,25 @@ workflow STANDARDISATION_PROFILES {
    ch_multiqc_files = ch_multiqc_files.mix( KAIJU_KAIJU2TABLE.out.summary )
    ch_versions = ch_versions.mix( KAIJU_KAIJU2TABLE.out.versions )

+    // Kraken2
+
+    // Collect and replace id with db_name for prefix
+    // Sort files by size (largest first) so that the first file actually has hits;
+    // otherwise the script fails
+    ch_profiles_for_kraken2 = ch_input_profiles.kraken2
+                                .map { [it[0]['db_name'], it[1]] }
+                                .groupTuple(sort: {-it.size()} )
+                                .map {
+                                    [[id:it[0]], it[1]]
+                                }
+
+    KRAKENTOOLS_COMBINEKREPORTS ( ch_profiles_for_kraken2 )
+    ch_standardised_tables = ch_standardised_tables.mix( KRAKENTOOLS_COMBINEKREPORTS.out.txt )
+    ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS.out.txt )
+    ch_versions = ch_versions.mix( KRAKENTOOLS_COMBINEKREPORTS.out.versions )
+
    // MetaPhlAn3
+    
    ch_profiles_for_metaphlan3 = ch_input_profiles.metaphlan3
                            .map { [it[0]['db_name'], it[1]] }
                            .groupTuple()