From 3ff54e620e9b9212a3bad5c687769b8c37e5b89d Mon Sep 17 00:00:00 2001 From: sofstam Date: Thu, 24 Mar 2022 12:51:45 +0100 Subject: [PATCH] Add centrifuge classification --- conf/modules.config | 10 +++ conf/test.config | 1 + modules.json | 5 +- modules/nf-core/modules/centrifuge/main.nf | 63 ++++++++++++++++++ modules/nf-core/modules/centrifuge/meta.yml | 73 +++++++++++++++++++++ nextflow.config | 8 ++- subworkflows/local/db_check.nf | 2 +- subworkflows/local/input_check.nf | 3 +- workflows/taxprofiler.nf | 22 +++++-- 9 files changed, 179 insertions(+), 8 deletions(-) create mode 100644 modules/nf-core/modules/centrifuge/main.nf create mode 100644 modules/nf-core/modules/centrifuge/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 29a5135..20e6bba 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -121,4 +121,14 @@ process { ] } + withName: CENTRIFUGE { + publishDir = [ + path: { "${params.outdir}/centrifuge/${meta.db_name}" }, + mode: 'copy', + pattern: '*.{fastq.gz,txt}' + ] + ext.args = { "${meta.db_params}" } + ext.prefix = { "${meta.id}-${meta.db_name}" } + } + } diff --git a/conf/test.config b/conf/test.config index 42d8de6..6fca9c0 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,5 +29,6 @@ params { run_kraken2 = true run_malt = true shortread_clipmerge = true + run_centrifuge = true } diff --git a/modules.json b/modules.json index 673a69b..b9dfc87 100644 --- a/modules.json +++ b/modules.json @@ -29,6 +29,9 @@ "porechop": { "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046" } + "centrifuge": { + "git_sha": "ea41a8a6f761b9993d857570e872abaae3fea555" + } } } -} \ No newline at end of file +} diff --git a/modules/nf-core/modules/centrifuge/main.nf b/modules/nf-core/modules/centrifuge/main.nf new file mode 100644 index 0000000..7eb566d --- /dev/null +++ b/modules/nf-core/modules/centrifuge/main.nf @@ -0,0 +1,63 @@ +process CENTRIFUGE { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' : + 'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }" + + input: + tuple val(meta), path(reads) + path db + val save_unaligned + val save_aligned + val sam_format + + output: + tuple val(meta), path('*report.txt') , emit: report + tuple val(meta), path('*results.txt') , emit: results + tuple val(meta), path('*kreport.txt') , emit: kreport + tuple val(meta), path('*.sam') , optional: true, emit: sam + tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_mapped + tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + def db_name = db.toString().replace(".tar.gz","") + def unaligned = '' + def aligned = '' + if (meta.single_end) { + unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : '' + } else { + unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : '' + } + def sam_output = sam_format ? "--out-fmt 'sam'" : '' + """ + tar -xf $db + centrifuge \\ + -x $db_name \\ + -p $task.cpus \\ + $paired \\ + --report-file ${prefix}.report.txt \\ + -S ${prefix}.results.txt \\ + $unaligned \\ + $aligned \\ + $sam_output \\ + $args + centrifuge-kreport -x $db_name ${prefix}.results.txt > ${prefix}.kreport.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/centrifuge/meta.yml b/modules/nf-core/modules/centrifuge/meta.yml new file mode 100644 index 0000000..3adf0e2 --- /dev/null +++ b/modules/nf-core/modules/centrifuge/meta.yml @@ -0,0 +1,73 @@ +name: centrifuge +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - centrifuge: + description: Centrifuge is a classifier for metagenomic sequences. + homepage: https://ccb.jhu.edu/software/centrifuge/ + documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml + doi: 10.1101/gr.210641.116 + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Centrifuge database in .tar.gz format + pattern: "*.tar.gz" + - save_unaligned: + type: value + description: If true unmapped fastq files are saved + - save_aligned: + type: value + description: If true mapped fastq files are saved +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - report: + type: file + description: | + File containing a classification summary + pattern: "*.{report.txt}" + - results: + type: file + description: | + File containing classification results + pattern: "*.{results.txt}" + - kreport: + type: file + description: | + File containing kraken-style report from centrifuge + out files. + pattern: "*.{kreport.txt}" + - fastq_unmapped: + type: file + description: Unmapped fastq files + pattern: "*.unmapped.fastq.gz" + - fastq_mapped: + type: file + description: Mapped fastq files + pattern: "*.mapped.fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@sofstam" + - "@jfy133" + - "@sateeshperi" diff --git a/nextflow.config b/nextflow.config index 5f7aec6..5bd8f39 100644 --- a/nextflow.config +++ b/nextflow.config @@ -56,7 +56,7 @@ params { // FASTQ preprocessing shortread_clipmerge = false - shortread_excludeunmerged = true + shortread_excludeunmerged = true longread_clip = false // MALT @@ -65,6 +65,12 @@ params { // kraken2 run_kraken2 = false + + // centrifuge + run_centrifuge = false + save_unaligned = false + save_aligned = false + sam_format = false } // Load base.config by default for all pipelines diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf index 890e373..28268c3 100644 --- a/subworkflows/local/db_check.nf +++ b/subworkflows/local/db_check.nf @@ -21,7 +21,7 @@ workflow DB_CHECK { ch_dbs_for_untar = parsed_samplesheet .branch { - untar: it[1].toString().endsWith(".tar.gz") + untar: it[1].toString().endsWith(".tar.gz") && it[0]['tool']!="centrifuge" skip: true } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 4501386..b64e31e 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -67,8 +67,9 @@ def create_fastq_channel(LinkedHashMap row) { if (!file(row.fastq_2).exists()) { exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] + fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] } + } return fastq_meta } diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 6fc5450..ea3ef18 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -58,7 +58,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/ include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' - +include { CENTRIFUGE } from '../modules/nf-core/modules/centrifuge/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -149,9 +149,10 @@ workflow TAXPROFILER { .combine(DB_CHECK.out.dbs) .dump(tag: "reads_plus_db") .branch { - malt: it[2]['tool'] == 'malt' - kraken2: it[2]['tool'] == 'kraken2' - unknown: true + malt: it[2]['tool'] == 'malt' + kraken2: it[2]['tool'] == 'kraken2' + centrifuge: it[2]['tool'] == 'centrifuge' + unknown: true } // @@ -184,6 +185,15 @@ workflow TAXPROFILER { db: it[3] } + // We can run centrifuge one-by-one sample-wise + ch_input_for_centrifuge = ch_input_for_profiling.centrifuge + .dump(tag: "input for centrifuge") + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + // // RUN PROFILING // @@ -195,6 +205,10 @@ workflow TAXPROFILER { KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) } + if ( params.run_centrifuge ) { + CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.save_unaligned, params.save_aligned, params.sam_format ) + } + // // MODULE: MultiQC //