1
0
Fork 0
mirror of https://github.com/MillironX/taxprofiler.git synced 2024-11-21 16:36:03 +00:00

Add Kraken2 and MALT/run as Proof of Concept (currnetly MQC issue)

This commit is contained in:
James Fellows Yates 2022-03-03 17:42:02 +01:00
parent 278f5605ca
commit 2c183ed2ed
8 changed files with 315 additions and 7 deletions

View file

@ -68,6 +68,26 @@ process {
]
}
withName: MALT_RUN {
publishDir = [
path: { "${params.outdir}/malt/${meta.db_name}" },
mode: 'copy',
pattern: '*.{rma6,tab,text,sam,log}'
]
ext.args = { "${meta.db_params}" }
ext.when = params.run_malt
}
withName: KRAKEN2_KRAKEN2 {
publishDir = [
path: { "${params.outdir}/kraken2/${meta.db_name}" },
mode: 'copy',
pattern: '.{fastq.gz,txt}'
]
ext.args = { "${meta.db_params}" }
ext.when = params.run_kraken2
ext.prefix = { "${meta.id}-${meta.db_name}" }
}
withName: CUSTOM_DUMPSOFTWAREVERSIONS {
publishDir = [

View file

@ -15,6 +15,12 @@
"fastqc": {
"git_sha": "9d0cad583b9a71a6509b754fdf589cbfbed08961"
},
"kraken2/kraken2": {
"git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
},
"malt/run": {
"git_sha": "76cdd46f3f8a77fb5023fb5a39c4ab99925b8b56"
},
"multiqc": {
"git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41"
}

View file

@ -0,0 +1,49 @@
process KRAKEN2_KRAKEN2 {
tag "$meta.id"
label 'process_high'
conda (params.enable_conda ? 'bioconda::kraken2=2.1.2 conda-forge::pigz=2.6' : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' :
'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }"
input:
tuple val(meta), path(reads)
path db
output:
tuple val(meta), path('*classified*') , emit: classified
tuple val(meta), path('*unclassified*'), emit: unclassified
tuple val(meta), path('*report.txt') , emit: txt
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def paired = meta.single_end ? "" : "--paired"
def classified = meta.single_end ? "${prefix}.classified.fastq" : "${prefix}.classified#.fastq"
def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq"
"""
kraken2 \\
--db $db \\
--threads $task.cpus \\
--unclassified-out $unclassified \\
--classified-out $classified \\
--report ${prefix}.kraken2.report.txt \\
--gzip-compressed \\
$paired \\
$args \\
$reads
pigz -p $task.cpus *.fastq
cat <<-END_VERSIONS > versions.yml
"${task.process}":
kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//')
pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
END_VERSIONS
"""
}

View file

@ -0,0 +1,60 @@
name: kraken2_kraken2
description: Classifies metagenomic sequence data
keywords:
- classify
- metagenomics
- fastq
- db
tools:
- kraken2:
description: |
Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads
homepage: https://ccb.jhu.edu/software/kraken2/
documentation: https://github.com/DerrickWood/kraken2/wiki/Manual
doi: 10.1186/s13059-019-1891-0
licence: ["MIT"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input FastQ files of size 1 and 2 for single-end and paired-end data,
respectively.
- db:
type: directory
description: Kraken2 database
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- classified:
type: file
description: |
Reads classified to belong to any of the taxa
on the Kraken2 database.
pattern: "*{fastq.gz}"
- unclassified:
type: file
description: |
Reads not classified to belong to any of the taxa
on the Kraken2 database.
pattern: "*{fastq.gz}"
- txt:
type: file
description: |
Kraken2 report containing stats about classified
and not classifed reads.
pattern: "*.{report.txt}"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@joseespinosa"
- "@drpatelh"

49
modules/nf-core/modules/malt/run/main.nf generated Normal file
View file

@ -0,0 +1,49 @@
process MALT_RUN {
tag "$meta.id"
label 'process_high'
conda (params.enable_conda ? "bioconda::malt=0.53" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/malt:0.53--hdfd78af_0' :
'quay.io/biocontainers/malt:0.53--hdfd78af_0' }"
input:
tuple val(meta), path(fastqs)
val mode
path index
output:
tuple val(meta), path("*.rma6") , emit: rma6
tuple val(meta), path("*.{tab,text,sam}"), optional:true, emit: alignments
tuple val(meta), path("*.log") , emit: log
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def avail_mem = 6
if (!task.memory) {
log.info '[MALT_RUN] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.'
} else {
avail_mem = task.memory.giga
}
"""
malt-run \\
-J-Xmx${avail_mem}g \\
-t $task.cpus \\
-v \\
-o . \\
$args \\
--inFile ${fastqs.join(' ')} \\
-m $mode \\
--index $index/ |&tee malt-run.log
cat <<-END_VERSIONS > versions.yml
"${task.process}":
malt: \$(malt-run --help 2>&1 | grep -o 'version.* ' | cut -f 1 -d ',' | cut -f2 -d ' ')
END_VERSIONS
"""
}

View file

@ -0,0 +1,58 @@
name: malt_run
description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics.
keywords:
- malt
- alignment
- metagenomics
- ancient DNA
- aDNA
- palaeogenomics
- archaeogenomics
- microbiome
tools:
- malt:
description: A tool for mapping metagenomic data
homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/
documentation: https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf
tool_dev_url: None
doi: "10.1038/s41559-017-0446-6"
licence: ["GPL v3"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- fastqs:
type: file
description: Input FASTQ files
pattern: "*.{fastq.gz,fq.gz}"
- mode:
type: string
description: Program mode
pattern: "Unknown|BlastN|BlastP|BlastX|Classifier"
- index:
type: directory
description: Index/database directory from malt-build
pattern: "*/"
output:
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- rma6:
type: file
description: MEGAN6 RMA6 file
pattern: "*.rma6"
- sam:
type: file
description: Alignment files in Tab, Text or MEGAN-compatible SAM format
pattern: "*.{tab,txt,sam}"
- log:
type: file
description: Log of verbose MALT stdout
pattern: "malt-run.log"
authors:
- "@jfy133"

View file

@ -54,8 +54,15 @@ params {
databases = null
// FASTQ preprocessing
fastp_clip_merge = false
fastp_exclude_unmerged = true
fastp_clip_merge = false
fastp_exclude_unmerged = true
// MALT
run_malt = false
malt_mode = 'BlastN'
// kraken2
run_kraken2 = false
}
// Load base.config by default for all pipelines

View file

@ -56,6 +56,9 @@ include { MULTIQC } from '../modules/nf-core/modules/multiqc
include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main'
include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main'
include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main'
include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main'
/*
========================================================================================
@ -95,13 +98,15 @@ workflow TAXPROFILER {
)
//
// MODULE: Run Clip/Merge/Complexity
// PERFORM PREPROCESSING
//
if ( params.fastp_clip_merge ) {
FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq )
}
// MODULE: Cat merge runs of same sample
//
// PERFORM RUN MERGING
//
ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads
.dump(tag: "prep_for_combine_grouping")
.map {
@ -118,15 +123,61 @@ workflow TAXPROFILER {
CAT_FASTQ ( ch_processed_for_combine.combine )
// Ready for profiling!
ch_reads_for_profiling = ch_processed_for_combine.skip
.dump(tag: "skip_combine")
.mix( CAT_FASTQ.out.reads )
.dump(tag: "files_for_profiling")
// Combine reads with possible databases
//
// COMBINE READS WITH POSSIBLE DATABASES
//
// output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
ch_input_for_profiling = ch_reads_for_profiling
.combine(DB_CHECK.out.dbs)
.dump(tag: "reads_plus_db")
.branch {
malt: it[2]['tool'] == 'malt'
kraken2: it[2]['tool'] == 'kraken2'
unknown: true
}
//
// PREP PROFILER INPUT CHANNELS ON PER TOOL BASIS
//
// We groupTuple to have all samples in one channel for MALT as database
// loading takes a long time, so we only want to run it once per database
ch_input_for_malt = ch_input_for_profiling.malt
.map {
it ->
def temp_meta = [ id: it[2]['db_name']] + it[2]
def db = it[3]
[ temp_meta, it[1], db ]
}
.groupTuple(by: [0,2])
.dump(tag: "input for malt")
.multiMap {
it ->
reads: [ it[0], it[1].flatten() ]
db: it[2]
}
// We can run Kraken2 one-by-one sample-wise
ch_input_for_kraken2 = ch_input_for_profiling.kraken2
.dump(tag: "input for kraken")
.multiMap {
it ->
reads: [ it[0] + it[2], it[1] ]
db: it[3]
}
//
// RUN PROFILING
//
MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db )
ch_reads_for_profiling.combine(DB_CHECK.out.dbs).dump(tag: "reads_plus_db")
//
// MODULE: MultiQC
@ -143,6 +194,14 @@ workflow TAXPROFILER {
if (params.fastp_clip_merge) {
ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc)
}
if (params.run_kraken2) {
ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]))
ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first())
}
if (params.run_malt) {
ch_multiqc_files = ch_multiqc_files.mix(MALT_RUN.out.log.collect{it[1]}.ifEmpty([]))
ch_versions = ch_versions.mix(MALT_RUN.out.versions.first())
}
MULTIQC (