mirror of
https://github.com/MillironX/taxprofiler.git
synced 2024-12-22 14:58:17 +00:00
Add bbduk complexity (entropy-based) filtering
This commit is contained in:
parent
1dfbcacf68
commit
b055df5ea0
13 changed files with 222 additions and 45 deletions
|
@ -26,6 +26,8 @@
|
|||
|
||||
- [Porechop](https://github.com/rrwick/Porechop)
|
||||
|
||||
- [BBTools](http://sourceforge.net/projects/bbmap/)
|
||||
|
||||
- [Kraken2](https://doi.org/10.1186/s13059-019-1891-0)
|
||||
|
||||
> Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. “Improved Metagenomic Analysis with Kraken 2.” Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0.
|
||||
|
|
|
@ -132,7 +132,6 @@ process {
|
|||
]
|
||||
}
|
||||
|
||||
|
||||
withName: PORECHOP {
|
||||
ext.prefix = { "${meta.id}_${meta.run_accession}" }
|
||||
publishDir = [
|
||||
|
@ -142,11 +141,17 @@ process {
|
|||
]
|
||||
}
|
||||
|
||||
withName: CAT_FASTQ {
|
||||
withName: BBMAP_BBDUK {
|
||||
ext.args = [
|
||||
"entropy=${params.shortread_complexityfilter_bbduk_entropy}",
|
||||
"entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}",
|
||||
params.shortread_complexityfilter_bbduk_mask ? "entropymask=t" : "entropymask=f"
|
||||
].join(' ').trim()
|
||||
ext.prefix = { "${meta.id}-${meta.run_accession}" }
|
||||
publishDir = [
|
||||
path: { "${params.outdir}/prepared_sequences" },
|
||||
mode: 'copy',
|
||||
pattern: '*.fastq.gz'
|
||||
path: { "${params.outdir}/bbduk/" },
|
||||
mode: params.publish_dir_mode,
|
||||
pattern: '*.{fastq.gz,log}'
|
||||
]
|
||||
}
|
||||
|
||||
|
|
|
@ -6,6 +6,9 @@
|
|||
"adapterremoval": {
|
||||
"git_sha": "f0800157544a82ae222931764483331a81812012"
|
||||
},
|
||||
"bbmap/bbduk": {
|
||||
"git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
|
||||
},
|
||||
"cat/fastq": {
|
||||
"git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
|
||||
},
|
||||
|
|
43
modules/nf-core/modules/bbmap/bbduk/main.nf
generated
Normal file
43
modules/nf-core/modules/bbmap/bbduk/main.nf
generated
Normal file
|
@ -0,0 +1,43 @@
|
|||
process BBMAP_BBDUK {
|
||||
tag "$meta.id"
|
||||
label 'process_medium'
|
||||
|
||||
conda (params.enable_conda ? "bioconda::bbmap=38.90" : null)
|
||||
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
|
||||
'https://depot.galaxyproject.org/singularity/bbmap:38.90--he522d1c_1' :
|
||||
'quay.io/biocontainers/bbmap:38.90--he522d1c_1' }"
|
||||
|
||||
input:
|
||||
tuple val(meta), path(reads)
|
||||
path contaminants
|
||||
|
||||
output:
|
||||
tuple val(meta), path('*.fastq.gz'), emit: reads
|
||||
tuple val(meta), path('*.log') , emit: log
|
||||
path "versions.yml" , emit: versions
|
||||
|
||||
when:
|
||||
task.ext.when == null || task.ext.when
|
||||
|
||||
script:
|
||||
def args = task.ext.args ?: ''
|
||||
def prefix = task.ext.prefix ?: "${meta.id}"
|
||||
def raw = meta.single_end ? "in=${reads[0]}" : "in1=${reads[0]} in2=${reads[1]}"
|
||||
def trimmed = meta.single_end ? "out=${prefix}.fastq.gz" : "out1=${prefix}_1.fastq.gz out2=${prefix}_2.fastq.gz"
|
||||
def contaminants_fa = contaminants ? "ref=$contaminants" : ''
|
||||
"""
|
||||
maxmem=\$(echo \"$task.memory\"| sed 's/ GB/g/g')
|
||||
bbduk.sh \\
|
||||
-Xmx\$maxmem \\
|
||||
$raw \\
|
||||
$trimmed \\
|
||||
threads=$task.cpus \\
|
||||
$args \\
|
||||
$contaminants_fa \\
|
||||
&> ${prefix}.bbduk.log
|
||||
cat <<-END_VERSIONS > versions.yml
|
||||
"${task.process}":
|
||||
bbmap: \$(bbversion.sh)
|
||||
END_VERSIONS
|
||||
"""
|
||||
}
|
52
modules/nf-core/modules/bbmap/bbduk/meta.yml
generated
Normal file
52
modules/nf-core/modules/bbmap/bbduk/meta.yml
generated
Normal file
|
@ -0,0 +1,52 @@
|
|||
name: bbmap_bbduk
|
||||
description: Adapter and quality trimming of sequencing reads
|
||||
keywords:
|
||||
- trimming
|
||||
- adapter trimming
|
||||
- quality trimming
|
||||
tools:
|
||||
- bbmap:
|
||||
description: BBMap is a short read aligner, as well as various other bioinformatic tools.
|
||||
homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/
|
||||
documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/
|
||||
tool_dev_url: None
|
||||
doi: ""
|
||||
licence: ["UC-LBL license (see package)"]
|
||||
|
||||
input:
|
||||
- meta:
|
||||
type: map
|
||||
description: |
|
||||
Groovy Map containing sample information
|
||||
e.g. [ id:'test', single_end:false ]
|
||||
- reads:
|
||||
type: file
|
||||
description: |
|
||||
List of input FastQ files of size 1 and 2 for single-end and paired-end data,
|
||||
respectively.
|
||||
- contaminants:
|
||||
type: file
|
||||
description: |
|
||||
Reference files containing adapter and/or contaminant sequences for sequence kmer matching
|
||||
|
||||
output:
|
||||
- meta:
|
||||
type: map
|
||||
description: |
|
||||
Groovy Map containing sample information
|
||||
e.g. [ id:'test', single_end:false ]
|
||||
- reads:
|
||||
type: file
|
||||
description: The trimmed/modified fastq reads
|
||||
pattern: "*fastq.gz"
|
||||
- versions:
|
||||
type: file
|
||||
description: File containing software versions
|
||||
pattern: "versions.yml"
|
||||
- log:
|
||||
type: file
|
||||
description: Bbduk log file
|
||||
pattern: "*bbduk.log"
|
||||
|
||||
authors:
|
||||
- "@MGordon09"
|
|
@ -65,6 +65,13 @@ params {
|
|||
shortread_clipmerge_minlength = 15
|
||||
longread_clip = false
|
||||
|
||||
// Complexity filtering
|
||||
shortread_complexityfilter = false
|
||||
shortread_complexityfilter_tool = 'bbduk'
|
||||
shortread_complexityfilter_bbduk_entropy = 0.3
|
||||
shortread_complexityfilter_bbduk_windowsize = 50
|
||||
shortread_complexityfilter_bbduk_mask = false
|
||||
|
||||
// MALT
|
||||
run_malt = false
|
||||
malt_mode = 'BlastN'
|
||||
|
|
|
@ -266,8 +266,7 @@
|
|||
"type": "boolean"
|
||||
},
|
||||
"shortread_clipmerge_excludeunmerged": {
|
||||
"type": "boolean",
|
||||
"default": false
|
||||
"type": "boolean"
|
||||
},
|
||||
"longread_clip": {
|
||||
"type": "boolean"
|
||||
|
@ -304,6 +303,24 @@
|
|||
"shortread_clipmerge_minlength": {
|
||||
"type": "integer",
|
||||
"default": 15
|
||||
},
|
||||
"shortread_complexityfilter_tool": {
|
||||
"type": "string",
|
||||
"default": "bbduk"
|
||||
},
|
||||
"shortread_complexityfilter_bbduk_entropy": {
|
||||
"type": "number",
|
||||
"default": 0.3
|
||||
},
|
||||
"shortread_complexityfilter_bbduk_windowsize": {
|
||||
"type": "integer",
|
||||
"default": 50
|
||||
},
|
||||
"shortread_complexityfilter_bbduk_mask": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"shortread_complexityfilter": {
|
||||
"type": "boolean"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
Process long raw reads with porechop
|
||||
*/
|
||||
//
|
||||
// Process long raw reads with porechop
|
||||
//
|
||||
|
||||
include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main'
|
||||
include { PORECHOP } from '../../modules/nf-core/modules/porechop/main'
|
||||
|
@ -25,7 +25,7 @@ workflow LONGREAD_PREPROCESSING {
|
|||
|
||||
FASTQC_PROCESSED ( PORECHOP.out.reads )
|
||||
ch_versions = ch_versions.mix(PORECHOP.out.versions.first())
|
||||
ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
|
||||
ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip )
|
||||
|
||||
|
||||
emit:
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
Process short raw reads with AdapterRemoval
|
||||
*/
|
||||
//
|
||||
// Process short raw reads with AdapterRemoval
|
||||
//
|
||||
|
||||
include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE } from '../../modules/nf-core/modules/adapterremoval/main'
|
||||
include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED } from '../../modules/nf-core/modules/adapterremoval/main'
|
||||
|
|
28
subworkflows/local/shortread_complexityfiltering.nf
Normal file
28
subworkflows/local/shortread_complexityfiltering.nf
Normal file
|
@ -0,0 +1,28 @@
|
|||
//
|
||||
// Perform complexity (entropy-based) filtering of short reads
|
||||
//
|
||||
|
||||
include { BBMAP_BBDUK } from '../../modules/nf-core/modules/bbmap/bbduk/main'
|
||||
|
||||
workflow SHORTREAD_COMPLEXITYFILTERING {
|
||||
take:
|
||||
reads // [ [ meta ], [ reads ] ]
|
||||
|
||||
main:
|
||||
ch_versions = Channel.empty()
|
||||
ch_multiqc_files = Channel.empty()
|
||||
|
||||
if ( params.shortread_complexityfilter_tool == 'bbduk' ) {
|
||||
ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads
|
||||
ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() )
|
||||
ch_multiqc_files = ch_multiqc_files.mix( BBMAP_BBDUK.out.log )
|
||||
} else {
|
||||
ch_filtered_reads = reads
|
||||
}
|
||||
|
||||
emit:
|
||||
reads = ch_filtered_reads // channel: [ val(meta), [ reads ] ]
|
||||
versions = ch_versions // channel: [ versions.yml ]
|
||||
mqc = ch_multiqc_files
|
||||
}
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
Process short raw reads with FastP
|
||||
*/
|
||||
//
|
||||
// Process short raw reads with FastP
|
||||
//
|
||||
|
||||
include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fastp/main'
|
||||
include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main'
|
||||
|
@ -44,8 +44,8 @@ workflow SHORTREAD_FASTP {
|
|||
|
||||
ch_processed_reads = ch_fastp_reads_prepped
|
||||
|
||||
ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
|
||||
ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
|
||||
ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json )
|
||||
ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json )
|
||||
|
||||
emit:
|
||||
reads = ch_processed_reads // channel: [ val(meta), [ reads ] ]
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Check input samplesheet and get read channels
|
||||
// Perform read trimming and merging
|
||||
//
|
||||
|
||||
|
||||
|
@ -9,7 +9,7 @@ include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules
|
|||
|
||||
workflow SHORTREAD_PREPROCESSING {
|
||||
take:
|
||||
reads // file: /path/to/samplesheet.csv
|
||||
reads // [ [ meta ], [ reads ] ]
|
||||
|
||||
main:
|
||||
ch_versions = Channel.empty()
|
||||
|
@ -29,7 +29,7 @@ workflow SHORTREAD_PREPROCESSING {
|
|||
|
||||
FASTQC_PROCESSED ( ch_processed_reads )
|
||||
ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
|
||||
ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
|
||||
ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip )
|
||||
|
||||
emit:
|
||||
reads = ch_processed_reads // channel: [ val(meta), [ reads ] ]
|
||||
|
|
|
@ -40,9 +40,10 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi
|
|||
//
|
||||
include { INPUT_CHECK } from '../subworkflows/local/input_check'
|
||||
|
||||
include { DB_CHECK } from '../subworkflows/local/db_check'
|
||||
include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing'
|
||||
include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing'
|
||||
include { DB_CHECK } from '../subworkflows/local/db_check'
|
||||
include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing'
|
||||
include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing'
|
||||
include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering'
|
||||
|
||||
/*
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -61,7 +62,6 @@ include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fas
|
|||
include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main'
|
||||
include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main'
|
||||
|
||||
|
||||
/*
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
RUN MAIN WORKFLOW
|
||||
|
@ -98,10 +98,6 @@ workflow TAXPROFILER {
|
|||
|
||||
ch_versions = ch_versions.mix(FASTQC.out.versions.first())
|
||||
|
||||
CUSTOM_DUMPSOFTWAREVERSIONS (
|
||||
ch_versions.unique().collectFile(name: 'collated_versions.yml')
|
||||
)
|
||||
|
||||
/*
|
||||
SUBWORKFLOW: PERFORM PREPROCESSING
|
||||
*/
|
||||
|
@ -114,17 +110,26 @@ workflow TAXPROFILER {
|
|||
if ( params.longread_clip ) {
|
||||
ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads
|
||||
.map { it -> [ it[0], [it[1]] ] }
|
||||
ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first())
|
||||
} else {
|
||||
ch_longreads_preprocessed = INPUT_CHECK.out.nanopore
|
||||
}
|
||||
|
||||
/*
|
||||
SUBWORKFLOW: COMPLEXITY FILTERING
|
||||
*/
|
||||
|
||||
if ( params.shortread_complexityfilter ) {
|
||||
ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads
|
||||
} else {
|
||||
ch_shortreads_filtered = ch_shortreads_preprocessed
|
||||
}
|
||||
|
||||
/*
|
||||
COMBINE READS WITH POSSIBLE DATABASES
|
||||
*/
|
||||
|
||||
// e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
|
||||
ch_input_for_profiling = ch_shortreads_preprocessed
|
||||
ch_input_for_profiling = ch_shortreads_filtered
|
||||
.mix( ch_longreads_preprocessed )
|
||||
.combine(DB_CHECK.out.dbs)
|
||||
.branch {
|
||||
|
@ -177,6 +182,12 @@ workflow TAXPROFILER {
|
|||
/*
|
||||
MODULE: MultiQC
|
||||
*/
|
||||
|
||||
CUSTOM_DUMPSOFTWAREVERSIONS (
|
||||
ch_versions.unique().collectFile(name: 'collated_versions.yml')
|
||||
)
|
||||
|
||||
|
||||
workflow_summary = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params)
|
||||
ch_workflow_summary = Channel.value(workflow_summary)
|
||||
|
||||
|
@ -188,21 +199,30 @@ workflow TAXPROFILER {
|
|||
ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
|
||||
|
||||
if (params.shortread_clipmerge) {
|
||||
ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_PREPROCESSING.out.mqc)
|
||||
}
|
||||
if (params.longread_clip) {
|
||||
ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc)
|
||||
}
|
||||
if (params.run_kraken2) {
|
||||
ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]))
|
||||
ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first())
|
||||
}
|
||||
if (params.run_malt) {
|
||||
ch_multiqc_files = ch_multiqc_files.mix(MALT_RUN.out.log.collect{it[1]}.ifEmpty([]))
|
||||
ch_versions = ch_versions.mix(MALT_RUN.out.versions.first())
|
||||
ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_shortclipmerge")
|
||||
ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions )
|
||||
}
|
||||
|
||||
if (params.longread_clip) {
|
||||
ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_longclipmerge")
|
||||
ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions )
|
||||
}
|
||||
|
||||
if (params.shortread_complexityfilter){
|
||||
ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_compelxity")
|
||||
ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions )
|
||||
}
|
||||
|
||||
if (params.run_kraken2) {
|
||||
ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_kraken")
|
||||
ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() )
|
||||
}
|
||||
|
||||
if (params.run_malt) {
|
||||
ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ).dump(tag: "mqc_malt")
|
||||
ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() )
|
||||
}
|
||||
|
||||
// TODO MALT results overwriting per database?
|
||||
// TODO Versions for Kraken/MALT not reported?
|
||||
MULTIQC (
|
||||
ch_multiqc_files.collect()
|
||||
|
|
Loading…
Reference in a new issue