1
0
Fork 0
mirror of https://github.com/MillironX/taxprofiler.git synced 2024-11-25 19:09:56 +00:00

Add FASTP complexity option

This commit is contained in:
James Fellows Yates 2022-05-07 06:09:05 +02:00
parent e7b54801ed
commit 47a5ae0cff
9 changed files with 153 additions and 14 deletions

View file

@ -54,7 +54,8 @@ process {
params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "", params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "",
params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
// filtering options // filtering options
"--length_required ${params.shortread_clipmerge_minlength}" "--length_required ${params.shortread_clipmerge_minlength}",
params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
].join(' ').trim() ].join(' ').trim()
ext.prefix = { "${meta.id}_${meta.run_accession}" } ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [ publishDir = [
@ -74,7 +75,8 @@ process {
params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe", params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe",
// filtering options // filtering options
"--length_required ${params.shortread_clipmerge_minlength}" "--length_required ${params.shortread_clipmerge_minlength}",
params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
].join(' ').trim() ].join(' ').trim()
ext.prefix = { "${meta.id}_${meta.run_accession}" } ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [ publishDir = [

View file

@ -29,6 +29,7 @@ params {
perform_shortread_complexityfilter = true perform_shortread_complexityfilter = true
perform_shortread_hostremoval = true perform_shortread_hostremoval = true
perform_longread_hostremoval = true perform_longread_hostremoval = true
perform_runmerging = true
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
run_kaiju = true run_kaiju = true
run_kraken2 = true run_kraken2 = true

View file

@ -0,0 +1,46 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/taxprofiler -profile test,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/
params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset skipping all preprocessing to check pipeline function'
// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'
// Input data
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
perform_shortread_clipmerge = false
perform_longread_clip = false
perform_shortread_complexityfilter = false
perform_shortread_hostremoval = false
perform_longread_hostremoval = false
perform_runmerging = false
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
run_kaiju = true
run_kraken2 = true
run_malt = true
run_metaphlan3 = true
run_centrifuge = true
run_diamond = true
}
process {
withName: MALT_RUN {
maxForks = 1
}
}

View file

@ -0,0 +1,46 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/taxprofiler -profile test,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/
params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset without performing any profiling to check pipeline function'
// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'
// Input data
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
perform_shortread_clipmerge = true
perform_longread_clip = true
perform_shortread_complexityfilter = true
perform_shortread_hostremoval = true
perform_longread_hostremoval = true
perform_runmerging = true
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
run_kaiju = false
run_kraken2 = false
run_malt = false
run_metaphlan3 = false
run_centrifuge = false
run_diamond = false
}
process {
withName: MALT_RUN {
maxForks = 1
}
}

View file

@ -183,11 +183,11 @@ Complexity filtering can be activated via the `--perform_shortread_complexityfil
Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources. Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources.
There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/). There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter).
The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset. The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset.
You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`. You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`. If running with `fastp`, complexity filtering happens inclusively within the earlier shortread preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity filtered reads will also be in the `fastp/` folder in the same file(s) as the preprocessed read.
#### Host Removal #### Host Removal

View file

@ -74,6 +74,7 @@ params {
shortread_complexityfilter_bbduk_mask = false shortread_complexityfilter_bbduk_mask = false
shortread_complexityfilter_prinseqplusplus_mode = 'entropy' shortread_complexityfilter_prinseqplusplus_mode = 'entropy'
shortread_complexityfilter_prinseqplusplus_dustscore = 0.5 shortread_complexityfilter_prinseqplusplus_dustscore = 0.5
shortread_complexityfilter_fastp_threshold = 30
save_complexityfiltered_reads = false save_complexityfiltered_reads = false
// run merging // run merging
@ -185,6 +186,8 @@ profiles {
} }
test { includeConfig 'conf/test.config' } test { includeConfig 'conf/test.config' }
test_full { includeConfig 'conf/test_full.config' } test_full { includeConfig 'conf/test_full.config' }
test_noprofiling { includeConfig 'conf/test_noprofiling.config' }
test_nopreprocessing { includeConfig 'conf/test_preprocessing.config' }
} }
// Load igenomes.config if required // Load igenomes.config if required

View file

@ -10,7 +10,10 @@
"type": "object", "type": "object",
"fa_icon": "fas fa-terminal", "fa_icon": "fas fa-terminal",
"description": "Define where the pipeline should find input data and save output data.", "description": "Define where the pipeline should find input data and save output data.",
"required": ["input", "outdir"], "required": [
"input",
"outdir"
],
"properties": { "properties": {
"input": { "input": {
"type": "string", "type": "string",
@ -173,7 +176,14 @@
"description": "Method used to save pipeline results to output directory.", "description": "Method used to save pipeline results to output directory.",
"help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
"fa_icon": "fas fa-copy", "fa_icon": "fas fa-copy",
"enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "enum": [
"symlink",
"rellink",
"link",
"copy",
"copyNoFollow",
"move"
],
"hidden": true "hidden": true
}, },
"email_on_fail": { "email_on_fail": {
@ -294,7 +304,10 @@
"shortread_clipmerge_tool": { "shortread_clipmerge_tool": {
"type": "string", "type": "string",
"default": "fastp", "default": "fastp",
"enum": ["fastp", "adapterremoval"] "enum": [
"fastp",
"adapterremoval"
]
}, },
"shortread_clipmerge_skipadaptertrim": { "shortread_clipmerge_skipadaptertrim": {
"type": "boolean" "type": "boolean"
@ -319,7 +332,12 @@
}, },
"shortread_complexityfilter_tool": { "shortread_complexityfilter_tool": {
"type": "string", "type": "string",
"default": "bbduk" "default": "bbduk",
"enum": [
"bbduk",
"prinseqplusplus",
"fastp"
]
}, },
"shortread_complexityfilter_bbduk_windowsize": { "shortread_complexityfilter_bbduk_windowsize": {
"type": "integer", "type": "integer",
@ -335,7 +353,10 @@
"shortread_complexityfilter_prinseqplusplus_mode": { "shortread_complexityfilter_prinseqplusplus_mode": {
"type": "string", "type": "string",
"default": "entropy", "default": "entropy",
"enum": ["entropy", "dust"] "enum": [
"entropy",
"dust"
]
}, },
"shortread_complexityfilter_prinseqplusplus_dustscore": { "shortread_complexityfilter_prinseqplusplus_dustscore": {
"type": "number", "type": "number",
@ -391,7 +412,14 @@
"kaiju_taxon_name": { "kaiju_taxon_name": {
"type": "string", "type": "string",
"default": "species", "default": "species",
"enum": ["phylum", "class", "order", "family", "genus", "species"] "enum": [
"phylum",
"class",
"order",
"family",
"genus",
"species"
]
}, },
"run_diamond": { "run_diamond": {
"type": "boolean" "type": "boolean"
@ -399,7 +427,15 @@
"diamond_output_format": { "diamond_output_format": {
"type": "string", "type": "string",
"default": "tsv", "default": "tsv",
"enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] "enum": [
"blast",
"xml",
"txt",
"daa",
"sam",
"tsv",
"paf"
]
}, },
"longread_hostremoval_index": { "longread_hostremoval_index": {
"type": "string", "type": "string",

View file

@ -13,6 +13,7 @@ workflow SHORTREAD_COMPLEXITYFILTERING {
ch_versions = Channel.empty() ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty() ch_multiqc_files = Channel.empty()
// fastp complexity filtering is activated via modules.conf in shortread_preprocessing
if ( params.shortread_complexityfilter_tool == 'bbduk' ) { if ( params.shortread_complexityfilter_tool == 'bbduk' ) {
ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads
ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() ) ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() )

View file

@ -19,9 +19,12 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
// Check mandatory parameters // Check mandatory parameters
if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files."
if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs"
if (params.shortread_complexityfilter_tool == 'fastp' && ( params.perform_shortread_clipmerge == false || params.shortread_clipmerge_tool != 'fastp' )) exit 1, "ERROR: [nf-core/taxprofiler] cannot use fastp complexity filtering if preprocessing not turned on and/or tool is not fastp. Please specify --perform_shortread_clipmerge and/or --shortread_clipmerge_tool 'fastp'"
if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." } if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." }
if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." } if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." }
@ -131,7 +134,8 @@ workflow TAXPROFILER {
SUBWORKFLOW: COMPLEXITY FILTERING SUBWORKFLOW: COMPLEXITY FILTERING
*/ */
if ( params.perform_shortread_complexityfilter ) { // fastp complexity filtering is activated via modules.conf in shortread_preprocessing
if ( params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp' ) {
ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads
ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions )
} else { } else {
@ -228,7 +232,7 @@ workflow TAXPROFILER {
ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )
} }
if (params.perform_shortread_complexityfilter){ if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp'){
ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) )
} }