mirror of
https://github.com/MillironX/taxprofiler.git
synced 2024-11-25 22:19:54 +00:00
Merge pull request #79 from nf-core/fastp-complexity
Add FASTP complexity option
This commit is contained in:
commit
fd71b71929
10 changed files with 117 additions and 9 deletions
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
|
@ -38,7 +38,7 @@ jobs:
|
||||||
- "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged"
|
- "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged"
|
||||||
- "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs"
|
- "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs"
|
||||||
- "--shortread_complexityfilter_tool bbduk"
|
- "--shortread_complexityfilter_tool bbduk"
|
||||||
- "--shortread_complexityfilter_tool prinseq"
|
- "--shortread_complexityfilter_tool prinseqplusplus"
|
||||||
- "--perform_runmerging"
|
- "--perform_runmerging"
|
||||||
- "--perform_runmerging --shortread_clipmerge_mergepairs"
|
- "--perform_runmerging --shortread_clipmerge_mergepairs"
|
||||||
- "--shortread_complexityfilter false --perform_shortread_hostremoval"
|
- "--shortread_complexityfilter false --perform_shortread_hostremoval"
|
||||||
|
|
|
@ -54,7 +54,8 @@ process {
|
||||||
params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "",
|
params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "",
|
||||||
params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
|
params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
|
||||||
// filtering options
|
// filtering options
|
||||||
"--length_required ${params.shortread_clipmerge_minlength}"
|
"--length_required ${params.shortread_clipmerge_minlength}",
|
||||||
|
(params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp') ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
|
||||||
].join(' ').trim()
|
].join(' ').trim()
|
||||||
ext.prefix = { "${meta.id}_${meta.run_accession}" }
|
ext.prefix = { "${meta.id}_${meta.run_accession}" }
|
||||||
publishDir = [
|
publishDir = [
|
||||||
|
@ -74,7 +75,8 @@ process {
|
||||||
params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
|
params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
|
||||||
params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe",
|
params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe",
|
||||||
// filtering options
|
// filtering options
|
||||||
"--length_required ${params.shortread_clipmerge_minlength}"
|
"--length_required ${params.shortread_clipmerge_minlength}",
|
||||||
|
params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
|
||||||
].join(' ').trim()
|
].join(' ').trim()
|
||||||
ext.prefix = { "${meta.id}_${meta.run_accession}" }
|
ext.prefix = { "${meta.id}_${meta.run_accession}" }
|
||||||
publishDir = [
|
publishDir = [
|
||||||
|
|
|
@ -29,6 +29,7 @@ params {
|
||||||
perform_shortread_complexityfilter = true
|
perform_shortread_complexityfilter = true
|
||||||
perform_shortread_hostremoval = true
|
perform_shortread_hostremoval = true
|
||||||
perform_longread_hostremoval = true
|
perform_longread_hostremoval = true
|
||||||
|
perform_runmerging = true
|
||||||
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
|
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
|
||||||
run_kaiju = true
|
run_kaiju = true
|
||||||
run_kraken2 = true
|
run_kraken2 = true
|
||||||
|
|
46
conf/test_nopreprocessing.config
Normal file
46
conf/test_nopreprocessing.config
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
/*
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
Nextflow config file for running minimal tests
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
Defines input files and everything required to run a fast and simple pipeline test.
|
||||||
|
|
||||||
|
Use as follows:
|
||||||
|
nextflow run nf-core/taxprofiler -profile test_nopreprocessing,<docker/singularity> --outdir <OUTDIR>
|
||||||
|
|
||||||
|
----------------------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
|
||||||
|
params {
|
||||||
|
config_profile_name = 'Test profile'
|
||||||
|
config_profile_description = 'Minimal test dataset skipping all preprocessing to check pipeline function'
|
||||||
|
|
||||||
|
// Limit resources so that this can run on GitHub Actions
|
||||||
|
max_cpus = 2
|
||||||
|
max_memory = '6.GB'
|
||||||
|
max_time = '6.h'
|
||||||
|
|
||||||
|
// Input data
|
||||||
|
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
|
||||||
|
// TODO nf-core: Give any required params for the test so that command line flags are not needed
|
||||||
|
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
|
||||||
|
databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
|
||||||
|
perform_shortread_clipmerge = false
|
||||||
|
perform_longread_clip = false
|
||||||
|
perform_shortread_complexityfilter = false
|
||||||
|
perform_shortread_hostremoval = false
|
||||||
|
perform_longread_hostremoval = false
|
||||||
|
perform_runmerging = false
|
||||||
|
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
|
||||||
|
run_kaiju = true
|
||||||
|
run_kraken2 = true
|
||||||
|
run_malt = true
|
||||||
|
run_metaphlan3 = true
|
||||||
|
run_centrifuge = true
|
||||||
|
run_diamond = true
|
||||||
|
}
|
||||||
|
|
||||||
|
process {
|
||||||
|
withName: MALT_RUN {
|
||||||
|
maxForks = 1
|
||||||
|
}
|
||||||
|
}
|
46
conf/test_noprofiling.config
Normal file
46
conf/test_noprofiling.config
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
/*
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
Nextflow config file for running minimal tests
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
Defines input files and everything required to run a fast and simple pipeline test.
|
||||||
|
|
||||||
|
Use as follows:
|
||||||
|
nextflow run nf-core/taxprofiler -profile test_noprofiling,<docker/singularity> --outdir <OUTDIR>
|
||||||
|
|
||||||
|
----------------------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
|
||||||
|
params {
|
||||||
|
config_profile_name = 'Test profile'
|
||||||
|
config_profile_description = 'Minimal test dataset without performing any profiling to check pipeline function'
|
||||||
|
|
||||||
|
// Limit resources so that this can run on GitHub Actions
|
||||||
|
max_cpus = 2
|
||||||
|
max_memory = '6.GB'
|
||||||
|
max_time = '6.h'
|
||||||
|
|
||||||
|
// Input data
|
||||||
|
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
|
||||||
|
// TODO nf-core: Give any required params for the test so that command line flags are not needed
|
||||||
|
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
|
||||||
|
databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
|
||||||
|
perform_shortread_clipmerge = true
|
||||||
|
perform_longread_clip = true
|
||||||
|
perform_shortread_complexityfilter = true
|
||||||
|
perform_shortread_hostremoval = true
|
||||||
|
perform_longread_hostremoval = true
|
||||||
|
perform_runmerging = true
|
||||||
|
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
|
||||||
|
run_kaiju = false
|
||||||
|
run_kraken2 = false
|
||||||
|
run_malt = false
|
||||||
|
run_metaphlan3 = false
|
||||||
|
run_centrifuge = false
|
||||||
|
run_diamond = false
|
||||||
|
}
|
||||||
|
|
||||||
|
process {
|
||||||
|
withName: MALT_RUN {
|
||||||
|
maxForks = 1
|
||||||
|
}
|
||||||
|
}
|
|
@ -183,11 +183,11 @@ Complexity filtering can be activated via the `--perform_shortread_complexityfil
|
||||||
|
|
||||||
Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources.
|
Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources.
|
||||||
|
|
||||||
There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/).
|
There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter).
|
||||||
|
|
||||||
The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset.
|
The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of the tools (see links above) to decide on optimal methods and parameters for your dataset.
|
||||||
|
|
||||||
You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`.
|
You can optionally save the FASTQ output of the complexity filtering with the `--save_complexityfiltered_reads` flag. If running with `fastp`, complexity filtering happens inclusively within the earlier short-read preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity filtered reads will also be in the `fastp/` folder in the same file(s) as the preprocessed reads.
|
||||||
|
|
||||||
#### Host Removal
|
#### Host Removal
|
||||||
|
|
||||||
|
|
|
@ -74,6 +74,7 @@ params {
|
||||||
shortread_complexityfilter_bbduk_mask = false
|
shortread_complexityfilter_bbduk_mask = false
|
||||||
shortread_complexityfilter_prinseqplusplus_mode = 'entropy'
|
shortread_complexityfilter_prinseqplusplus_mode = 'entropy'
|
||||||
shortread_complexityfilter_prinseqplusplus_dustscore = 0.5
|
shortread_complexityfilter_prinseqplusplus_dustscore = 0.5
|
||||||
|
shortread_complexityfilter_fastp_threshold = 30
|
||||||
save_complexityfiltered_reads = false
|
save_complexityfiltered_reads = false
|
||||||
|
|
||||||
// run merging
|
// run merging
|
||||||
|
@ -185,6 +186,8 @@ profiles {
|
||||||
}
|
}
|
||||||
test { includeConfig 'conf/test.config' }
|
test { includeConfig 'conf/test.config' }
|
||||||
test_full { includeConfig 'conf/test_full.config' }
|
test_full { includeConfig 'conf/test_full.config' }
|
||||||
|
test_noprofiling { includeConfig 'conf/test_noprofiling.config' }
|
||||||
|
// Test profile that skips all preprocessing steps; must point at conf/test_nopreprocessing.config
test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' }
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load igenomes.config if required
|
// Load igenomes.config if required
|
||||||
|
|
|
@ -319,7 +319,8 @@
|
||||||
},
|
},
|
||||||
"shortread_complexityfilter_tool": {
|
"shortread_complexityfilter_tool": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"default": "bbduk"
|
"default": "bbduk",
|
||||||
|
"enum": ["bbduk", "prinseqplusplus", "fastp"]
|
||||||
},
|
},
|
||||||
"shortread_complexityfilter_bbduk_windowsize": {
|
"shortread_complexityfilter_bbduk_windowsize": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
|
@ -404,6 +405,10 @@
|
||||||
"longread_hostremoval_index": {
|
"longread_hostremoval_index": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"default": "None"
|
"default": "None"
|
||||||
|
},
|
||||||
|
"shortread_complexityfilter_fastp_threshold": {
|
||||||
|
"type": "integer",
|
||||||
|
"default": 30
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,6 +13,7 @@ workflow SHORTREAD_COMPLEXITYFILTERING {
|
||||||
ch_versions = Channel.empty()
|
ch_versions = Channel.empty()
|
||||||
ch_multiqc_files = Channel.empty()
|
ch_multiqc_files = Channel.empty()
|
||||||
|
|
||||||
|
// fastp complexity filtering is activated via conf/modules.config in shortread_preprocessing
|
||||||
if ( params.shortread_complexityfilter_tool == 'bbduk' ) {
|
if ( params.shortread_complexityfilter_tool == 'bbduk' ) {
|
||||||
ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads
|
ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads
|
||||||
ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() )
|
ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() )
|
||||||
|
|
|
@ -19,9 +19,12 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
|
||||||
// Check mandatory parameters
|
// Check mandatory parameters
|
||||||
if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
|
if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
|
||||||
if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
|
if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
|
||||||
|
|
||||||
if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files."
|
if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files."
|
||||||
if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs"
|
if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs"
|
||||||
|
|
||||||
|
// fastp does its complexity filtering inside the short-read clip/merge step, so it needs that step
// to be enabled and run with fastp. Only enforce this when complexity filtering is actually requested,
// otherwise a run with filtering disabled would be aborted just because the tool is set to 'fastp'.
if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' && ( !params.perform_shortread_clipmerge || params.shortread_clipmerge_tool != 'fastp' )) exit 1, "ERROR: [nf-core/taxprofiler] cannot use fastp complexity filtering if preprocessing not turned on and/or tool is not fastp. Please specify --perform_shortread_clipmerge and/or --shortread_clipmerge_tool 'fastp'"
|
||||||
|
|
||||||
if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." }
|
if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." }
|
||||||
if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." }
|
if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." }
|
||||||
|
|
||||||
|
@ -131,7 +134,8 @@ workflow TAXPROFILER {
|
||||||
SUBWORKFLOW: COMPLEXITY FILTERING
|
SUBWORKFLOW: COMPLEXITY FILTERING
|
||||||
*/
|
*/
|
||||||
|
|
||||||
if ( params.perform_shortread_complexityfilter ) {
|
// fastp complexity filtering is activated via conf/modules.config in shortread_preprocessing
|
||||||
|
if ( params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp' ) {
|
||||||
ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads
|
ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads
|
||||||
ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions )
|
ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions )
|
||||||
} else {
|
} else {
|
||||||
|
@ -228,7 +232,7 @@ workflow TAXPROFILER {
|
||||||
ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )
|
ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.perform_shortread_complexityfilter){
|
if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp'){
|
||||||
ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) )
|
ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) )
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue