From 47a5ae0cff4060c12451beba47502dae5dbb9d17 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 7 May 2022 06:09:05 +0200 Subject: [PATCH 1/4] Add FASTP complexity option --- conf/modules.config | 6 ++- conf/test.config | 1 + conf/test_nopreprocessing.config | 46 ++++++++++++++++ conf/test_noprofiling.config | 46 ++++++++++++++++ docs/usage.md | 4 +- nextflow.config | 3 ++ nextflow_schema.json | 52 ++++++++++++++++--- .../local/shortread_complexityfiltering.nf | 1 + workflows/taxprofiler.nf | 8 ++- 9 files changed, 153 insertions(+), 14 deletions(-) create mode 100644 conf/test_nopreprocessing.config create mode 100644 conf/test_noprofiling.config diff --git a/conf/modules.config b/conf/modules.config index cd0fb04..c834f4e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -54,7 +54,8 @@ process { params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "", params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", // filtering options - "--length_required ${params.shortread_clipmerge_minlength}" + "--length_required ${params.shortread_clipmerge_minlength}", + params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ @@ -74,7 +75,8 @@ process { params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe", // filtering options - "--length_required ${params.shortread_clipmerge_minlength}" + "--length_required ${params.shortread_clipmerge_minlength}", + params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ diff --git a/conf/test.config b/conf/test.config index a5244f9..c687a86 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,6 +29,7 @@ params { perform_shortread_complexityfilter = true perform_shortread_hostremoval = true perform_longread_hostremoval = true + perform_runmerging = true hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' run_kaiju = true run_kraken2 = true diff --git a/conf/test_nopreprocessing.config b/conf/test_nopreprocessing.config new file mode 100644 index 0000000..e8d4ed9 --- /dev/null +++ b/conf/test_nopreprocessing.config @@ -0,0 +1,46 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset skipping all preprocessing to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + perform_shortread_clipmerge = false + perform_longread_clip = false + perform_shortread_complexityfilter = false + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_malt = true + run_metaphlan3 = true + run_centrifuge = true + run_diamond = true +} + +process { + withName: MALT_RUN { + maxForks = 1 + } +} diff --git a/conf/test_noprofiling.config b/conf/test_noprofiling.config new file mode 100644 index 0000000..f908651 --- /dev/null +++ b/conf/test_noprofiling.config @@ -0,0 +1,46 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset without performing any profiling to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + perform_shortread_clipmerge = true + perform_longread_clip = true + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_malt = false + run_metaphlan3 = false + run_centrifuge = false + run_diamond = false +} + +process { + withName: MALT_RUN { + maxForks = 1 + } +} diff --git a/docs/usage.md b/docs/usage.md index 4aa1d09..47ac952 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -183,11 +183,11 @@ Complexity filtering can be activated via the `--perform_shortread_complexityfil Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources. -There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/). +There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter). The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset. -You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`. +You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`. If running with `fastp`, complexity filtering happens inclusively within the earlier shortread preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity filtered reads will also be in the `fastp/` folder in the same file(s) as the preprocessed read. #### Host Removal diff --git a/nextflow.config b/nextflow.config index ca9e280..411e7a6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -74,6 +74,7 @@ params { shortread_complexityfilter_bbduk_mask = false shortread_complexityfilter_prinseqplusplus_mode = 'entropy' shortread_complexityfilter_prinseqplusplus_dustscore = 0.5 + shortread_complexityfilter_fastp_threshold = 30 save_complexityfiltered_reads = false // run merging @@ -185,6 +186,8 @@ profiles { } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } + test_noprofiling { includeConfig 'conf/test_noprofiling.config' } + test_nopreprocessing { includeConfig 'conf/test_preprocessing.config' } } // Load igenomes.config if required diff --git a/nextflow_schema.json b/nextflow_schema.json index ab2108e..a0a830c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -294,7 +304,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -319,7 +332,12 @@ }, "shortread_complexityfilter_tool": { "type": "string", - "default": "bbduk" + "default": "bbduk", + "enum": [ + "bbduk", + "prinseqplusplus", + "fastp" + ] }, "shortread_complexityfilter_bbduk_windowsize": { "type": "integer", @@ -335,7 +353,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -391,7 +412,14 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": ["phylum", "class", "order", "family", "genus", "species"] + "enum": [ + "phylum", + "class", + "order", + "family", + "genus", + "species" + ] }, "run_diamond": { "type": "boolean" @@ -399,11 +427,19 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] + "enum": [ + "blast", + "xml", + "txt", + "daa", + "sam", + "tsv", + "paf" + ] }, "longread_hostremoval_index": { "type": "string", "default": "None" } } -} +} \ No newline at end of file diff --git a/subworkflows/local/shortread_complexityfiltering.nf b/subworkflows/local/shortread_complexityfiltering.nf index 12686d7..a34440d 100644 --- a/subworkflows/local/shortread_complexityfiltering.nf +++ b/subworkflows/local/shortread_complexityfiltering.nf @@ -13,6 +13,7 @@ workflow SHORTREAD_COMPLEXITYFILTERING { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() + // fastp complexity filtering is activated via modules.conf in shortread_preprocessing if ( params.shortread_complexityfilter_tool == 'bbduk' ) { ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() ) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 7a6cd09..b8b953b 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -19,9 +19,12 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } + if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" +if (params.shortread_complexityfilter_tool == 'fastp' && ( params.perform_shortread_clipmerge == false || params.shortread_clipmerge_tool != 'fastp' )) exit 1, "ERROR: [nf-core/taxprofiler] cannot use fastp complexity filtering if preprocessing not turned on and/or tool is not fastp. Please specify --perform_shortread_clipmerge and/or --shortread_clipmerge_tool 'fastp'" + if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." } if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." } @@ -131,7 +134,8 @@ workflow TAXPROFILER { SUBWORKFLOW: COMPLEXITY FILTERING */ - if ( params.perform_shortread_complexityfilter ) { + // fastp complexity filtering is activated via modules.conf in shortread_preprocessing + if ( params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp' ) { ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } else { @@ -228,7 +232,7 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) } - if (params.perform_shortread_complexityfilter){ + if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp'){ ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) } From 0f651873dd711c2223a8f844a091deb1316f6979 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 7 May 2022 06:12:24 +0200 Subject: [PATCH 2/4] Linting --- conf/modules.config | 4 ++-- nextflow_schema.json | 55 ++++++++++---------------------------------- 2 files changed, 14 insertions(+), 45 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index c834f4e..0abff92 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -55,7 +55,7 @@ process { params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", // filtering options "--length_required ${params.shortread_clipmerge_minlength}", - params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' + params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ @@ -76,7 +76,7 @@ process { params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe", // filtering options "--length_required ${params.shortread_clipmerge_minlength}", - params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' + params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ diff --git a/nextflow_schema.json b/nextflow_schema.json index a0a830c..74fab27 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -304,10 +294,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -333,11 +320,7 @@ "shortread_complexityfilter_tool": { "type": "string", "default": "bbduk", - "enum": [ - "bbduk", - "prinseqplusplus", - "fastp" - ] + "enum": ["bbduk", "prinseqplusplus", "fastp"] }, "shortread_complexityfilter_bbduk_windowsize": { "type": "integer", @@ -353,10 +336,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -412,14 +392,7 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": [ - "phylum", - "class", - "order", - "family", - "genus", - "species" - ] + "enum": ["phylum", "class", "order", "family", "genus", "species"] }, "run_diamond": { "type": "boolean" @@ -427,19 +400,15 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": [ - "blast", - "xml", - "txt", - "daa", - "sam", - "tsv", - "paf" - ] + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] }, "longread_hostremoval_index": { "type": "string", "default": "None" + }, + "shortread_complexityfilter_fastp_threshold": { + "type": "integer", + "default": 30 } } -} \ No newline at end of file +} From 3fa2181f498f5e1c994efb06be3a63f459dcf9cc Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 7 May 2022 06:15:15 +0200 Subject: [PATCH 3/4] Fix prinseq tool selection --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a1ece72..7bb2076 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,7 +38,7 @@ jobs: - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged" - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs" - "--shortread_complexityfilter_tool bbduk" - - "--shortread_complexityfilter_tool prinseq" + - "--shortread_complexityfilter_tool prinseqplusplus" - "--perform_runmerging" - "--perform_runmerging --shortread_clipmerge_mergepairs" - "--shortread_complexityfilter false --perform_shortread_hostremoval" From d67543503b927f6af5bce9574ad8b428c53c0c46 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Sat, 7 May 2022 13:15:21 +0200 Subject: [PATCH 4/4] Apply suggestions from code review Co-authored-by: Moritz E. Beber --- conf/modules.config | 2 +- docs/usage.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 0abff92..5d8398e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -55,7 +55,7 @@ process { params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", // filtering options "--length_required ${params.shortread_clipmerge_minlength}", - params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' + (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp') ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ diff --git a/docs/usage.md b/docs/usage.md index 47ac952..54ffce0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -185,7 +185,7 @@ Complexity filtering is primarily a run-time optimisation step. It is not necess There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter). -The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset. +The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of the tools (see links above) to decide on optimal methods and parameters for your dataset. You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`. If running with `fastp`, complexity filtering happens inclusively within the earlier shortread preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity filtered reads will also be in the `fastp/` folder in the same file(s) as the preprocessed read.