From 47a5ae0cff4060c12451beba47502dae5dbb9d17 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Sat, 7 May 2022 06:09:05 +0200
Subject: [PATCH] Add FASTP complexity option

---
 conf/modules.config                           |  6 ++-
 conf/test.config                              |  1 +
 conf/test_nopreprocessing.config              | 46 ++++++++++++++++
 conf/test_noprofiling.config                  | 46 ++++++++++++++++
 docs/usage.md                                 |  4 +-
 nextflow.config                               |  3 ++
 nextflow_schema.json                          | 52 ++++++++++++++++---
 .../local/shortread_complexityfiltering.nf    |  1 +
 workflows/taxprofiler.nf                      |  8 ++-
 9 files changed, 153 insertions(+), 14 deletions(-)
 create mode 100644 conf/test_nopreprocessing.config
 create mode 100644 conf/test_noprofiling.config

diff --git a/conf/modules.config b/conf/modules.config
index cd0fb04..c834f4e 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -54,7 +54,8 @@ process {
             params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "",
             params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
             // filtering options
-            "--length_required ${params.shortread_clipmerge_minlength}"
+            "--length_required ${params.shortread_clipmerge_minlength}",
+            params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
         ].join(' ').trim()
         ext.prefix = { "${meta.id}_${meta.run_accession}" }
         publishDir = [
@@ -74,7 +75,8 @@ process {
             params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
             params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe",
             // filtering options
-            "--length_required ${params.shortread_clipmerge_minlength}"
+            "--length_required ${params.shortread_clipmerge_minlength}",
+            params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
         ].join(' ').trim()
         ext.prefix = { "${meta.id}_${meta.run_accession}" }
         publishDir = [
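
When a user enables both options, the ternary above appends fastp's complexity flags to the filtering options. A sketch of the resulting extra arguments, assuming the pipeline defaults (minimum length 15, threshold 30):

    # Effective extra fastp filtering arguments when the new option is active (sketch)
    --length_required 15 --low_complexity_filter --complexity_threshold 30
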
diff --git a/conf/test.config b/conf/test.config
index a5244f9..c687a86 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -29,6 +29,7 @@ params {
     perform_shortread_complexityfilter = true
     perform_shortread_hostremoval      = true
     perform_longread_hostremoval       = true
+    perform_runmerging                 = true
     hostremoval_reference              = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
     run_kaiju                          = true
     run_kraken2                        = true
diff --git a/conf/test_nopreprocessing.config b/conf/test_nopreprocessing.config
new file mode 100644
index 0000000..e8d4ed9
--- /dev/null
+++ b/conf/test_nopreprocessing.config
@@ -0,0 +1,46 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/taxprofiler -profile test_nopreprocessing,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset skipping all preprocessing to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
+    // TODO nf-core: Give any required params for the test so that command line flags are not needed
+    input                              = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
+    databases                          = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
+    perform_shortread_clipmerge        = false
+    perform_longread_clip              = false
+    perform_shortread_complexityfilter = false
+    perform_shortread_hostremoval      = false
+    perform_longread_hostremoval       = false
+    perform_runmerging                 = false
+    hostremoval_reference              = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
+    run_kaiju                          = true
+    run_kraken2                        = true
+    run_malt                           = true
+    run_metaphlan3                     = true
+    run_centrifuge                     = true
+    run_diamond                        = true
+}
+
+process {
+    withName: MALT_RUN {
+        maxForks = 1
+    }
+}
diff --git a/conf/test_noprofiling.config b/conf/test_noprofiling.config
new file mode 100644
index 0000000..f908651
--- /dev/null
+++ b/conf/test_noprofiling.config
@@ -0,0 +1,46 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/taxprofiler -profile test_noprofiling,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset without performing any profiling to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
+    // TODO nf-core: Give any required params for the test so that command line flags are not needed
+    input                              = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
+    databases                          = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
+    perform_shortread_clipmerge        = true
+    perform_longread_clip              = true
+    perform_shortread_complexityfilter = true
+    perform_shortread_hostremoval      = true
+    perform_longread_hostremoval       = true
+    perform_runmerging                 = true
+    hostremoval_reference              = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
+    run_kaiju                          = false
+    run_kraken2                        = false
+    run_malt                           = false
+    run_metaphlan3                     = false
+    run_centrifuge                     = false
+    run_diamond                        = false
+}
+
+process {
+    withName: MALT_RUN {
+        maxForks = 1
+    }
+}
diff --git a/docs/usage.md b/docs/usage.md
index 4aa1d09..47ac952 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -183,11 +183,11 @@ Complexity filtering can be activated via the `--perform_shortread_complexityfil
 
 Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources.
 
-There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/).
+There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter).
 
-The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset.
+The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of each tool (see links above) to decide on optimal methods and parameters for your dataset.
 
-You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`.
+You can optionally save the FASTQ output of the complexity filtering with the `--save_complexityfiltered_reads` flag. If running with `fastp`, complexity filtering happens within the earlier short-read preprocessing step, so there is no independent pipeline step for complexity filtering and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity-filtered reads will also be in the `fastp/` folder in the same file(s) as the preprocessed reads.
 
 #### Host Removal
 
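A usage sketch for the new option; the samplesheet/database paths are placeholders, and fastp must be selected as both the preprocessing and the complexity-filtering tool (see the parameter check added to workflows/taxprofiler.nf below):

    # Sketch only: enables fastp complexity filtering at the default threshold
    nextflow run nf-core/taxprofiler \
        -profile docker \
        --input samplesheet.csv \
        --databases database.csv \
        --outdir <OUTDIR> \
        --perform_shortread_clipmerge \
        --shortread_clipmerge_tool fastp \
        --perform_shortread_complexityfilter \
        --shortread_complexityfilter_tool fastp \
        --shortread_complexityfilter_fastp_threshold 30
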
diff --git a/nextflow.config b/nextflow.config
index ca9e280..411e7a6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -74,6 +74,7 @@ params {
     shortread_complexityfilter_bbduk_mask                = false
    shortread_complexityfilter_prinseqplusplus_mode      = 'entropy'
     shortread_complexityfilter_prinseqplusplus_dustscore = 0.5
+    shortread_complexityfilter_fastp_threshold           = 30
     save_complexityfiltered_reads                        = false
 
     // run merging
@@ -185,6 +186,8 @@ profiles {
     }
     test      { includeConfig 'conf/test.config'      }
     test_full { includeConfig 'conf/test_full.config' }
+    test_noprofiling     { includeConfig 'conf/test_noprofiling.config' }
+    test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' }
 }
 
 // Load igenomes.config if required
diff --git a/nextflow_schema.json b/nextflow_schema.json
index ab2108e..a0a830c 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,7 +10,10 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir"],
+            "required": [
+                "input",
+                "outdir"
+            ],
             "properties": {
                 "input": {
                     "type": "string",
@@ -173,7 +176,14 @@
                     "description": "Method used to save pipeline results to output directory.",
                     "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                     "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
+                        "symlink",
+                        "rellink",
+                        "link",
+                        "copy",
+                        "copyNoFollow",
+                        "move"
+                    ],
                     "hidden": true
                 },
                 "email_on_fail": {
@@ -294,7 +304,10 @@
         "shortread_clipmerge_tool": {
             "type": "string",
             "default": "fastp",
-            "enum": ["fastp", "adapterremoval"]
+            "enum": [
+                "fastp",
+                "adapterremoval"
+            ]
         },
         "shortread_clipmerge_skipadaptertrim": {
             "type": "boolean"
@@ -319,7 +332,12 @@
         },
         "shortread_complexityfilter_tool": {
             "type": "string",
-            "default": "bbduk"
+            "default": "bbduk",
+            "enum": [
+                "bbduk",
+                "prinseqplusplus",
+                "fastp"
+            ]
         },
         "shortread_complexityfilter_bbduk_windowsize": {
             "type": "integer",
@@ -335,7 +353,10 @@
         "shortread_complexityfilter_prinseqplusplus_mode": {
             "type": "string",
             "default": "entropy",
-            "enum": ["entropy", "dust"]
+            "enum": [
+                "entropy",
+                "dust"
+            ]
         },
         "shortread_complexityfilter_prinseqplusplus_dustscore": {
             "type": "number",
@@ -391,7 +412,14 @@
         "kaiju_taxon_name": {
             "type": "string",
             "default": "species",
-            "enum": ["phylum", "class", "order", "family", "genus", "species"]
+            "enum": [
+                "phylum",
+                "class",
+                "order",
+                "family",
+                "genus",
+                "species"
+            ]
         },
         "run_diamond": {
             "type": "boolean"
@@ -399,11 +427,19 @@
         },
         "diamond_output_format": {
             "type": "string",
             "default": "tsv",
-            "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"]
+            "enum": [
+                "blast",
+                "xml",
+                "txt",
+                "daa",
+                "sam",
+                "tsv",
+                "paf"
+            ]
         },
         "longread_hostremoval_index": {
             "type": "string",
             "default": "None"
         }
     }
-}
+}
\ No newline at end of file
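
The two test profiles registered above can be exercised directly; a sketch, assuming a Docker environment and a placeholder <OUTDIR>:

    # Skips all preprocessing but runs every profiler
    nextflow run nf-core/taxprofiler -profile test_nopreprocessing,docker --outdir <OUTDIR>

    # Runs all preprocessing but skips every profiler
    nextflow run nf-core/taxprofiler -profile test_noprofiling,docker --outdir <OUTDIR>
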
diff --git a/subworkflows/local/shortread_complexityfiltering.nf b/subworkflows/local/shortread_complexityfiltering.nf
index 12686d7..a34440d 100644
--- a/subworkflows/local/shortread_complexityfiltering.nf
+++ b/subworkflows/local/shortread_complexityfiltering.nf
@@ -13,6 +13,7 @@ workflow SHORTREAD_COMPLEXITYFILTERING {
     ch_versions = Channel.empty()
     ch_multiqc_files = Channel.empty()
 
+    // fastp complexity filtering is activated via modules.config in shortread_preprocessing
     if ( params.shortread_complexityfilter_tool == 'bbduk' ) {
         ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads
         ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() )
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 7a6cd09..b8b953b 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -19,9 +19,12 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 
 // Check mandatory parameters
 if (params.input    ) { ch_input     = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
+
 if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files."
 if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs"
+if (params.shortread_complexityfilter_tool == 'fastp' && ( params.perform_shortread_clipmerge == false || params.shortread_clipmerge_tool != 'fastp' )) exit 1, "ERROR: [nf-core/taxprofiler] cannot use fastp complexity filtering if preprocessing is not turned on or the preprocessing tool is not fastp. Please specify --perform_shortread_clipmerge and/or --shortread_clipmerge_tool 'fastp'"
+
 if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." }
 if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." }
 
@@ -131,7 +134,8 @@ workflow TAXPROFILER {
         SUBWORKFLOW: COMPLEXITY FILTERING
     */
 
-    if ( params.perform_shortread_complexityfilter ) {
+    // fastp complexity filtering is activated via modules.config in shortread_preprocessing
+    if ( params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp' ) {
         ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads
         ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions )
     } else {
@@ -228,7 +232,7 @@ workflow TAXPROFILER {
             ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )
         }
 
-        if (params.perform_shortread_complexityfilter){
+        if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp'){
             ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) )
         }
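
Conversely, a sketch of a combination the new parameter check rejects (same placeholder inputs as above): requesting fastp complexity filtering while fastp preprocessing is disabled exits with the error added in workflows/taxprofiler.nf.

    # Fails the new check: --perform_shortread_clipmerge is off, so fastp never runs
    nextflow run nf-core/taxprofiler \
        -profile docker \
        --input samplesheet.csv \
        --databases database.csv \
        --outdir <OUTDIR> \
        --perform_shortread_complexityfilter \
        --shortread_complexityfilter_tool fastp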