From 47a5ae0cff4060c12451beba47502dae5dbb9d17 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Sat, 7 May 2022 06:09:05 +0200
Subject: [PATCH] Add FASTP complexity option

---
 conf/modules.config                           |  6 ++-
 conf/test.config                              |  1 +
 conf/test_nopreprocessing.config              | 46 ++++++++++++++++
 conf/test_noprofiling.config                  | 46 ++++++++++++++++
 docs/usage.md                                 |  4 +-
 nextflow.config                               |  3 ++
 nextflow_schema.json                          | 52 ++++++++++++++++---
 .../local/shortread_complexityfiltering.nf    |  1 +
 workflows/taxprofiler.nf                      |  8 ++-
 9 files changed, 153 insertions(+), 14 deletions(-)
 create mode 100644 conf/test_nopreprocessing.config
 create mode 100644 conf/test_noprofiling.config

diff --git a/conf/modules.config b/conf/modules.config
index cd0fb04..c834f4e 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -54,7 +54,8 @@ process {
             params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "",
             params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
             // filtering options
-            "--length_required ${params.shortread_clipmerge_minlength}"
+            "--length_required ${params.shortread_clipmerge_minlength}",
+            params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
         ].join(' ').trim()
         ext.prefix = { "${meta.id}_${meta.run_accession}" }
         publishDir = [
@@ -74,7 +75,8 @@ process {
             params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
             params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe",
             // filtering options
-            "--length_required ${params.shortread_clipmerge_minlength}"
+            "--length_required ${params.shortread_clipmerge_minlength}",
+            params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
         ].join(' ').trim()
         ext.prefix = { "${meta.id}_${meta.run_accession}" }
         publishDir = [
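
When a user enables both options, the ternary above appends fastp's complexity flags to the filtering options. A sketch of the resulting extra arguments, assuming the pipeline defaults (minimum length 15, threshold 30):

    # Effective extra fastp filtering arguments when the new option is active (sketch)
    --length_required 15 --low_complexity_filter --complexity_threshold 30
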
diff --git a/conf/test.config b/conf/test.config
index a5244f9..c687a86 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -29,6 +29,7 @@ params {
     perform_shortread_complexityfilter = true
     perform_shortread_hostremoval      = true
     perform_longread_hostremoval       = true
+    perform_runmerging                 = true
     hostremoval_reference              = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
     run_kaiju                          = true
     run_kraken2                        = true
diff --git a/conf/test_nopreprocessing.config b/conf/test_nopreprocessing.config
new file mode 100644
index 0000000..e8d4ed9
--- /dev/null
+++ b/conf/test_nopreprocessing.config
@@ -0,0 +1,46 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/taxprofiler -profile test_nopreprocessing,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset skipping all preprocessing to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
+    // TODO nf-core: Give any required params for the test so that command line flags are not needed
+    input                              = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
+    databases                          = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
+    perform_shortread_clipmerge        = false
+    perform_longread_clip              = false
+    perform_shortread_complexityfilter = false
+    perform_shortread_hostremoval      = false
+    perform_longread_hostremoval       = false
+    perform_runmerging                 = false
+    hostremoval_reference              = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
+    run_kaiju                          = true
+    run_kraken2                        = true
+    run_malt                           = true
+    run_metaphlan3                     = true
+    run_centrifuge                     = true
+    run_diamond                        = true
+}
+
+process {
+    withName: MALT_RUN {
+        maxForks = 1
+    }
+}
diff --git a/conf/test_noprofiling.config b/conf/test_noprofiling.config
new file mode 100644
index 0000000..f908651
--- /dev/null
+++ b/conf/test_noprofiling.config
@@ -0,0 +1,46 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/taxprofiler -profile test_noprofiling,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset without performing any profiling to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
+    // TODO nf-core: Give any required params for the test so that command line flags are not needed
+    input                              = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
+    databases                          = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
+    perform_shortread_clipmerge        = true
+    perform_longread_clip              = true
+    perform_shortread_complexityfilter = true
+    perform_shortread_hostremoval      = true
+    perform_longread_hostremoval       = true
+    perform_runmerging                 = true
+    hostremoval_reference              = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
+    run_kaiju                          = false
+    run_kraken2                        = false
+    run_malt                           = false
+    run_metaphlan3                     = false
+    run_centrifuge                     = false
+    run_diamond                        = false
+}
+
+process {
+    withName: MALT_RUN {
+        maxForks = 1
+    }
+}
diff --git a/docs/usage.md b/docs/usage.md
index 4aa1d09..47ac952 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -183,11 +183,11 @@ Complexity filtering can be activated via the `--perform_shortread_complexityfil
 
 Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources.
 
-There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/).
+There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter).
 
-The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset.
+The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of each tool (see links above) to decide on optimal methods and parameters for your dataset.
 
-You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`.
+You can optionally save the FASTQ output of the complexity filtering with the `--save_complexityfiltered_reads` flag. If running with `fastp`, complexity filtering happens within the earlier short-read preprocessing step, so there is no independent pipeline step for complexity filtering and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity-filtered reads will also be in the `fastp/` folder in the same file(s) as the preprocessed reads.
 
 #### Host Removal
 
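A usage sketch for the new option; the samplesheet/database paths are placeholders, and fastp must be selected as both the preprocessing and the complexity-filtering tool (see the parameter check added to workflows/taxprofiler.nf below):

    # Sketch only: enables fastp complexity filtering at the default threshold
    nextflow run nf-core/taxprofiler \
        -profile docker \
        --input samplesheet.csv \
        --databases database.csv \
        --outdir <OUTDIR> \
        --perform_shortread_clipmerge \
        --shortread_clipmerge_tool fastp \
        --perform_shortread_complexityfilter \
        --shortread_complexityfilter_tool fastp \
        --shortread_complexityfilter_fastp_threshold 30
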
diff --git a/nextflow.config b/nextflow.config
index ca9e280..411e7a6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -74,6 +74,7 @@ params {
     shortread_complexityfilter_bbduk_mask                = false
    shortread_complexityfilter_prinseqplusplus_mode      = 'entropy'
     shortread_complexityfilter_prinseqplusplus_dustscore = 0.5
+    shortread_complexityfilter_fastp_threshold           = 30
     save_complexityfiltered_reads                        = false
 
     // run merging
@@ -185,6 +186,8 @@ profiles {
     }
     test      { includeConfig 'conf/test.config'      }
     test_full { includeConfig 'conf/test_full.config' }
+    test_noprofiling     { includeConfig 'conf/test_noprofiling.config' }
+    test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' }
 }
 
 // Load igenomes.config if required
diff --git a/nextflow_schema.json b/nextflow_schema.json
index ab2108e..a0a830c 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,7 +10,10 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir"],
+            "required": [
+                "input",
+                "outdir"
+            ],
             "properties": {
                 "input": {
                     "type": "string",
@@ -173,7 +176,14 @@
                     "description": "Method used to save pipeline results to output directory.",
                     "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                     "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
+                        "symlink",
+                        "rellink",
+                        "link",
+                        "copy",
+                        "copyNoFollow",
+                        "move"
+                    ],
                     "hidden": true
                 },
                 "email_on_fail": {
@@ -294,7 +304,10 @@
         "shortread_clipmerge_tool": {
             "type": "string",
             "default": "fastp",
-            "enum": ["fastp", "adapterremoval"]
+            "enum": [
+                "fastp",
+                "adapterremoval"
+            ]
         },
         "shortread_clipmerge_skipadaptertrim": {
             "type": "boolean"
@@ -319,7 +332,12 @@
         },
         "shortread_complexityfilter_tool": {
             "type": "string",
-            "default": "bbduk"
+            "default": "bbduk",
+            "enum": [
+                "bbduk",
+                "prinseqplusplus",
+                "fastp"
+            ]
         },
         "shortread_complexityfilter_bbduk_windowsize": {
             "type": "integer",
@@ -335,7 +353,10 @@
         "shortread_complexityfilter_prinseqplusplus_mode": {
             "type": "string",
             "default": "entropy",
-            "enum": ["entropy", "dust"]
+            "enum": [
+                "entropy",
+                "dust"
+            ]
         },
         "shortread_complexityfilter_prinseqplusplus_dustscore": {
             "type": "number",
@@ -391,7 +412,14 @@
         "kaiju_taxon_name": {
             "type": "string",
             "default": "species",
-            "enum": ["phylum", "class", "order", "family", "genus", "species"]
+            "enum": [
+                "phylum",
+                "class",
+                "order",
+                "family",
+                "genus",
+                "species"
+            ]
         },
         "run_diamond": {
             "type": "boolean"
@@ -399,11 +427,19 @@
         },
         "diamond_output_format": {
             "type": "string",
             "default": "tsv",
-            "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"]
+            "enum": [
+                "blast",
+                "xml",
+                "txt",
+                "daa",
+                "sam",
+                "tsv",
+                "paf"
+            ]
         },
         "longread_hostremoval_index": {
             "type": "string",
             "default": "None"
         }
     }
-}
+}
\ No newline at end of file
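
The two test profiles registered above can be exercised directly; a sketch, assuming a Docker environment and a placeholder <OUTDIR>:

    # Skips all preprocessing but runs every profiler
    nextflow run nf-core/taxprofiler -profile test_nopreprocessing,docker --outdir <OUTDIR>

    # Runs all preprocessing but skips every profiler
    nextflow run nf-core/taxprofiler -profile test_noprofiling,docker --outdir <OUTDIR>
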
diff --git a/subworkflows/local/shortread_complexityfiltering.nf b/subworkflows/local/shortread_complexityfiltering.nf
index 12686d7..a34440d 100644
--- a/subworkflows/local/shortread_complexityfiltering.nf
+++ b/subworkflows/local/shortread_complexityfiltering.nf
@@ -13,6 +13,7 @@ workflow SHORTREAD_COMPLEXITYFILTERING {
     ch_versions = Channel.empty()
     ch_multiqc_files = Channel.empty()
 
+    // fastp complexity filtering is activated via modules.config in shortread_preprocessing
     if ( params.shortread_complexityfilter_tool == 'bbduk' ) {
         ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads
         ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() )
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 7a6cd09..b8b953b 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -19,9 +19,12 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 
 // Check mandatory parameters
 if (params.input    ) { ch_input     = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
+
 if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files."
 if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs"
+if (params.shortread_complexityfilter_tool == 'fastp' && ( params.perform_shortread_clipmerge == false || params.shortread_clipmerge_tool != 'fastp' )) exit 1, "ERROR: [nf-core/taxprofiler] cannot use fastp complexity filtering if preprocessing is not turned on or the preprocessing tool is not fastp. Please specify --perform_shortread_clipmerge and/or --shortread_clipmerge_tool 'fastp'"
+
 if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." }
 if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." }
 
@@ -131,7 +134,8 @@ workflow TAXPROFILER {
         SUBWORKFLOW: COMPLEXITY FILTERING
     */
 
-    if ( params.perform_shortread_complexityfilter ) {
+    // fastp complexity filtering is activated via modules.config in shortread_preprocessing
+    if ( params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp' ) {
         ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads
         ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions )
     } else {
@@ -228,7 +232,7 @@ workflow TAXPROFILER {
             ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )
         }
 
-        if (params.perform_shortread_complexityfilter){
+        if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp'){
             ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) )
         }
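
Conversely, a sketch of a combination the new parameter check rejects (same placeholder inputs as above): requesting fastp complexity filtering while fastp preprocessing is disabled exits with the error added in workflows/taxprofiler.nf.

    # Fails the new check: --perform_shortread_clipmerge is off, so fastp never runs
    nextflow run nf-core/taxprofiler \
        -profile docker \
        --input samplesheet.csv \
        --databases database.csv \
        --outdir <OUTDIR> \
        --perform_shortread_complexityfilter \
        --shortread_complexityfilter_tool fastp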