Add FASTP complexity option

2024-11-22 00:46:04 +00:00 · 2022-05-07 06:09:05 +02:00 · 2022-05-07 06:09:05 +02:00 · 47a5ae0cff
commit 47a5ae0cff
parent e7b54801ed
9 changed files with 153 additions and 14 deletions
--- a/conf/modules.config
+++ b/conf/modules.config
@ -54,7 +54,8 @@ process {
            params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "",
            params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
            // filtering options
-            "--length_required ${params.shortread_clipmerge_minlength}"
+            "--length_required ${params.shortread_clipmerge_minlength}",
+           params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
        ].join(' ').trim()
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
@ -74,7 +75,8 @@ process {
            params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
            params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe",
            // filtering options
-            "--length_required ${params.shortread_clipmerge_minlength}"
+            "--length_required ${params.shortread_clipmerge_minlength}",
+           params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
        ].join(' ').trim()
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
--- a/conf/test.config
+++ b/conf/test.config
@ -29,6 +29,7 @@ params {
    perform_shortread_complexityfilter    = true
    perform_shortread_hostremoval         = true
    perform_longread_hostremoval          = true
+    perform_runmerging                    = true
    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
    run_kaiju                             = true
    run_kraken2                           = true
--- a/conf/test_nopreprocessing.config
+++ b/conf/test_nopreprocessing.config
@ -0,0 +1,46 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/taxprofiler -profile test_nopreprocessing,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset skipping all preprocessing to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
+    // TODO nf-core: Give any required params for the test so that command line flags are not needed
+    input                                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
+    databases                             = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
+    perform_shortread_clipmerge           = false
+    perform_longread_clip                 = false
+    perform_shortread_complexityfilter    = false
+    perform_shortread_hostremoval         = false
+    perform_longread_hostremoval          = false
+    perform_runmerging                    = false
+    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
+    run_kaiju                             = true
+    run_kraken2                           = true
+    run_malt                              = true
+    run_metaphlan3                        = true
+    run_centrifuge                        = true
+    run_diamond                           = true
+}
+
+process {
+    withName: MALT_RUN {
+        maxForks = 1
+    }
+}
--- a/conf/test_noprofiling.config
+++ b/conf/test_noprofiling.config
@ -0,0 +1,46 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/taxprofiler -profile test_noprofiling,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset without performing any profiling to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
+    // TODO nf-core: Give any required params for the test so that command line flags are not needed
+    input                                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
+    databases                             = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
+    perform_shortread_clipmerge           = true
+    perform_longread_clip                 = true
+    perform_shortread_complexityfilter    = true
+    perform_shortread_hostremoval         = true
+    perform_longread_hostremoval          = true
+    perform_runmerging                    = true
+    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
+    run_kaiju                             = false
+    run_kraken2                           = false
+    run_malt                              = false
+    run_metaphlan3                        = false
+    run_centrifuge                        = false
+    run_diamond                           = false
+}
+
+process {
+    withName: MALT_RUN {
+        maxForks = 1
+    }
+}
--- a/docs/usage.md
+++ b/docs/usage.md
@ -183,11 +183,11 @@ Complexity filtering can be activated via the `--perform_shortread_complexityfil

 Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources.

-There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/).
+There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter).

 The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset.

-You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`.
+You can optionally save the FASTQ output of the complexity filtering with the `--save_complexityfiltered_reads` flag. If running with `fastp`, complexity filtering happens within the earlier short-read preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity-filtered reads will instead be in the `fastp/` folder, in the same file(s) as the preprocessed reads.

 #### Host Removal

--- a/nextflow.config
+++ b/nextflow.config
@ -74,6 +74,7 @@ params {
    shortread_complexityfilter_bbduk_mask                = false
    shortread_complexityfilter_prinseqplusplus_mode      = 'entropy'
    shortread_complexityfilter_prinseqplusplus_dustscore = 0.5
+    shortread_complexityfilter_fastp_threshold           = 30
    save_complexityfiltered_reads                        = false

    // run merging
@ -185,6 +186,8 @@ profiles {
    }
    test      { includeConfig 'conf/test.config'      }
    test_full { includeConfig 'conf/test_full.config' }
+    test_noprofiling { includeConfig 'conf/test_noprofiling.config' }
+    test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' }
 }

 // Load igenomes.config if required
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -10,7 +10,10 @@
            "type": "object",
            "fa_icon": "fas fa-terminal",
            "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir"],
+            "required": [
+                "input",
+                "outdir"
+            ],
            "properties": {
                "input": {
                    "type": "string",
@ -173,7 +176,14 @@
                    "description": "Method used to save pipeline results to output directory.",
                    "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                    "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
+                        "symlink",
+                        "rellink",
+                        "link",
+                        "copy",
+                        "copyNoFollow",
+                        "move"
+                    ],
                    "hidden": true
                },
                "email_on_fail": {
@ -294,7 +304,10 @@
        "shortread_clipmerge_tool": {
            "type": "string",
            "default": "fastp",
-            "enum": ["fastp", "adapterremoval"]
+            "enum": [
+                "fastp",
+                "adapterremoval"
+            ]
        },
        "shortread_clipmerge_skipadaptertrim": {
            "type": "boolean"
@ -319,7 +332,12 @@
        },
        "shortread_complexityfilter_tool": {
            "type": "string",
-            "default": "bbduk"
+            "default": "bbduk",
+            "enum": [
+                "bbduk",
+                "prinseqplusplus",
+                "fastp"
+            ]
        },
        "shortread_complexityfilter_bbduk_windowsize": {
            "type": "integer",
@ -335,7 +353,10 @@
        "shortread_complexityfilter_prinseqplusplus_mode": {
            "type": "string",
            "default": "entropy",
-            "enum": ["entropy", "dust"]
+            "enum": [
+                "entropy",
+                "dust"
+            ]
        },
        "shortread_complexityfilter_prinseqplusplus_dustscore": {
            "type": "number",
@ -391,7 +412,14 @@
        "kaiju_taxon_name": {
            "type": "string",
            "default": "species",
-            "enum": ["phylum", "class", "order", "family", "genus", "species"]
+            "enum": [
+                "phylum",
+                "class",
+                "order",
+                "family",
+                "genus",
+                "species"
+            ]
        },
        "run_diamond": {
            "type": "boolean"
@ -399,11 +427,19 @@
        "diamond_output_format": {
            "type": "string",
            "default": "tsv",
-            "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"]
+            "enum": [
+                "blast",
+                "xml",
+                "txt",
+                "daa",
+                "sam",
+                "tsv",
+                "paf"
+            ]
        },
        "longread_hostremoval_index": {
            "type": "string",
            "default": "None"
        }
    }
-}
+}
--- a/subworkflows/local/shortread_complexityfiltering.nf
+++ b/subworkflows/local/shortread_complexityfiltering.nf
@ -13,6 +13,7 @@ workflow SHORTREAD_COMPLEXITYFILTERING {
    ch_versions       = Channel.empty()
    ch_multiqc_files  = Channel.empty()

+    // fastp complexity filtering is activated via conf/modules.config in shortread_preprocessing
    if ( params.shortread_complexityfilter_tool == 'bbduk' ) {
        ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads
        ch_versions        =  ch_versions.mix( BBMAP_BBDUK.out.versions.first() )
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -19,9 +19,12 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 // Check mandatory parameters
 if (params.input    ) { ch_input     = file(params.input)     } else { exit 1, 'Input samplesheet not specified!' }
 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
+
 if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files."
 if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs"

+if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' && ( !params.perform_shortread_clipmerge || params.shortread_clipmerge_tool != 'fastp' ))  exit 1, "ERROR: [nf-core/taxprofiler] cannot use fastp complexity filtering if preprocessing not turned on and/or tool is not fastp. Please specify --perform_shortread_clipmerge and/or --shortread_clipmerge_tool 'fastp'"
+
 if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." }
 if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." }

@ -131,7 +134,8 @@ workflow TAXPROFILER {
        SUBWORKFLOW: COMPLEXITY FILTERING
    */

-    if ( params.perform_shortread_complexityfilter ) {
+    // fastp complexity filtering is activated via conf/modules.config in shortread_preprocessing
+    if ( params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp' ) {
        ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads
        ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions )
    } else {
@ -228,7 +232,7 @@ workflow TAXPROFILER {
        ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )
    }

-    if (params.perform_shortread_complexityfilter){
+    if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp'){
        ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) )
    }