Merge pull request #79 from nf-core/fastp-complexity

Add FASTP complexity option
2024-11-22 07:59:54 +00:00 · 2022-05-07 14:05:32 +02:00 · 2022-05-07 14:05:32 +02:00 · fd71b71929
commit fd71b71929
parent e7b54801ed d67543503b
10 changed files with 117 additions and 9 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -38,7 +38,7 @@ jobs:
          - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged"
          - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs"
          - "--shortread_complexityfilter_tool bbduk"
-          - "--shortread_complexityfilter_tool prinseq"
+          - "--shortread_complexityfilter_tool prinseqplusplus"
          - "--perform_runmerging"
          - "--perform_runmerging --shortread_clipmerge_mergepairs"
          - "--shortread_complexityfilter false --perform_shortread_hostremoval"
--- a/conf/modules.config
+++ b/conf/modules.config
@ -54,7 +54,8 @@ process {
            params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "",
            params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
            // filtering options
-            "--length_required ${params.shortread_clipmerge_minlength}"
+            "--length_required ${params.shortread_clipmerge_minlength}",
+            (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp') ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
        ].join(' ').trim()
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
@ -74,7 +75,8 @@ process {
            params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
            params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe",
            // filtering options
-            "--length_required ${params.shortread_clipmerge_minlength}"
+            "--length_required ${params.shortread_clipmerge_minlength}",
+            params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
        ].join(' ').trim()
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
--- a/conf/test.config
+++ b/conf/test.config
@ -29,6 +29,7 @@ params {
    perform_shortread_complexityfilter    = true
    perform_shortread_hostremoval         = true
    perform_longread_hostremoval          = true
+    perform_runmerging                    = true
    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
    run_kaiju                             = true
    run_kraken2                           = true
--- a/conf/test_nopreprocessing.config
+++ b/conf/test_nopreprocessing.config
@ -0,0 +1,46 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/taxprofiler -profile test_nopreprocessing,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset skipping all preprocessing to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
+    // TODO nf-core: Give any required params for the test so that command line flags are not needed
+    input                                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
+    databases                             = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
+    perform_shortread_clipmerge           = false
+    perform_longread_clip                 = false
+    perform_shortread_complexityfilter    = false
+    perform_shortread_hostremoval         = false
+    perform_longread_hostremoval          = false
+    perform_runmerging                    = false
+    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
+    run_kaiju                             = true
+    run_kraken2                           = true
+    run_malt                              = true
+    run_metaphlan3                        = true
+    run_centrifuge                        = true
+    run_diamond                           = true
+}
+
+process {
+    withName: MALT_RUN {
+        maxForks = 1
+    }
+}
--- a/conf/test_noprofiling.config
+++ b/conf/test_noprofiling.config
@ -0,0 +1,46 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/taxprofiler -profile test_noprofiling,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset without performing any profiling to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
+    // TODO nf-core: Give any required params for the test so that command line flags are not needed
+    input                                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
+    databases                             = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
+    perform_shortread_clipmerge           = true
+    perform_longread_clip                 = true
+    perform_shortread_complexityfilter    = true
+    perform_shortread_hostremoval         = true
+    perform_longread_hostremoval          = true
+    perform_runmerging                    = true
+    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
+    run_kaiju                             = false
+    run_kraken2                           = false
+    run_malt                              = false
+    run_metaphlan3                        = false
+    run_centrifuge                        = false
+    run_diamond                           = false
+}
+
+process {
+    withName: MALT_RUN {
+        maxForks = 1
+    }
+}
--- a/docs/usage.md
+++ b/docs/usage.md
@ -183,11 +183,11 @@ Complexity filtering can be activated via the `--perform_shortread_complexityfil

 Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources.

-There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/).
+There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter).

-The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset.
+The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of the tools (see links above) to decide on optimal methods and parameters for your dataset.

-You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`.
+You can optionally save the FASTQ output of the complexity filtering with the `--save_complexityfiltered_reads` flag. If running with `fastp`, complexity filtering happens within the earlier short-read preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity filtered reads will instead be in the `fastp/` folder in the same file(s) as the preprocessed reads.

 #### Host Removal

--- a/nextflow.config
+++ b/nextflow.config
@ -74,6 +74,7 @@ params {
    shortread_complexityfilter_bbduk_mask                = false
    shortread_complexityfilter_prinseqplusplus_mode      = 'entropy'
    shortread_complexityfilter_prinseqplusplus_dustscore = 0.5
+    shortread_complexityfilter_fastp_threshold           = 30
    save_complexityfiltered_reads                        = false

    // run merging
@ -185,6 +186,8 @@ profiles {
    }
    test      { includeConfig 'conf/test.config'      }
    test_full { includeConfig 'conf/test_full.config' }
+    test_noprofiling { includeConfig 'conf/test_noprofiling.config' }
+    test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' }
 }

 // Load igenomes.config if required
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -319,7 +319,8 @@
        },
        "shortread_complexityfilter_tool": {
            "type": "string",
-            "default": "bbduk"
+            "default": "bbduk",
+            "enum": ["bbduk", "prinseqplusplus", "fastp"]
        },
        "shortread_complexityfilter_bbduk_windowsize": {
            "type": "integer",
@ -404,6 +405,10 @@
        "longread_hostremoval_index": {
            "type": "string",
            "default": "None"
+        },
+        "shortread_complexityfilter_fastp_threshold": {
+            "type": "integer",
+            "default": 30
        }
    }
 }
--- a/subworkflows/local/shortread_complexityfiltering.nf
+++ b/subworkflows/local/shortread_complexityfiltering.nf
@ -13,6 +13,7 @@ workflow SHORTREAD_COMPLEXITYFILTERING {
    ch_versions       = Channel.empty()
    ch_multiqc_files  = Channel.empty()

+    // fastp complexity filtering is activated via modules.config in shortread_preprocessing
    if ( params.shortread_complexityfilter_tool == 'bbduk' ) {
        ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads
        ch_versions        =  ch_versions.mix( BBMAP_BBDUK.out.versions.first() )
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -19,9 +19,12 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 // Check mandatory parameters
 if (params.input    ) { ch_input     = file(params.input)     } else { exit 1, 'Input samplesheet not specified!' }
 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
+
 if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files."
 if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs"

+if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' && ( params.perform_shortread_clipmerge == false || params.shortread_clipmerge_tool != 'fastp' ))  exit 1, "ERROR: [nf-core/taxprofiler] cannot use fastp complexity filtering if preprocessing not turned on and/or tool is not fastp. Please specify --perform_shortread_clipmerge and/or --shortread_clipmerge_tool 'fastp'"
+
 if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." }
 if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." }

@ -131,7 +134,8 @@ workflow TAXPROFILER {
        SUBWORKFLOW: COMPLEXITY FILTERING
    */

-    if ( params.perform_shortread_complexityfilter ) {
+    // fastp complexity filtering is activated via modules.config in shortread_preprocessing
+    if ( params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp' ) {
        ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads
        ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions )
    } else {
@ -228,7 +232,7 @@ workflow TAXPROFILER {
        ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )
    }

-    if (params.perform_shortread_complexityfilter){
+    if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp'){
        ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) )
    }