Add FASTP complexity option

2024-11-25 19:09:56 +00:00 · 2022-05-07 06:09:05 +02:00 · 2022-05-07 06:09:05 +02:00 · 47a5ae0cff
commit 47a5ae0cff
parent e7b54801ed
9 changed files with 153 additions and 14 deletions
--- a/conf/modules.config
+++ b/conf/modules.config
@ -54,7 +54,8 @@ process {
            params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "",
            params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
            // filtering options
-            "--length_required ${params.shortread_clipmerge_minlength}"
+            "--length_required ${params.shortread_clipmerge_minlength}",
           params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
        ].join(' ').trim()
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
@ -74,7 +75,8 @@ process {
            params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "",
            params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe",
            // filtering options
-            "--length_required ${params.shortread_clipmerge_minlength}"
+            "--length_required ${params.shortread_clipmerge_minlength}",
           params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : ''
        ].join(' ').trim()
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
--- a/conf/test.config
+++ b/conf/test.config
@ -29,6 +29,7 @@ params {
    perform_shortread_complexityfilter    = true
    perform_shortread_hostremoval         = true
    perform_longread_hostremoval          = true
    perform_runmerging                    = true
    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
    run_kaiju                             = true
    run_kraken2                           = true
--- a/conf/test_nopreprocessing.config
+++ b/conf/test_nopreprocessing.config
@ -0,0 +1,46 @@
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Nextflow config file for running minimal tests
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Defines input files and everything required to run a fast and simple pipeline test.
    Use as follows:
        nextflow run nf-core/taxprofiler -profile test_nopreprocessing,<docker/singularity> --outdir <OUTDIR>
 ----------------------------------------------------------------------------------------
 */
 params {
    config_profile_name        = 'Test profile'
    config_profile_description = 'Minimal test dataset skipping all preprocessing to check pipeline function'
    // Limit resources so that this can run on GitHub Actions
    max_cpus   = 2
    max_memory = '6.GB'
    max_time   = '6.h'
    // Input data
    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
    // TODO nf-core: Give any required params for the test so that command line flags are not needed
    input                                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
    databases                             = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
    perform_shortread_clipmerge           = false
    perform_longread_clip                 = false
    perform_shortread_complexityfilter    = false
    perform_shortread_hostremoval         = false
    perform_longread_hostremoval          = false
    perform_runmerging                    = false
    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
    run_kaiju                             = true
    run_kraken2                           = true
    run_malt                              = true
    run_metaphlan3                        = true
    run_centrifuge                        = true
    run_diamond                           = true
 }
 process {
    withName: MALT_RUN {
        maxForks = 1
    }
 }
--- a/conf/test_noprofiling.config
+++ b/conf/test_noprofiling.config
@ -0,0 +1,46 @@
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Nextflow config file for running minimal tests
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Defines input files and everything required to run a fast and simple pipeline test.
    Use as follows:
        nextflow run nf-core/taxprofiler -profile test_noprofiling,<docker/singularity> --outdir <OUTDIR>
 ----------------------------------------------------------------------------------------
 */
 params {
    config_profile_name        = 'Test profile'
    config_profile_description = 'Minimal test dataset without performing any profiling to check pipeline function'
    // Limit resources so that this can run on GitHub Actions
    max_cpus   = 2
    max_memory = '6.GB'
    max_time   = '6.h'
    // Input data
    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
    // TODO nf-core: Give any required params for the test so that command line flags are not needed
    input                                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
    databases                             = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
    perform_shortread_clipmerge           = true
    perform_longread_clip                 = true
    perform_shortread_complexityfilter    = true
    perform_shortread_hostremoval         = true
    perform_longread_hostremoval          = true
    perform_runmerging                    = true
    hostremoval_reference                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
    run_kaiju                             = false
    run_kraken2                           = false
    run_malt                              = false
    run_metaphlan3                        = false
    run_centrifuge                        = false
    run_diamond                           = false
 }
 process {
    withName: MALT_RUN {
        maxForks = 1
    }
 }
--- a/docs/usage.md
+++ b/docs/usage.md
@ -183,11 +183,11 @@ Complexity filtering can be activated via the `--perform_shortread_complexityfil
 Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources.
-There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/).
+There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter).
 The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of each tool (see links above) to decide on optimal methods and parameters for your dataset.
-You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`.
+You can optionally save the FASTQ output of the complexity filtering with the `--save_complexityfiltered_reads` flag. If running with `fastp`, complexity filtering happens within the earlier short-read preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity filtered reads will instead be in the `fastp/` folder, in the same file(s) as the preprocessed reads.
 #### Host Removal
--- a/nextflow.config
+++ b/nextflow.config
@ -74,6 +74,7 @@ params {
    shortread_complexityfilter_bbduk_mask                = false
    shortread_complexityfilter_prinseqplusplus_mode      = 'entropy'
    shortread_complexityfilter_prinseqplusplus_dustscore = 0.5
    shortread_complexityfilter_fastp_threshold           = 30
    save_complexityfiltered_reads                        = false
    // run merging
@ -185,6 +186,8 @@ profiles {
    }
    test      { includeConfig 'conf/test.config'      }
    test_full { includeConfig 'conf/test_full.config' }
    test_noprofiling { includeConfig 'conf/test_noprofiling.config' }
    test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' }
 }
 // Load igenomes.config if required
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -10,7 +10,10 @@
            "type": "object",
            "fa_icon": "fas fa-terminal",
            "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir"],
+            "required": [
                "input",
                "outdir"
            ],
            "properties": {
                "input": {
                    "type": "string",
@ -173,7 +176,14 @@
                    "description": "Method used to save pipeline results to output directory.",
                    "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                    "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
                        "symlink",
                        "rellink",
                        "link",
                        "copy",
                        "copyNoFollow",
                        "move"
                    ],
                    "hidden": true
                },
                "email_on_fail": {
@ -294,7 +304,10 @@
        "shortread_clipmerge_tool": {
            "type": "string",
            "default": "fastp",
-            "enum": ["fastp", "adapterremoval"]
+            "enum": [
                "fastp",
                "adapterremoval"
            ]
        },
        "shortread_clipmerge_skipadaptertrim": {
            "type": "boolean"
@ -319,7 +332,12 @@
        },
        "shortread_complexityfilter_tool": {
            "type": "string",
-            "default": "bbduk"
+            "default": "bbduk",
            "enum": [
                "bbduk",
                "prinseqplusplus",
                "fastp"
            ]
        },
        "shortread_complexityfilter_bbduk_windowsize": {
            "type": "integer",
@ -335,7 +353,10 @@
        "shortread_complexityfilter_prinseqplusplus_mode": {
            "type": "string",
            "default": "entropy",
-            "enum": ["entropy", "dust"]
+            "enum": [
                "entropy",
                "dust"
            ]
        },
        "shortread_complexityfilter_prinseqplusplus_dustscore": {
            "type": "number",
@ -391,7 +412,14 @@
        "kaiju_taxon_name": {
            "type": "string",
            "default": "species",
-            "enum": ["phylum", "class", "order", "family", "genus", "species"]
+            "enum": [
                "phylum",
                "class",
                "order",
                "family",
                "genus",
                "species"
            ]
        },
        "run_diamond": {
            "type": "boolean"
@ -399,7 +427,15 @@
        "diamond_output_format": {
            "type": "string",
            "default": "tsv",
-            "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"]
+            "enum": [
                "blast",
                "xml",
                "txt",
                "daa",
                "sam",
                "tsv",
                "paf"
            ]
        },
        "longread_hostremoval_index": {
            "type": "string",
--- a/subworkflows/local/shortread_complexityfiltering.nf
+++ b/subworkflows/local/shortread_complexityfiltering.nf
@ -13,6 +13,7 @@ workflow SHORTREAD_COMPLEXITYFILTERING {
    ch_versions       = Channel.empty()
    ch_multiqc_files  = Channel.empty()
    // fastp complexity filtering is activated via conf/modules.config in shortread_preprocessing
    if ( params.shortread_complexityfilter_tool == 'bbduk' ) {
        ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads
        ch_versions        =  ch_versions.mix( BBMAP_BBDUK.out.versions.first() )
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -19,9 +19,12 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 // Check mandatory parameters
 if (params.input    ) { ch_input     = file(params.input)     } else { exit 1, 'Input samplesheet not specified!' }
 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
 // Non-fatal: only warns about how MALT will treat paired reads
 if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files."
 // Excluding unmerged reads is only meaningful when pair merging is switched on
 if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot exclude unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs"
 // fastp complexity filtering runs inside the fastp clip/merge step, so it is only validated when
 // complexity filtering has actually been requested (same gating as in conf/modules.config)
 if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' && ( params.perform_shortread_clipmerge == false || params.shortread_clipmerge_tool != 'fastp' ))  exit 1, "ERROR: [nf-core/taxprofiler] cannot use fastp complexity filtering if preprocessing not turned on and/or tool is not fastp. Please specify --perform_shortread_clipmerge and/or --shortread_clipmerge_tool 'fastp'"
 // Short-read host removal needs a reference FASTA
 if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --perform_shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." }
 // NOTE(review): the condition reads params.hostremoval_reference_index while the message names --shortread_hostremoval_index; confirm which parameter is intended
 if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." }
@ -131,7 +134,8 @@ workflow TAXPROFILER {
        SUBWORKFLOW: COMPLEXITY FILTERING
    */
-    if ( params.perform_shortread_complexityfilter ) {
+    // fastp complexity filtering is activated via conf/modules.config in shortread_preprocessing
    if ( params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp' ) {
        ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads
        ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions )
    } else {
@ -228,7 +232,7 @@ workflow TAXPROFILER {
        ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )
    }
-    if (params.perform_shortread_complexityfilter){
+    if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp'){
        ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) )
    }