Add Falco as an alternative to FastQC

2024-11-25 06:09:54 +00:00 · 2022-10-18 17:43:16 +02:00 · 2022-10-18 17:43:16 +02:00 · 8eddb32b88
commit 8eddb32b88
parent 4f27998852
12 changed files with 217 additions and 22 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -23,6 +23,7 @@ jobs:
          - "21.10.3"
          - "latest-everything"
        parameters:
+          - "--perform_fastqc_alternative false"
          - "--perform_longread_qc false"
          - "--perform_shortread_qc false"
          - "--shortread_qc_tool fastp"
--- a/CITATIONS.md
+++ b/CITATIONS.md
@ -62,6 +62,10 @@

 - [FILTLONG](https://github.com/rrwick/Filtlong)

+- [Falco](https://doi.org/10.12688/f1000research.21142.2)
+
+> de Sena Brandine G and Smith AD. Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Research 2021, 8:1874
+
 ## Software packaging/containerisation tools

 - [Anaconda](https://anaconda.com)
--- a/README.md
+++ b/README.md
@ -30,7 +30,7 @@ On release, automated continuous integration tests run the pipeline on a full-si

 ![](docs/images/taxprofiler_tube.png)

-1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
+1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) or [`Falco`](https://github.com/smithlabcode/falco) as an alternative option)
 2. Performs optional read pre-processing
   - Adapter clipping and merging (short-read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long-read: [porechop](https://github.com/rrwick/Porechop))
   - Low complexity and quality filtering (short-read: [bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus); long-read: [Filtlong](https://github.com/rrwick/Filtlong))
--- a/conf/modules.config
+++ b/conf/modules.config
@ -40,6 +40,24 @@ process {
        ]
    }

+    withName: FALCO {  // falco reports for the input reads, published under <outdir>/falco/raw
+        ext.prefix = { "${meta.id}_${meta.run_accession}_raw" }
+        publishDir = [
+            path: { "${params.outdir}/falco/raw" },
+            mode: params.publish_dir_mode,
+            pattern: '*.{html,txt}'
+        ]
+    }
+
+    withName: FALCO_PROCESSED {  // falco reports written under <outdir>/falco/processed
+        publishDir = [
+            pattern: '*.{html,txt}',
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/falco/processed" }
+        ]
+        // Output names carry sample id, run accession and a "_processed" suffix.
+        ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
+
    withName: FASTP_SINGLE {
        ext.args   = [
            // trimming options
--- a/docs/usage.md
+++ b/docs/usage.md
@ -165,7 +165,9 @@ work                # Directory containing the nextflow working files
 .nextflow_log       # Log file from Nextflow
 # Other nextflow hidden files, eg. history of pipeline runs and old logs.
 ```
+### Sequencing quality control

+nf-core/taxprofiler offers [`Falco`](https://github.com/smithlabcode/falco) as an alternative option to [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Falco is used instead of FastQC when `--perform_fastqc_alternative` is set.
 ### Preprocessing Steps

 nf-core/taxprofiler offers four main preprocessing steps
@ -179,7 +181,7 @@ nf-core/taxprofiler offers four main preprocessing steps

 Raw sequencing read processing in the form of adapter clipping and paired-end read merging can be activated via the `--perform_shortread_qc` or `--perform_longread_qc` flags.

-It is highly recommended to run this on raw reads to remove artefacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles.
+It is highly recommended to run this on raw reads to remove artifacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles.

 There are currently two options for short-read preprocessing: `fastp` or `adapterremoval`.

--- a/modules.json
+++ b/modules.json
@ -49,6 +49,10 @@
                        "branch": "master",
                        "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
                    },
+                    "falco": {
+                        "branch": "master",
+                        "git_sha": "fc959214036403ad83efe7a41d43d0606c445cda"
+                    },
                    "fastp": {
                        "branch": "master",
                        "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
--- a/modules/nf-core/falco/main.nf
+++ b/modules/nf-core/falco/main.nf
@ -0,0 +1,57 @@
+process FALCO {
+    tag "$meta.id"
+    label 'process_single'
+
+    // Runs falco on the reads of one sample; conda and both container images pin the same 1.2.1 build.
+    conda (params.enable_conda ? "bioconda::falco=1.2.1" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/falco:1.2.1--h867801b_3':
+        'quay.io/biocontainers/falco:1.2.1--h867801b_3' }"
+
+    input:
+    tuple val(meta), path(reads)
+
+    output:
+    tuple val(meta), path("*.html"), emit: html
+    tuple val(meta), path("*.txt") , emit: txt
+    path  "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+    // One input file: report names are built from `prefix`; otherwise no output names are passed to falco.
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    if ( reads.toList().size() == 1 ) {
+        """
+        falco $args --threads $task.cpus ${reads} -D ${prefix}_data.txt -S ${prefix}_summary.txt -R ${prefix}_report.html
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            falco:\$( falco --version | sed -e "s/falco//g" )
+        END_VERSIONS
+        """
+    } else {
+        """
+        falco $args --threads $task.cpus ${reads}
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            falco:\$( falco --version | sed -e "s/falco//g" )
+        END_VERSIONS
+        """
+    }
+    // Stub: placeholder outputs named like the single-file branch, with the same version command as the real script.
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}_data.txt
+    touch ${prefix}_report.html
+    touch ${prefix}_summary.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        falco:\$( falco --version | sed -e "s/falco//g" )
+    END_VERSIONS
+    """
+}
--- a/modules/nf-core/falco/meta.yml
+++ b/modules/nf-core/falco/meta.yml
@ -0,0 +1,52 @@
+name: falco
+description: Run falco on sequenced reads
+keywords:
+  - quality control
+  - qc
+  - adapters
+  - fastq
+tools:
+  - falco:
+      description: "falco is a drop-in C++ implementation of FastQC to assess the quality of sequence reads."
+
+      homepage: "https://falco.readthedocs.io/"
+      documentation: "https://falco.readthedocs.io/"
+      tool_dev_url: "https://github.com/smithlabcode/falco"
+      doi: "10.12688/f1000research.21142.2"
+      licence: "['GPL v3']"
+# Channel elements consumed by the process: a meta map plus the read file(s).
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+        respectively.
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - html:
+      type: file
+      description: FastQC like report
+      pattern: "*_report.html"
+  - txt:
+      type: file
+      description: falco report data
+      pattern: "*_data.txt"
+  - txt:
+      type: file
+      description: falco summary file
+      pattern: "*_summary.txt"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@lucacozzuto"
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -10,7 +10,11 @@
            "type": "object",
            "fa_icon": "fas fa-terminal",
            "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir", "databases"],
+            "required": [
+                "input",
+                "outdir",
+                "databases"
+            ],
            "properties": {
                "input": {
                    "type": "string",
@ -80,7 +84,10 @@
                "shortread_qc_tool": {
                    "type": "string",
                    "default": "fastp",
-                    "enum": ["fastp", "adapterremoval"],
+                    "enum": [
+                        "fastp",
+                        "adapterremoval"
+                    ],
                    "fa_icon": "fas fa-tools",
                    "description": "Specify which tool to use for short-read QC"
                },
@ -133,7 +140,11 @@
                "shortread_complexityfilter_tool": {
                    "type": "string",
                    "default": "bbduk",
-                    "enum": ["bbduk", "prinseqplusplus", "fastp"],
+                    "enum": [
+                        "bbduk",
+                        "prinseqplusplus",
+                        "fastp"
+                    ],
                    "fa_icon": "fas fa-hammer",
                    "description": "Specify which tool to use for complexity filtering"
                },
@ -167,7 +178,10 @@
                "shortread_complexityfilter_prinseqplusplus_mode": {
                    "type": "string",
                    "default": "entropy",
-                    "enum": ["entropy", "dust"],
+                    "enum": [
+                        "entropy",
+                        "dust"
+                    ],
                    "fa_icon": "fas fa-check-square",
                    "description": "Specify the complexity filter mode for PRINSEQ++"
                },
@ -341,7 +355,15 @@
                "diamond_output_format": {
                    "type": "string",
                    "default": "tsv",
-                    "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"],
+                    "enum": [
+                        "blast",
+                        "xml",
+                        "txt",
+                        "daa",
+                        "sam",
+                        "tsv",
+                        "paf"
+                    ],
                    "fa_icon": "fas fa-file",
                    "description": "Specify output format from DIAMOND profiling.",
                    "help_text": "DIAMOND can produce output in a number of different formats, you can specify here which to produce.\n\nNote that DIAMOND can only produce one format at a time, and depending on which you pick, some downstream steps may not be executed. For example, selecting `daa` or `sam` will mean you will not get a tabular taxonomic profile as with the other tools.\n\nWill be overriden by `--diamond_save_reads.`\n\n> Modifies tool parameter(s):\n> - diamond blastx: `--outfmt`"
@ -360,7 +382,14 @@
                "kaiju_taxon_rank": {
                    "type": "string",
                    "default": "species",
-                    "enum": ["phylum", "class", "order", "family", "genus", "species"],
+                    "enum": [
+                        "phylum",
+                        "class",
+                        "order",
+                        "family",
+                        "genus",
+                        "species"
+                    ],
                    "fa_icon": "fas fa-tag",
                    "description": "Specify taxonomic rank to be displayed in Kaiju taxon table",
                    "help_text": "Specify the taxonomic level(s) to be displayed in the resulting Kaiju taxon table, as generated by the kaiju2table helper tool.\n\nThis can be either a single level (e.g. `species`), or a comma separated list to display the full taxonomic path (e.g. `superkingdom,phylum,class,order,family,genus,species.`).\n\n> Modifies tool parameter(s):\n> - kaiju2table: `-l`"
@ -555,7 +584,14 @@
                    "description": "Method used to save pipeline results to output directory.",
                    "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                    "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
+                        "symlink",
+                        "rellink",
+                        "link",
+                        "copy",
+                        "copyNoFollow",
+                        "move"
+                    ],
                    "hidden": true
                },
                "email_on_fail": {
--- a/subworkflows/local/longread_preprocessing.nf
+++ b/subworkflows/local/longread_preprocessing.nf
@ -3,6 +3,8 @@
 //

 include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main'
+include { FALCO as FALCO_PROCESSED   } from '../../modules/nf-core/falco/main'
+
 include { PORECHOP                   } from '../../modules/nf-core/porechop/main'
 include { FILTLONG                   } from '../../modules/nf-core/filtlong/main'

@ -52,8 +54,14 @@ workflow LONGREAD_PREPROCESSING {
        ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log )
    }

+    if (params.perform_fastqc_alternative) {
+        FALCO_PROCESSED ( ch_processed_reads )
+        ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt )
+
+    } else {
        FASTQC_PROCESSED ( ch_processed_reads )
        ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip )
+    }

    emit:
    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@ -6,6 +6,7 @@
 include { SHORTREAD_FASTP             } from './shortread_fastp'
 include { SHORTREAD_ADAPTERREMOVAL    } from './shortread_adapterremoval'
 include { FASTQC as FASTQC_PROCESSED  } from '../../modules/nf-core/fastqc/main'
+include { FALCO as FALCO_PROCESSED    } from '../../modules/nf-core/falco/main'

 workflow SHORTREAD_PREPROCESSING {
    take:
@ -27,9 +28,16 @@ workflow SHORTREAD_PREPROCESSING {
        ch_processed_reads = reads
    }

+    if (params.perform_fastqc_alternative) {
+        FALCO_PROCESSED ( ch_processed_reads )
+        ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions )
+        ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt )
+
+    } else {
        FASTQC_PROCESSED ( ch_processed_reads )
        ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
        ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip )
+    }

    emit:
    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -84,6 +84,7 @@ include { STANDARDISATION_PROFILES      } from '../subworkflows/local/standardis
 // MODULE: Installed directly from nf-core/modules
 //
 include { FASTQC                      } from '../modules/nf-core/fastqc/main'
+include { FALCO                       } from '../modules/nf-core/falco/main'
 include { MULTIQC                     } from '../modules/nf-core/multiqc/main'
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
 include { CAT_FASTQ                   } from '../modules/nf-core/cat/fastq/main'
@ -120,12 +121,13 @@ workflow TAXPROFILER {
    */
    ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore )

-    FASTQC (
-        ch_input_for_fastqc
-    )
-
+    if ( params.perform_fastqc_alternative ) {
+        FALCO ( ch_input_for_fastqc )
+        ch_versions = ch_versions.mix(FALCO.out.versions.first())
+    } else {
+        FASTQC ( ch_input_for_fastqc )
        ch_versions = ch_versions.mix(FASTQC.out.versions.first())
-
+    }
    /*
        SUBWORKFLOW: PERFORM PREPROCESSING
    */
@ -254,7 +256,10 @@ workflow TAXPROFILER {
    ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
    ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml'))
    ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
+
+    if (!params.perform_fastqc_alternative) {
        ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
+    }

    if (params.perform_shortread_qc) {
        ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )