Merge pull request #148 from genomic-medicine-sweden/add_falco

Add Falco as an alternative to FastQC
2024-11-26 03:09:56 +00:00 · 2022-10-25 12:40:26 +02:00 · 2022-10-25 12:40:26 +02:00 · 63c260bfbc
commit 63c260bfbc
parent 4f27998852 de6816ec49
13 changed files with 192 additions and 16 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -23,6 +23,7 @@ jobs:
          - "21.10.3"
          - "latest-everything"
        parameters:
          - "--preprocessing_qc_tool falco"
          - "--perform_longread_qc false"
          - "--perform_shortread_qc false"
          - "--shortread_qc_tool fastp"
--- a/CITATIONS.md
+++ b/CITATIONS.md
@ -62,6 +62,10 @@
 - [FILTLONG](https://github.com/rrwick/Filtlong)
 - [falco](https://doi.org/10.12688/f1000research.21142.2)
 > de Sena Brandine G and Smith AD. Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Research 2021, 8:1874
 ## Software packaging/containerisation tools
 - [Anaconda](https://anaconda.com)
--- a/README.md
+++ b/README.md
@ -30,7 +30,7 @@ On release, automated continuous integration tests run the pipeline on a full-si
 ![](docs/images/taxprofiler_tube.png)
-1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
+1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) or [`falco`](https://github.com/smithlabcode/falco) as an alternative option)
 2. Performs optional read pre-processing
   - Adapter clipping and merging (short-read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long-read: [porechop](https://github.com/rrwick/Porechop))
   - Low complexity and quality filtering (short-read: [bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus); long-read: [Filtlong](https://github.com/rrwick/Filtlong))
--- a/conf/modules.config
+++ b/conf/modules.config
@ -40,6 +40,24 @@ process {
        ]
    }
    // falco run on the raw (unprocessed) reads.
    withName: FALCO {
        // Only the HTML report and text tables are published, under <outdir>/falco/raw.
        publishDir = [
            mode: params.publish_dir_mode,
            pattern: '*.{html,txt}',
            path: { "${params.outdir}/falco/raw" }
        ]
        // Output prefix: sample id plus run accession, tagged as raw.
        ext.prefix = { "${meta.id}_${meta.run_accession}_raw" }
    }
    // falco run on reads after preprocessing.
    withName: FALCO_PROCESSED {
        // Only the HTML report and text tables are published, under <outdir>/falco/processed.
        publishDir = [
            mode: params.publish_dir_mode,
            pattern: '*.{html,txt}',
            path: { "${params.outdir}/falco/processed" }
        ]
        // Output prefix: sample id plus run accession, tagged as processed.
        ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
    }
    withName: FASTP_SINGLE {
        ext.args   = [
            // trimming options
--- a/docs/usage.md
+++ b/docs/usage.md
@ -166,6 +166,10 @@ work                # Directory containing the nextflow working files
 # Other nextflow hidden files, eg. history of pipeline runs and old logs.
 ```
 ### Sequencing quality control
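 nf-core/taxprofiler offers [`falco`](https://github.com/smithlabcode/falco) as an alternative option to [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) for sequencing quality control. The tool is selected with `--preprocessing_qc_tool` (`fastqc` by default, or `falco`).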
 ### Preprocessing Steps
 nf-core/taxprofiler offers four main preprocessing steps
@ -179,7 +183,7 @@ nf-core/taxprofiler offers four main preprocessing steps
 Raw sequencing read processing in the form of adapter clipping and paired-end read merging can be activated via the `--perform_shortread_qc` or `--perform_longread_qc` flags.
-It is highly recommended to run this on raw reads to remove artefacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles.
+It is highly recommended to run this on raw reads to remove artifacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles.
 There are currently two options for short-read preprocessing: `fastp` or `adapterremoval`.
--- a/modules.json
+++ b/modules.json
@ -49,6 +49,10 @@
                        "branch": "master",
                        "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
                    },
                    "falco": {
                        "branch": "master",
                        "git_sha": "fc959214036403ad83efe7a41d43d0606c445cda"
                    },
                    "fastp": {
                        "branch": "master",
                        "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
--- a/modules/nf-core/falco/main.nf
+++ b/modules/nf-core/falco/main.nf
@ -0,0 +1,57 @@
 process FALCO {
    // Runs falco (a FastQC-style read QC tool) on the read file(s) of one sample.
    //   input : [ meta, reads ] - sample meta map plus its read file(s)
    //   output: html     - [ meta, *.html ] report file(s)
    //           txt      - [ meta, *.txt  ] data and summary tables
    //           versions - versions.yml recording the falco version
    tag "$meta.id"
    label 'process_single'
    conda (params.enable_conda ? "bioconda::falco=1.2.1" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/falco:1.2.1--h867801b_3':
        'quay.io/biocontainers/falco:1.2.1--h867801b_3' }"
    input:
    tuple val(meta), path(reads)
    output:
    tuple val(meta), path("*.html"), emit: html
    tuple val(meta), path("*.txt") , emit: txt
    path  "versions.yml"           , emit: versions
    when:
    task.ext.when == null || task.ext.when
    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // A single input file gets explicitly named outputs derived from `prefix`.
    // With several input files no output names are passed, so falco picks them.
    if ( reads.toList().size() == 1 ) {
        """
        falco $args --threads $task.cpus ${reads} -D ${prefix}_data.txt -S ${prefix}_summary.txt -R ${prefix}_report.html
        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            falco:\$( falco --version | sed -e "s/falco//g" )
        END_VERSIONS
        """
    } else {
        """
        falco $args --threads $task.cpus ${reads}
        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            falco:\$( falco --version | sed -e "s/falco//g" )
        END_VERSIONS
        """
    }
    stub:
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Placeholder outputs use the same file names as the single-file script branch,
    // and the version is extracted with the same sed expression as the script block.
    """
    touch ${prefix}_data.txt
    touch ${prefix}_report.html
    touch ${prefix}_summary.txt
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        falco:\$( falco --version | sed -e "s/falco//g" )
    END_VERSIONS
    """
 }
--- a/modules/nf-core/falco/meta.yml
+++ b/modules/nf-core/falco/meta.yml
@ -0,0 +1,52 @@
 # Module metadata for the FALCO process (modules/nf-core/falco/main.nf).
 name: falco
 description: Run falco on sequenced reads
 keywords:
  - quality control
  - qc
  - adapters
  - fastq
 tools:
  # The tool entry is keyed by the tool this module wraps (falco, not fastqc).
  - falco:
      description: "falco is a drop-in C++ implementation of FastQC to assess the quality of sequence reads."
      homepage: "https://falco.readthedocs.io/"
      documentation: "https://falco.readthedocs.io/"
      tool_dev_url: "https://github.com/smithlabcode/falco"
      doi: "10.12688/f1000research.21142.2"
      licence: "['GPL v3']"
 input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: |
        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
        respectively.
 output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  # Single-file runs write <prefix>_report.html, so the pattern must not require "fastqc_".
  - html:
      type: file
      description: FastQC like report
      pattern: "*_{report.html}"
  # Both text outputs below are emitted through the process's single `txt` channel.
  - txt:
      type: file
      description: falco report data
      pattern: "*_{data.txt}"
  - txt:
      type: file
      description: falco summary file
      pattern: "*_{summary.txt}"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
 authors:
  - "@lucacozzuto"
--- a/nextflow.config
+++ b/nextflow.config
@ -59,6 +59,8 @@ params {
    // Databases
    databases = null
    preprocessing_qc_tool            = 'fastqc'
    // FASTQ preprocessing
    perform_shortread_qc             = false
    shortread_qc_tool                = 'fastp'
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -707,5 +707,14 @@
        {
            "$ref": "#/definitions/reference_genome_options"
        }
-    ]
+    ],
    "properties": {
        "preprocessing_qc_tool": {
            "type": "string",
            "default": "fastqc",
            "enum": ["fastqc", "falco"],
            "help_text": "Falco is designed as a drop-in replacement for FastQC but is written in C++ for faster computation. We particularly recommend using falco for long reads (due to its lower memory requirements); however, it is also applicable to short reads.",
            "description": "Specify the tool used for quality control of raw sequencing reads"
        }
    }
 }
--- a/subworkflows/local/longread_preprocessing.nf
+++ b/subworkflows/local/longread_preprocessing.nf
@ -3,6 +3,8 @@
 //
 include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main'
 include { FALCO as FALCO_PROCESSED   } from '../../modules/nf-core/falco/main'
 include { PORECHOP                   } from '../../modules/nf-core/porechop/main'
 include { FILTLONG                   } from '../../modules/nf-core/filtlong/main'
@ -52,8 +54,16 @@ workflow LONGREAD_PREPROCESSING {
        ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log )
    }
-    FASTQC_PROCESSED ( ch_processed_reads )
+    if (params.preprocessing_qc_tool == 'fastqc') {
-    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip )
+        FASTQC_PROCESSED ( ch_processed_reads )
        ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
        ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip )
    } else if (params.preprocessing_qc_tool == 'falco') {
        FALCO_PROCESSED ( ch_processed_reads )
        ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions )
        ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt )
    }
    emit:
    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@ -5,7 +5,8 @@
 include { SHORTREAD_FASTP             } from './shortread_fastp'
 include { SHORTREAD_ADAPTERREMOVAL    } from './shortread_adapterremoval'
-include { FASTQC as FASTQC_PROCESSED       } from '../../modules/nf-core/fastqc/main'
+include { FASTQC as FASTQC_PROCESSED  } from '../../modules/nf-core/fastqc/main'
 include { FALCO as FALCO_PROCESSED    } from '../../modules/nf-core/falco/main'
 workflow SHORTREAD_PREPROCESSING {
    take:
@ -27,9 +28,15 @@ workflow SHORTREAD_PREPROCESSING {
        ch_processed_reads = reads
    }
-    FASTQC_PROCESSED ( ch_processed_reads )
+    if (params.preprocessing_qc_tool == 'fastqc') {
-    ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
+        FASTQC_PROCESSED ( ch_processed_reads )
-    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip )
+        ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
        ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip )
    } else if  (params.preprocessing_qc_tool == 'falco') {
        FALCO_PROCESSED ( ch_processed_reads )
        ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions )
        ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt )
    }
    emit:
    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -84,6 +84,7 @@ include { STANDARDISATION_PROFILES      } from '../subworkflows/local/standardis
 // MODULE: Installed directly from nf-core/modules
 //
 include { FASTQC                      } from '../modules/nf-core/fastqc/main'
 include { FALCO                       } from '../modules/nf-core/falco/main'
 include { MULTIQC                     } from '../modules/nf-core/multiqc/main'
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
 include { CAT_FASTQ                   } from '../modules/nf-core/cat/fastq/main'
@ -120,12 +121,13 @@ workflow TAXPROFILER {
    */
    ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore )
-    FASTQC (
+    if ( params.preprocessing_qc_tool == 'falco' ) {
-        ch_input_for_fastqc
+        FALCO ( ch_input_for_fastqc )
-    )
+        ch_versions = ch_versions.mix(FALCO.out.versions.first())
-
+    } else {
-    ch_versions = ch_versions.mix(FASTQC.out.versions.first())
+        FASTQC ( ch_input_for_fastqc )
-
+        ch_versions = ch_versions.mix(FASTQC.out.versions.first())
    }
    /*
        SUBWORKFLOW: PERFORM PREPROCESSING
    */
@ -254,7 +256,13 @@ workflow TAXPROFILER {
    ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
    ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml'))
    ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
-    ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
+
    if ( params.preprocessing_qc_tool == 'falco' ) {
        ch_multiqc_files = ch_multiqc_files.mix(FALCO.out.txt.collect{it[1]}.ifEmpty([]))
    } else {
        ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
    }
    if (params.perform_shortread_qc) {
        ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )