diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5cea5b7..862e072 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,6 +23,7 @@ jobs: - "21.10.3" - "latest-everything" parameters: + - "--preprocessing_qc_tool falco" - "--perform_longread_qc false" - "--perform_shortread_qc false" - "--shortread_qc_tool fastp" diff --git a/CITATIONS.md b/CITATIONS.md index 1ce4ec2..599e049 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -62,6 +62,10 @@ - [FILTLONG](https://github.com/rrwick/Filtlong) +- [falco](https://doi.org/10.12688/f1000research.21142.2) + +> de Sena Brandine G and Smith AD. Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Research 2021, 8:1874 + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index 2bd45a9..3f0a840 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ On release, automated continuous integration tests run the pipeline on a full-si ![](docs/images/taxprofiler_tube.png) -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) or [`falco`](https://github.com/smithlabcode/falco) as an alternative option) 2. Performs optional read pre-processing - Adapter clipping and merging (short-read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long-read: [porechop](https://github.com/rrwick/Porechop)) - Low complexity and quality filtering (short-read: [bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus); long-read: [Filtlong](https://github.com/rrwick/Filtlong)) diff --git a/conf/modules.config b/conf/modules.config index d2a0051..b1d7b87 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -40,6 +40,24 @@ process { ] } + withName: FALCO { + ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } + publishDir = [ + path: { "${params.outdir}/falco/raw" }, + mode: params.publish_dir_mode, + pattern: '*.{html,txt}' + ] + } + + withName: FALCO_PROCESSED { + ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } + publishDir = [ + path: { "${params.outdir}/falco/processed" }, + mode: params.publish_dir_mode, + pattern: '*.{html,txt}' + ] + } + withName: FASTP_SINGLE { ext.args = [ // trimming options diff --git a/docs/usage.md b/docs/usage.md index 9194220..32fec4d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -166,6 +166,10 @@ work # Directory containing the nextflow working files # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` +### Sequencing quality control + +nf-core taxprofiler offers [`falco`](https://github.com/smithlabcode/falco] as an alternative option to [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). + ### Preprocessing Steps nf-core/taxprofiler offers four main preprocessing steps @@ -179,7 +183,7 @@ nf-core/taxprofiler offers four main preprocessing steps Raw sequencing read processing in the form of adapter clipping and paired-end read merging can be activated via the `--perform_shortread_qc` or `--perform_longread_qc` flags. -It is highly recommended to run this on raw reads to remove artefacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles. +It is highly recommended to run this on raw reads to remove artifacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles. There are currently two options for short-read preprocessing: `fastp` or `adapterremoval`. diff --git a/modules.json b/modules.json index 10d6c74..aa77c4e 100644 --- a/modules.json +++ b/modules.json @@ -49,6 +49,10 @@ "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" }, + "falco": { + "branch": "master", + "git_sha": "fc959214036403ad83efe7a41d43d0606c445cda" + }, "fastp": { "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" diff --git a/modules/nf-core/falco/main.nf b/modules/nf-core/falco/main.nf new file mode 100644 index 0000000..1688162 --- /dev/null +++ b/modules/nf-core/falco/main.nf @@ -0,0 +1,57 @@ +process FALCO { + tag "$meta.id" + label 'process_single' + + + conda (params.enable_conda ? "bioconda::falco=1.2.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/falco:1.2.1--h867801b_3': + 'quay.io/biocontainers/falco:1.2.1--h867801b_3' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.html"), emit: html + tuple val(meta), path("*.txt") , emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ( reads.toList().size() == 1 ) { + """ + falco $args --threads $task.cpus ${reads} -D ${prefix}_data.txt -S ${prefix}_summary.txt -R ${prefix}_report.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco:\$( falco --version | sed -e "s/falco//g" ) + END_VERSIONS + """ + } else { + """ + falco $args --threads $task.cpus ${reads} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco:\$( falco --version | sed -e "s/falco//g" ) + END_VERSIONS + """ + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_data.txt + touch ${prefix}_fastqc_data.html + touch ${prefix}_summary.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco: \$( falco --version | sed -e "s/falco v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/falco/meta.yml b/modules/nf-core/falco/meta.yml new file mode 100644 index 0000000..6f77fb1 --- /dev/null +++ b/modules/nf-core/falco/meta.yml @@ -0,0 +1,52 @@ +name: falco +description: Run falco on sequenced reads +keywords: + - quality control + - qc + - adapters + - fastq +tools: + - fastqc: + description: "falco is a drop-in C++ implementation of FastQC to assess the quality of sequence reads." + + homepage: "https://falco.readthedocs.io/" + documentation: "https://falco.readthedocs.io/" + tool_dev_url: "None" + doi: "" + licence: "['GPL v3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - html: + type: file + description: FastQC like report + pattern: "*_{fastqc_report.html}" + - txt: + type: file + description: falco report data + pattern: "*_{data.txt}" + - txt: + type: file + description: falco summary file + pattern: "*_{summary.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@lucacozzuto" diff --git a/nextflow.config b/nextflow.config index efb5aff..b69948e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -59,6 +59,8 @@ params { // Databases databases = null + preprocessing_qc_tool = 'fastqc' + // FASTQ preprocessing perform_shortread_qc = false shortread_qc_tool = 'fastp' diff --git a/nextflow_schema.json b/nextflow_schema.json index f88443f..4479f54 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -707,5 +707,14 @@ { "$ref": "#/definitions/reference_genome_options" } - ] + ], + "properties": { + "preprocessing_qc_tool": { + "type": "string", + "default": "fastqc", + "enum": ["fastqc", "falco"], + "help_text": "Falco is designed as a drop-in replacement for FastQC but written in C++ for faster computation. We particularly recommend using falco when using long reads (due to reduced memory constraints), however is also applicable for short reads.", + "description": "Specify the tool used for quality control of raw sequencing reads" + } + } } diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index ce537e8..3da49ce 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -3,6 +3,8 @@ // include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' +include { FALCO as FALCO_PROCESSED } from '../../modules/nf-core/falco/main' + include { PORECHOP } from '../../modules/nf-core/porechop/main' include { FILTLONG } from '../../modules/nf-core/filtlong/main' @@ -52,8 +54,16 @@ workflow LONGREAD_PREPROCESSING { ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log ) } - FASTQC_PROCESSED ( ch_processed_reads ) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + if (params.preprocessing_qc_tool == 'fastqc') { + FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + + } else if (params.preprocessing_qc_tool == 'falco') { + FALCO_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) + } emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index 859c1d5..f2cd738 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -5,7 +5,8 @@ include { SHORTREAD_FASTP } from './shortread_fastp' include { SHORTREAD_ADAPTERREMOVAL } from './shortread_adapterremoval' -include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' +include { FALCO as FALCO_PROCESSED } from '../../modules/nf-core/falco/main' workflow SHORTREAD_PREPROCESSING { take: @@ -27,9 +28,15 @@ workflow SHORTREAD_PREPROCESSING { ch_processed_reads = reads } - FASTQC_PROCESSED ( ch_processed_reads ) - ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + if (params.preprocessing_qc_tool == 'fastqc') { + FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + } else if (params.preprocessing_qc_tool == 'falco') { + FALCO_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) + } emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 8b9edb7..1ac519c 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -84,6 +84,7 @@ include { STANDARDISATION_PROFILES } from '../subworkflows/local/standardis // MODULE: Installed directly from nf-core/modules // include { FASTQC } from '../modules/nf-core/fastqc/main' +include { FALCO } from '../modules/nf-core/falco/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' @@ -120,12 +121,13 @@ workflow TAXPROFILER { */ ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ) - FASTQC ( - ch_input_for_fastqc - ) - - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - + if ( params.preprocessing_qc_tool == 'falco' ) { + FALCO ( ch_input_for_fastqc ) + ch_versions = ch_versions.mix(FALCO.out.versions.first()) + } else { + FASTQC ( ch_input_for_fastqc ) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + } /* SUBWORKFLOW: PERFORM PREPROCESSING */ @@ -254,7 +256,13 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + + if ( params.preprocessing_qc_tool == 'falco' ) { + ch_multiqc_files = ch_multiqc_files.mix(FALCO.out.txt.collect{it[1]}.ifEmpty([])) + } else { + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + } + if (params.perform_shortread_qc) { ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )