diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5cea5b7..344505a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,6 +23,7 @@ jobs: - "21.10.3" - "latest-everything" parameters: + - "--perform_fastqc_alternative false" - "--perform_longread_qc false" - "--perform_shortread_qc false" - "--shortread_qc_tool fastp" diff --git a/CITATIONS.md b/CITATIONS.md index 1ce4ec2..510c74b 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -62,6 +62,10 @@ - [FILTLONG](https://github.com/rrwick/Filtlong) +- [Falco](https://doi.org/10.12688/f1000research.21142.2) + +> de Sena Brandine G and Smith AD. Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Research 2021, 8:1874 + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index 2bd45a9..e17c7b5 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ On release, automated continuous integration tests run the pipeline on a full-si ![](docs/images/taxprofiler_tube.png) -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) or [`Falco`](https://github.com/smithlabcode/falco) as an alternative option) 2. Performs optional read pre-processing - Adapter clipping and merging (short-read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long-read: [porechop](https://github.com/rrwick/Porechop)) - Low complexity and quality filtering (short-read: [bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus); long-read: [Filtlong](https://github.com/rrwick/Filtlong)) diff --git a/conf/modules.config b/conf/modules.config index d2a0051..1e2058a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -40,6 +40,24 @@ process { ] } + withName: FALCO { + ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } + publishDir = [ + path: { "${params.outdir}/falco/raw" }, + mode: params.publish_dir_mode, + pattern: '*.{html,txt}' + ] + } + + withName: FALCO_PROCESSED { + ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } + publishDir = [ + path: { "${params.outdir}/falco/processed" }, + mode: params.publish_dir_mode, + pattern: '*.{html,txt}' + ] + } + withName: FASTP_SINGLE { ext.args = [ // trimming options diff --git a/docs/usage.md b/docs/usage.md index 9194220..7478636 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -165,7 +165,9 @@ work # Directory containing the nextflow working files .nextflow_log # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` +### Sequencing quality control +nf-core taxprofiler offers [`Falco`](https://github.com/smithlabcode/falco] as an alternative option to [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). ### Preprocessing Steps nf-core/taxprofiler offers four main preprocessing steps @@ -179,7 +181,7 @@ nf-core/taxprofiler offers four main preprocessing steps Raw sequencing read processing in the form of adapter clipping and paired-end read merging can be activated via the `--perform_shortread_qc` or `--perform_longread_qc` flags. -It is highly recommended to run this on raw reads to remove artefacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles. +It is highly recommended to run this on raw reads to remove artifacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles. There are currently two options for short-read preprocessing: `fastp` or `adapterremoval`. diff --git a/modules.json b/modules.json index 10d6c74..aa77c4e 100644 --- a/modules.json +++ b/modules.json @@ -49,6 +49,10 @@ "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" }, + "falco": { + "branch": "master", + "git_sha": "fc959214036403ad83efe7a41d43d0606c445cda" + }, "fastp": { "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" diff --git a/modules/nf-core/falco/main.nf b/modules/nf-core/falco/main.nf new file mode 100644 index 0000000..1688162 --- /dev/null +++ b/modules/nf-core/falco/main.nf @@ -0,0 +1,57 @@ +process FALCO { + tag "$meta.id" + label 'process_single' + + + conda (params.enable_conda ? "bioconda::falco=1.2.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/falco:1.2.1--h867801b_3': + 'quay.io/biocontainers/falco:1.2.1--h867801b_3' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.html"), emit: html + tuple val(meta), path("*.txt") , emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ( reads.toList().size() == 1 ) { + """ + falco $args --threads $task.cpus ${reads} -D ${prefix}_data.txt -S ${prefix}_summary.txt -R ${prefix}_report.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco:\$( falco --version | sed -e "s/falco//g" ) + END_VERSIONS + """ + } else { + """ + falco $args --threads $task.cpus ${reads} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco:\$( falco --version | sed -e "s/falco//g" ) + END_VERSIONS + """ + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_data.txt + touch ${prefix}_fastqc_data.html + touch ${prefix}_summary.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco: \$( falco --version | sed -e "s/falco v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/falco/meta.yml b/modules/nf-core/falco/meta.yml new file mode 100644 index 0000000..6f77fb1 --- /dev/null +++ b/modules/nf-core/falco/meta.yml @@ -0,0 +1,52 @@ +name: falco +description: Run falco on sequenced reads +keywords: + - quality control + - qc + - adapters + - fastq +tools: + - fastqc: + description: "falco is a drop-in C++ implementation of FastQC to assess the quality of sequence reads." + + homepage: "https://falco.readthedocs.io/" + documentation: "https://falco.readthedocs.io/" + tool_dev_url: "None" + doi: "" + licence: "['GPL v3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - html: + type: file + description: FastQC like report + pattern: "*_{fastqc_report.html}" + - txt: + type: file + description: falco report data + pattern: "*_{data.txt}" + - txt: + type: file + description: falco summary file + pattern: "*_{summary.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@lucacozzuto" diff --git a/nextflow_schema.json b/nextflow_schema.json index f88443f..f48ed4a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,11 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir", "databases"], + "required": [ + "input", + "outdir", + "databases" + ], "properties": { "input": { "type": "string", @@ -80,7 +84,10 @@ "shortread_qc_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"], + "enum": [ + "fastp", + "adapterremoval" + ], "fa_icon": "fas fa-tools", "description": "Specify which tool to use for short-read QC" }, @@ -133,7 +140,11 @@ "shortread_complexityfilter_tool": { "type": "string", "default": "bbduk", - "enum": ["bbduk", "prinseqplusplus", "fastp"], + "enum": [ + "bbduk", + "prinseqplusplus", + "fastp" + ], "fa_icon": "fas fa-hammer", "description": "Specify which tool to use for complexity filtering" }, @@ -167,7 +178,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"], + "enum": [ + "entropy", + "dust" + ], "fa_icon": "fas fa-check-square", "description": "Specify the complexity filter mode for PRINSEQ++" }, @@ -341,7 +355,15 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], + "enum": [ + "blast", + "xml", + "txt", + "daa", + "sam", + "tsv", + "paf" + ], "fa_icon": "fas fa-file", "description": "Specify output format from DIAMOND profiling.", "help_text": "DIAMOND can produce output in a number of different formats, you can specify here which to produce.\n\nNote that DIAMOND can only produce one format at a time, and depending on which you pick, some downstream steps may not be executed. For example, selecting `daa` or `sam` will mean you will not get a tabular taxonomic profile as with the other tools.\n\nWill be overriden by `--diamond_save_reads.`\n\n> Modifies tool parameter(s):\n> - diamond blastx: `--outfmt`" @@ -360,7 +382,14 @@ "kaiju_taxon_rank": { "type": "string", "default": "species", - "enum": ["phylum", "class", "order", "family", "genus", "species"], + "enum": [ + "phylum", + "class", + "order", + "family", + "genus", + "species" + ], "fa_icon": "fas fa-tag", "description": "Specify taxonomic rank to be displayed in Kaiju taxon table", "help_text": "Specify the taxonomic level(s) to be displayed in the resulting Kaiju taxon table, as generated by the kaiju2table helper tool.\n\nThis can be either a single level (e.g. `species`), or a comma separated list to display the full taxonomic path (e.g. `superkingdom,phylum,class,order,family,genus,species.`).\n\n> Modifies tool parameter(s):\n> - kaiju2table: `-l`" @@ -555,7 +584,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index ce537e8..cf0d168 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -3,6 +3,8 @@ // include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' +include { FALCO as FALCO_PROCESSED } from '../../modules/nf-core/falco/main' + include { PORECHOP } from '../../modules/nf-core/porechop/main' include { FILTLONG } from '../../modules/nf-core/filtlong/main' @@ -52,8 +54,14 @@ workflow LONGREAD_PREPROCESSING { ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log ) } - FASTQC_PROCESSED ( ch_processed_reads ) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + if (params.perform_fastqc_alternative) { + FALCO_PROCESSED ( ch_processed_reads ) + ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) + + } else { + FASTQC_PROCESSED ( ch_processed_reads ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + } emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index 859c1d5..a67539a 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -5,7 +5,8 @@ include { SHORTREAD_FASTP } from './shortread_fastp' include { SHORTREAD_ADAPTERREMOVAL } from './shortread_adapterremoval' -include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' +include { FALCO as FALCO_PROCESSED } from '../../modules/nf-core/falco/main' workflow SHORTREAD_PREPROCESSING { take: @@ -27,9 +28,16 @@ workflow SHORTREAD_PREPROCESSING { ch_processed_reads = reads } - FASTQC_PROCESSED ( ch_processed_reads ) - ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + if (params.perform_fastqc_alternative) { + FALCO_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) + + } else { + FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + } emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 8b9edb7..2435782 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -84,6 +84,7 @@ include { STANDARDISATION_PROFILES } from '../subworkflows/local/standardis // MODULE: Installed directly from nf-core/modules // include { FASTQC } from '../modules/nf-core/fastqc/main' +include { FALCO } from '../modules/nf-core/falco/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' @@ -120,12 +121,13 @@ workflow TAXPROFILER { */ ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ) - FASTQC ( - ch_input_for_fastqc - ) - - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - + if ( params.perform_fastqc_alternative ) { + FALCO ( ch_input_for_fastqc ) + ch_versions = ch_versions.mix(FALCO.out.versions.first()) + } else { + FASTQC ( ch_input_for_fastqc ) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + } /* SUBWORKFLOW: PERFORM PREPROCESSING */ @@ -254,7 +256,10 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + + if (!params.perform_fastqc_alternative) { + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + } if (params.perform_shortread_qc) { ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )