From 8eddb32b8807db582ca7fd1e31074c50788162ba Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Tue, 18 Oct 2022 17:43:16 +0200 Subject: [PATCH 1/9] Add Falco as an alternative to FastQC --- .github/workflows/ci.yml | 1 + CITATIONS.md | 4 ++ README.md | 2 +- conf/modules.config | 18 ++++++ docs/usage.md | 4 +- modules.json | 4 ++ modules/nf-core/falco/main.nf | 57 +++++++++++++++++++ modules/nf-core/falco/meta.yml | 52 +++++++++++++++++ nextflow_schema.json | 50 +++++++++++++--- subworkflows/local/longread_preprocessing.nf | 12 +++- subworkflows/local/shortread_preprocessing.nf | 16 ++++-- workflows/taxprofiler.nf | 19 ++++--- 12 files changed, 217 insertions(+), 22 deletions(-) create mode 100644 modules/nf-core/falco/main.nf create mode 100644 modules/nf-core/falco/meta.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5cea5b7..344505a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,6 +23,7 @@ jobs: - "21.10.3" - "latest-everything" parameters: + - "--perform_fastqc_alternative false" - "--perform_longread_qc false" - "--perform_shortread_qc false" - "--shortread_qc_tool fastp" diff --git a/CITATIONS.md b/CITATIONS.md index 1ce4ec2..510c74b 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -62,6 +62,10 @@ - [FILTLONG](https://github.com/rrwick/Filtlong) +- [Falco](https://doi.org/10.12688/f1000research.21142.2) + +> de Sena Brandine G and Smith AD. Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Research 2021, 8:1874 + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index 2bd45a9..e17c7b5 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ On release, automated continuous integration tests run the pipeline on a full-si ![](docs/images/taxprofiler_tube.png) -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) or [`Falco`](https://github.com/smithlabcode/falco) as an alternative option) 2. Performs optional read pre-processing - Adapter clipping and merging (short-read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long-read: [porechop](https://github.com/rrwick/Porechop)) - Low complexity and quality filtering (short-read: [bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus); long-read: [Filtlong](https://github.com/rrwick/Filtlong)) diff --git a/conf/modules.config b/conf/modules.config index d2a0051..1e2058a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -40,6 +40,24 @@ process { ] } + withName: FALCO { + ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } + publishDir = [ + path: { "${params.outdir}/falco/raw" }, + mode: params.publish_dir_mode, + pattern: '*.{html,txt}' + ] + } + + withName: FALCO_PROCESSED { + ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } + publishDir = [ + path: { "${params.outdir}/falco/processed" }, + mode: params.publish_dir_mode, + pattern: '*.{html,txt}' + ] + } + withName: FASTP_SINGLE { ext.args = [ // trimming options diff --git a/docs/usage.md b/docs/usage.md index 9194220..7478636 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -165,7 +165,9 @@ work # Directory containing the nextflow working files .nextflow_log # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` +### Sequencing quality control +nf-core taxprofiler offers [`Falco`](https://github.com/smithlabcode/falco] as an alternative option to [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). ### Preprocessing Steps nf-core/taxprofiler offers four main preprocessing steps @@ -179,7 +181,7 @@ nf-core/taxprofiler offers four main preprocessing steps Raw sequencing read processing in the form of adapter clipping and paired-end read merging can be activated via the `--perform_shortread_qc` or `--perform_longread_qc` flags. -It is highly recommended to run this on raw reads to remove artefacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles. +It is highly recommended to run this on raw reads to remove artifacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles. There are currently two options for short-read preprocessing: `fastp` or `adapterremoval`. diff --git a/modules.json b/modules.json index 10d6c74..aa77c4e 100644 --- a/modules.json +++ b/modules.json @@ -49,6 +49,10 @@ "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" }, + "falco": { + "branch": "master", + "git_sha": "fc959214036403ad83efe7a41d43d0606c445cda" + }, "fastp": { "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" diff --git a/modules/nf-core/falco/main.nf b/modules/nf-core/falco/main.nf new file mode 100644 index 0000000..1688162 --- /dev/null +++ b/modules/nf-core/falco/main.nf @@ -0,0 +1,57 @@ +process FALCO { + tag "$meta.id" + label 'process_single' + + + conda (params.enable_conda ? "bioconda::falco=1.2.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/falco:1.2.1--h867801b_3': + 'quay.io/biocontainers/falco:1.2.1--h867801b_3' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.html"), emit: html + tuple val(meta), path("*.txt") , emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ( reads.toList().size() == 1 ) { + """ + falco $args --threads $task.cpus ${reads} -D ${prefix}_data.txt -S ${prefix}_summary.txt -R ${prefix}_report.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco:\$( falco --version | sed -e "s/falco//g" ) + END_VERSIONS + """ + } else { + """ + falco $args --threads $task.cpus ${reads} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco:\$( falco --version | sed -e "s/falco//g" ) + END_VERSIONS + """ + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_data.txt + touch ${prefix}_fastqc_data.html + touch ${prefix}_summary.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco: \$( falco --version | sed -e "s/falco v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/falco/meta.yml b/modules/nf-core/falco/meta.yml new file mode 100644 index 0000000..6f77fb1 --- /dev/null +++ b/modules/nf-core/falco/meta.yml @@ -0,0 +1,52 @@ +name: falco +description: Run falco on sequenced reads +keywords: + - quality control + - qc + - adapters + - fastq +tools: + - fastqc: + description: "falco is a drop-in C++ implementation of FastQC to assess the quality of sequence reads." + + homepage: "https://falco.readthedocs.io/" + documentation: "https://falco.readthedocs.io/" + tool_dev_url: "None" + doi: "" + licence: "['GPL v3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - html: + type: file + description: FastQC like report + pattern: "*_{fastqc_report.html}" + - txt: + type: file + description: falco report data + pattern: "*_{data.txt}" + - txt: + type: file + description: falco summary file + pattern: "*_{summary.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@lucacozzuto" diff --git a/nextflow_schema.json b/nextflow_schema.json index f88443f..f48ed4a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,11 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir", "databases"], + "required": [ + "input", + "outdir", + "databases" + ], "properties": { "input": { "type": "string", @@ -80,7 +84,10 @@ "shortread_qc_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"], + "enum": [ + "fastp", + "adapterremoval" + ], "fa_icon": "fas fa-tools", "description": "Specify which tool to use for short-read QC" }, @@ -133,7 +140,11 @@ "shortread_complexityfilter_tool": { "type": "string", "default": "bbduk", - "enum": ["bbduk", "prinseqplusplus", "fastp"], + "enum": [ + "bbduk", + "prinseqplusplus", + "fastp" + ], "fa_icon": "fas fa-hammer", "description": "Specify which tool to use for complexity filtering" }, @@ -167,7 +178,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"], + "enum": [ + "entropy", + "dust" + ], "fa_icon": "fas fa-check-square", "description": "Specify the complexity filter mode for PRINSEQ++" }, @@ -341,7 +355,15 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], + "enum": [ + "blast", + "xml", + "txt", + "daa", + "sam", + "tsv", + "paf" + ], "fa_icon": "fas fa-file", "description": "Specify output format from DIAMOND profiling.", "help_text": "DIAMOND can produce output in a number of different formats, you can specify here which to produce.\n\nNote that DIAMOND can only produce one format at a time, and depending on which you pick, some downstream steps may not be executed. For example, selecting `daa` or `sam` will mean you will not get a tabular taxonomic profile as with the other tools.\n\nWill be overriden by `--diamond_save_reads.`\n\n> Modifies tool parameter(s):\n> - diamond blastx: `--outfmt`" @@ -360,7 +382,14 @@ "kaiju_taxon_rank": { "type": "string", "default": "species", - "enum": ["phylum", "class", "order", "family", "genus", "species"], + "enum": [ + "phylum", + "class", + "order", + "family", + "genus", + "species" + ], "fa_icon": "fas fa-tag", "description": "Specify taxonomic rank to be displayed in Kaiju taxon table", "help_text": "Specify the taxonomic level(s) to be displayed in the resulting Kaiju taxon table, as generated by the kaiju2table helper tool.\n\nThis can be either a single level (e.g. `species`), or a comma separated list to display the full taxonomic path (e.g. `superkingdom,phylum,class,order,family,genus,species.`).\n\n> Modifies tool parameter(s):\n> - kaiju2table: `-l`" @@ -555,7 +584,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index ce537e8..cf0d168 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -3,6 +3,8 @@ // include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' +include { FALCO as FALCO_PROCESSED } from '../../modules/nf-core/falco/main' + include { PORECHOP } from '../../modules/nf-core/porechop/main' include { FILTLONG } from '../../modules/nf-core/filtlong/main' @@ -52,8 +54,14 @@ workflow LONGREAD_PREPROCESSING { ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log ) } - FASTQC_PROCESSED ( ch_processed_reads ) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + if (params.perform_fastqc_alternative) { + FALCO_PROCESSED ( ch_processed_reads ) + ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) + + } else { + FASTQC_PROCESSED ( ch_processed_reads ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + } emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index 859c1d5..a67539a 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -5,7 +5,8 @@ include { SHORTREAD_FASTP } from './shortread_fastp' include { SHORTREAD_ADAPTERREMOVAL } from './shortread_adapterremoval' -include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' +include { FALCO as FALCO_PROCESSED } from '../../modules/nf-core/falco/main' workflow SHORTREAD_PREPROCESSING { take: @@ -27,9 +28,16 @@ workflow SHORTREAD_PREPROCESSING { ch_processed_reads = reads } - FASTQC_PROCESSED ( ch_processed_reads ) - ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + if (params.perform_fastqc_alternative) { + FALCO_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) + + } else { + FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + } emit: reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 8b9edb7..2435782 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -84,6 +84,7 @@ include { STANDARDISATION_PROFILES } from '../subworkflows/local/standardis // MODULE: Installed directly from nf-core/modules // include { FASTQC } from '../modules/nf-core/fastqc/main' +include { FALCO } from '../modules/nf-core/falco/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' @@ -120,12 +121,13 @@ workflow TAXPROFILER { */ ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ) - FASTQC ( - ch_input_for_fastqc - ) - - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - + if ( params.perform_fastqc_alternative ) { + FALCO ( ch_input_for_fastqc ) + ch_versions = ch_versions.mix(FALCO.out.versions.first()) + } else { + FASTQC ( ch_input_for_fastqc ) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + } /* SUBWORKFLOW: PERFORM PREPROCESSING */ @@ -254,7 +256,10 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + + if (!params.perform_fastqc_alternative) { + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + } if (params.perform_shortread_qc) { ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) From 0481bb9449a1d35a8fe8324811180ef68f0d361f Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Tue, 18 Oct 2022 17:48:40 +0200 Subject: [PATCH 2/9] Prettier --- docs/usage.md | 2 ++ nextflow_schema.json | 50 +++++++------------------------------------- 2 files changed, 9 insertions(+), 43 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 7478636..4a44cf4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -165,9 +165,11 @@ work # Directory containing the nextflow working files .nextflow_log # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` + ### Sequencing quality control nf-core taxprofiler offers [`Falco`](https://github.com/smithlabcode/falco] as an alternative option to [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). + ### Preprocessing Steps nf-core/taxprofiler offers four main preprocessing steps diff --git a/nextflow_schema.json b/nextflow_schema.json index f48ed4a..f88443f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,11 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir", - "databases" - ], + "required": ["input", "outdir", "databases"], "properties": { "input": { "type": "string", @@ -84,10 +80,7 @@ "shortread_qc_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ], + "enum": ["fastp", "adapterremoval"], "fa_icon": "fas fa-tools", "description": "Specify which tool to use for short-read QC" }, @@ -140,11 +133,7 @@ "shortread_complexityfilter_tool": { "type": "string", "default": "bbduk", - "enum": [ - "bbduk", - "prinseqplusplus", - "fastp" - ], + "enum": ["bbduk", "prinseqplusplus", "fastp"], "fa_icon": "fas fa-hammer", "description": "Specify which tool to use for complexity filtering" }, @@ -178,10 +167,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ], + "enum": ["entropy", "dust"], "fa_icon": "fas fa-check-square", "description": "Specify the complexity filter mode for PRINSEQ++" }, @@ -355,15 +341,7 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": [ - "blast", - "xml", - "txt", - "daa", - "sam", - "tsv", - "paf" - ], + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], "fa_icon": "fas fa-file", "description": "Specify output format from DIAMOND profiling.", "help_text": "DIAMOND can produce output in a number of different formats, you can specify here which to produce.\n\nNote that DIAMOND can only produce one format at a time, and depending on which you pick, some downstream steps may not be executed. For example, selecting `daa` or `sam` will mean you will not get a tabular taxonomic profile as with the other tools.\n\nWill be overriden by `--diamond_save_reads.`\n\n> Modifies tool parameter(s):\n> - diamond blastx: `--outfmt`" @@ -382,14 +360,7 @@ "kaiju_taxon_rank": { "type": "string", "default": "species", - "enum": [ - "phylum", - "class", - "order", - "family", - "genus", - "species" - ], + "enum": ["phylum", "class", "order", "family", "genus", "species"], "fa_icon": "fas fa-tag", "description": "Specify taxonomic rank to be displayed in Kaiju taxon table", "help_text": "Specify the taxonomic level(s) to be displayed in the resulting Kaiju taxon table, as generated by the kaiju2table helper tool.\n\nThis can be either a single level (e.g. `species`), or a comma separated list to display the full taxonomic path (e.g. `superkingdom,phylum,class,order,family,genus,species.`).\n\n> Modifies tool parameter(s):\n> - kaiju2table: `-l`" @@ -584,14 +555,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { From 7881ba9aefdfd541d49d412edc683728bd82a97a Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Tue, 18 Oct 2022 17:50:49 +0200 Subject: [PATCH 3/9] Fix the amount of left-padding spaces --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 1e2058a..b1d7b87 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -41,7 +41,7 @@ process { } withName: FALCO { - ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } + ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } publishDir = [ path: { "${params.outdir}/falco/raw" }, mode: params.publish_dir_mode, From 511cb16a6119fe746950abc5fed4100517a41775 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Thu, 20 Oct 2022 16:55:57 +0200 Subject: [PATCH 4/9] Apply review suggestions --- .github/workflows/ci.yml | 3 +- CITATIONS.md | 2 +- README.md | 2 +- nextflow.config | 2 + nextflow_schema.json | 62 ++++++++++++++++--- subworkflows/local/longread_preprocessing.nf | 10 +-- subworkflows/local/shortread_preprocessing.nf | 11 ++-- workflows/taxprofiler.nf | 7 ++- 8 files changed, 75 insertions(+), 24 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 344505a..12185fe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,8 @@ jobs: - "21.10.3" - "latest-everything" parameters: - - "--perform_fastqc_alternative false" + - "--preprocessing_qc_tool fastqc" + - "--preprocessing_qc_tool falco" - "--perform_longread_qc false" - "--perform_shortread_qc false" - "--shortread_qc_tool fastp" diff --git a/CITATIONS.md b/CITATIONS.md index 510c74b..599e049 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -62,7 +62,7 @@ - [FILTLONG](https://github.com/rrwick/Filtlong) -- [Falco](https://doi.org/10.12688/f1000research.21142.2) +- [falco](https://doi.org/10.12688/f1000research.21142.2) > de Sena Brandine G and Smith AD. Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Research 2021, 8:1874 diff --git a/README.md b/README.md index e17c7b5..3f0a840 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ On release, automated continuous integration tests run the pipeline on a full-si ![](docs/images/taxprofiler_tube.png) -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) or [`Falco`](https://github.com/smithlabcode/falco) as an alternative option) +1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) or [`falco`](https://github.com/smithlabcode/falco) as an alternative option) 2. Performs optional read pre-processing - Adapter clipping and merging (short-read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long-read: [porechop](https://github.com/rrwick/Porechop)) - Low complexity and quality filtering (short-read: [bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus); long-read: [Filtlong](https://github.com/rrwick/Filtlong)) diff --git a/nextflow.config b/nextflow.config index efb5aff..b69948e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -59,6 +59,8 @@ params { // Databases databases = null + preprocessing_qc_tool = 'fastqc' + // FASTQ preprocessing perform_shortread_qc = false shortread_qc_tool = 'fastp' diff --git a/nextflow_schema.json b/nextflow_schema.json index f88443f..055c368 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,11 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir", "databases"], + "required": [ + "input", + "outdir", + "databases" + ], "properties": { "input": { "type": "string", @@ -80,7 +84,10 @@ "shortread_qc_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"], + "enum": [ + "fastp", + "adapterremoval" + ], "fa_icon": "fas fa-tools", "description": "Specify which tool to use for short-read QC" }, @@ -133,7 +140,11 @@ "shortread_complexityfilter_tool": { "type": "string", "default": "bbduk", - "enum": ["bbduk", "prinseqplusplus", "fastp"], + "enum": [ + "bbduk", + "prinseqplusplus", + "fastp" + ], "fa_icon": "fas fa-hammer", "description": "Specify which tool to use for complexity filtering" }, @@ -167,7 +178,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"], + "enum": [ + "entropy", + "dust" + ], "fa_icon": "fas fa-check-square", "description": "Specify the complexity filter mode for PRINSEQ++" }, @@ -341,7 +355,15 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], + "enum": [ + "blast", + "xml", + "txt", + "daa", + "sam", + "tsv", + "paf" + ], "fa_icon": "fas fa-file", "description": "Specify output format from DIAMOND profiling.", "help_text": "DIAMOND can produce output in a number of different formats, you can specify here which to produce.\n\nNote that DIAMOND can only produce one format at a time, and depending on which you pick, some downstream steps may not be executed. For example, selecting `daa` or `sam` will mean you will not get a tabular taxonomic profile as with the other tools.\n\nWill be overriden by `--diamond_save_reads.`\n\n> Modifies tool parameter(s):\n> - diamond blastx: `--outfmt`" @@ -360,7 +382,14 @@ "kaiju_taxon_rank": { "type": "string", "default": "species", - "enum": ["phylum", "class", "order", "family", "genus", "species"], + "enum": [ + "phylum", + "class", + "order", + "family", + "genus", + "species" + ], "fa_icon": "fas fa-tag", "description": "Specify taxonomic rank to be displayed in Kaiju taxon table", "help_text": "Specify the taxonomic level(s) to be displayed in the resulting Kaiju taxon table, as generated by the kaiju2table helper tool.\n\nThis can be either a single level (e.g. `species`), or a comma separated list to display the full taxonomic path (e.g. `superkingdom,phylum,class,order,family,genus,species.`).\n\n> Modifies tool parameter(s):\n> - kaiju2table: `-l`" @@ -555,7 +584,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -707,5 +743,15 @@ { "$ref": "#/definitions/reference_genome_options" } - ] + ], + "properties": { + "preprocessing_qc_tool": { + "type": "string", + "default": "fastqc", + "enum": [ + "fastqc", + "falco" + ] + } + } } diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index cf0d168..d16db3f 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -54,13 +54,13 @@ workflow LONGREAD_PREPROCESSING { ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log ) } - if (params.perform_fastqc_alternative) { - FALCO_PROCESSED ( ch_processed_reads ) - ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) - - } else { + if (params.preprocessing_qc_tool == 'fastqc') { FASTQC_PROCESSED ( ch_processed_reads ) ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + + } else if (params.preprocessing_qc_tool == 'falco') { + FALCO_PROCESSED ( ch_processed_reads ) + ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) } emit: diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf index a67539a..f2cd738 100644 --- a/subworkflows/local/shortread_preprocessing.nf +++ b/subworkflows/local/shortread_preprocessing.nf @@ -28,15 +28,14 @@ workflow SHORTREAD_PREPROCESSING { ch_processed_reads = reads } - if (params.perform_fastqc_alternative) { - FALCO_PROCESSED ( ch_processed_reads ) - ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions ) - ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) - - } else { + if (params.preprocessing_qc_tool == 'fastqc') { FASTQC_PROCESSED ( ch_processed_reads ) ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + } else if (params.preprocessing_qc_tool == 'falco') { + FALCO_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) } emit: diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 2435782..905cbd9 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -121,7 +121,7 @@ workflow TAXPROFILER { */ ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ) - if ( params.perform_fastqc_alternative ) { + if ( params.preprocessing_qc_tool == 'falco' ) { FALCO ( ch_input_for_fastqc ) ch_versions = ch_versions.mix(FALCO.out.versions.first()) } else { @@ -257,9 +257,12 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - if (!params.perform_fastqc_alternative) { + if (!params.preprocessing_qc_tool == 'falco') { ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) } + else { + ch_multiqc_files = ch_multiqc_files.mix(FALCO.out.txt.collect{it[1]}.ifEmpty([])) + } if (params.perform_shortread_qc) { ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) From 19231538daac4365596f9e827680fb007df53079 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Thu, 20 Oct 2022 16:59:32 +0200 Subject: [PATCH 5/9] Prettier --- nextflow_schema.json | 55 +++++++------------------------------------- 1 file changed, 8 insertions(+), 47 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 055c368..ff2505f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,11 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir", - "databases" - ], + "required": ["input", "outdir", "databases"], "properties": { "input": { "type": "string", @@ -84,10 +80,7 @@ "shortread_qc_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ], + "enum": ["fastp", "adapterremoval"], "fa_icon": "fas fa-tools", "description": "Specify which tool to use for short-read QC" }, @@ -140,11 +133,7 @@ "shortread_complexityfilter_tool": { "type": "string", "default": "bbduk", - "enum": [ - "bbduk", - "prinseqplusplus", - "fastp" - ], + "enum": ["bbduk", "prinseqplusplus", "fastp"], "fa_icon": "fas fa-hammer", "description": "Specify which tool to use for complexity filtering" }, @@ -178,10 +167,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ], + "enum": ["entropy", "dust"], "fa_icon": "fas fa-check-square", "description": "Specify the complexity filter mode for PRINSEQ++" }, @@ -355,15 +341,7 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": [ - "blast", - "xml", - "txt", - "daa", - "sam", - "tsv", - "paf" - ], + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], "fa_icon": "fas fa-file", "description": "Specify output format from DIAMOND profiling.", "help_text": "DIAMOND can produce output in a number of different formats, you can specify here which to produce.\n\nNote that DIAMOND can only produce one format at a time, and depending on which you pick, some downstream steps may not be executed. For example, selecting `daa` or `sam` will mean you will not get a tabular taxonomic profile as with the other tools.\n\nWill be overriden by `--diamond_save_reads.`\n\n> Modifies tool parameter(s):\n> - diamond blastx: `--outfmt`" @@ -382,14 +360,7 @@ "kaiju_taxon_rank": { "type": "string", "default": "species", - "enum": [ - "phylum", - "class", - "order", - "family", - "genus", - "species" - ], + "enum": ["phylum", "class", "order", "family", "genus", "species"], "fa_icon": "fas fa-tag", "description": "Specify taxonomic rank to be displayed in Kaiju taxon table", "help_text": "Specify the taxonomic level(s) to be displayed in the resulting Kaiju taxon table, as generated by the kaiju2table helper tool.\n\nThis can be either a single level (e.g. `species`), or a comma separated list to display the full taxonomic path (e.g. `superkingdom,phylum,class,order,family,genus,species.`).\n\n> Modifies tool parameter(s):\n> - kaiju2table: `-l`" @@ -584,14 +555,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -748,10 +712,7 @@ "preprocessing_qc_tool": { "type": "string", "default": "fastqc", - "enum": [ - "fastqc", - "falco" - ] + "enum": ["fastqc", "falco"] } } } From d3d4297ca932ff1b33cc60827995af673ae832f5 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Mon, 24 Oct 2022 16:28:07 +0200 Subject: [PATCH 6/9] Apply review suggestions --- workflows/taxprofiler.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 905cbd9..1ac519c 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -257,12 +257,12 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - if (!params.preprocessing_qc_tool == 'falco') { + if ( params.preprocessing_qc_tool == 'falco' ) { + ch_multiqc_files = ch_multiqc_files.mix(FALCO.out.txt.collect{it[1]}.ifEmpty([])) + } else { ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) } - else { - ch_multiqc_files = ch_multiqc_files.mix(FALCO.out.txt.collect{it[1]}.ifEmpty([])) - } + if (params.perform_shortread_qc) { ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) From 8c3e80b6ea85b316b5f476c016a068180a09ffdd Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Tue, 25 Oct 2022 11:13:26 +0200 Subject: [PATCH 7/9] Add helptest to nextflow_schema,add versions to longread_preprocessing --- .github/workflows/ci.yml | 1 - docs/usage.md | 2 +- nextflow_schema.json | 4 +++- subworkflows/local/longread_preprocessing.nf | 2 ++ 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 12185fe..862e072 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,6 @@ jobs: - "21.10.3" - "latest-everything" parameters: - - "--preprocessing_qc_tool fastqc" - "--preprocessing_qc_tool falco" - "--perform_longread_qc false" - "--perform_shortread_qc false" diff --git a/docs/usage.md b/docs/usage.md index 4a44cf4..32fec4d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -168,7 +168,7 @@ work # Directory containing the nextflow working files ### Sequencing quality control -nf-core taxprofiler offers [`Falco`](https://github.com/smithlabcode/falco] as an alternative option to [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). +nf-core taxprofiler offers [`falco`](https://github.com/smithlabcode/falco] as an alternative option to [`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). ### Preprocessing Steps diff --git a/nextflow_schema.json b/nextflow_schema.json index ff2505f..aec2b3f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -712,7 +712,9 @@ "preprocessing_qc_tool": { "type": "string", "default": "fastqc", - "enum": ["fastqc", "falco"] + "enum": ["fastqc", "falco"], + "help_text": "Falco is aimed at being a drop-in replacement for FastQC but written in C++ for faster computation. We particularly recommend using falco when using long reads (due to reduced memory constraints), however is also applicable for short reads.", + "description": "By default, the pipeline utilizes FastQC tool for quality control of raw sequencing reads" } } } diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf index d16db3f..3da49ce 100644 --- a/subworkflows/local/longread_preprocessing.nf +++ b/subworkflows/local/longread_preprocessing.nf @@ -56,10 +56,12 @@ workflow LONGREAD_PREPROCESSING { if (params.preprocessing_qc_tool == 'fastqc') { FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) } else if (params.preprocessing_qc_tool == 'falco') { FALCO_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) } From 7b1a53b06267ecb672a785b7c72a4e6f3cf60b26 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli <91951607+sofstam@users.noreply.github.com> Date: Tue, 25 Oct 2022 11:19:08 +0200 Subject: [PATCH 8/9] Update nextflow_schema.json Co-authored-by: James A. Fellows Yates --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index aec2b3f..225e05d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -713,7 +713,7 @@ "type": "string", "default": "fastqc", "enum": ["fastqc", "falco"], - "help_text": "Falco is aimed at being a drop-in replacement for FastQC but written in C++ for faster computation. We particularly recommend using falco when using long reads (due to reduced memory constraints), however is also applicable for short reads.", + "help_text": "Falco is designed as a drop-in replacement for FastQC but written in C++ for faster computation. We particularly recommend using falco when using long reads (due to reduced memory constraints), however is also applicable for short reads.", "description": "By default, the pipeline utilizes FastQC tool for quality control of raw sequencing reads" } } From de6816ec490cce6ba93c86a557c1800acf6ff5d9 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli <91951607+sofstam@users.noreply.github.com> Date: Tue, 25 Oct 2022 11:19:16 +0200 Subject: [PATCH 9/9] Update nextflow_schema.json Co-authored-by: James A. Fellows Yates --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 225e05d..4479f54 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -714,7 +714,7 @@ "default": "fastqc", "enum": ["fastqc", "falco"], "help_text": "Falco is designed as a drop-in replacement for FastQC but written in C++ for faster computation. We particularly recommend using falco when using long reads (due to reduced memory constraints), however is also applicable for short reads.", - "description": "By default, the pipeline utilizes FastQC tool for quality control of raw sequencing reads" + "description": "Specify the tool used for quality control of raw sequencing reads" } } }