From 3ff54e620e9b9212a3bad5c687769b8c37e5b89d Mon Sep 17 00:00:00 2001 From: sofstam Date: Thu, 24 Mar 2022 12:51:45 +0100 Subject: [PATCH 01/19] Add centrifuge classification --- conf/modules.config | 10 +++ conf/test.config | 1 + modules.json | 5 +- modules/nf-core/modules/centrifuge/main.nf | 63 ++++++++++++++++++ modules/nf-core/modules/centrifuge/meta.yml | 73 +++++++++++++++++++++ nextflow.config | 8 ++- subworkflows/local/db_check.nf | 2 +- subworkflows/local/input_check.nf | 3 +- workflows/taxprofiler.nf | 22 +++++-- 9 files changed, 179 insertions(+), 8 deletions(-) create mode 100644 modules/nf-core/modules/centrifuge/main.nf create mode 100644 modules/nf-core/modules/centrifuge/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 29a5135..20e6bba 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -121,4 +121,14 @@ process { ] } + withName: CENTRIFUGE { + publishDir = [ + path: { "${params.outdir}/centrifuge/${meta.db_name}" }, + mode: 'copy', + pattern: '*.{fastq.gz,txt}' + ] + ext.args = { "${meta.db_params}" } + ext.prefix = { "${meta.id}-${meta.db_name}" } + } + } diff --git a/conf/test.config b/conf/test.config index 42d8de6..6fca9c0 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,5 +29,6 @@ params { run_kraken2 = true run_malt = true shortread_clipmerge = true + run_centrifuge = true } diff --git a/modules.json b/modules.json index 673a69b..b9dfc87 100644 --- a/modules.json +++ b/modules.json @@ -29,6 +29,9 @@ "porechop": { "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046" } + "centrifuge": { + "git_sha": "ea41a8a6f761b9993d857570e872abaae3fea555" + } } } -} \ No newline at end of file +} diff --git a/modules/nf-core/modules/centrifuge/main.nf b/modules/nf-core/modules/centrifuge/main.nf new file mode 100644 index 0000000..7eb566d --- /dev/null +++ b/modules/nf-core/modules/centrifuge/main.nf @@ -0,0 +1,63 @@ +process CENTRIFUGE { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? 
"bioconda::centrifuge=1.0.4_beta" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' : + 'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }" + + input: + tuple val(meta), path(reads) + path db + val save_unaligned + val save_aligned + val sam_format + + output: + tuple val(meta), path('*report.txt') , emit: report + tuple val(meta), path('*results.txt') , emit: results + tuple val(meta), path('*kreport.txt') , emit: kreport + tuple val(meta), path('*.sam') , optional: true, emit: sam + tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_mapped + tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + def db_name = db.toString().replace(".tar.gz","") + def unaligned = '' + def aligned = '' + if (meta.single_end) { + unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : '' + } else { + unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : '' + } + def sam_output = sam_format ? 
"--out-fmt 'sam'" : '' + """ + tar -xf $db + centrifuge \\ + -x $db_name \\ + -p $task.cpus \\ + $paired \\ + --report-file ${prefix}.report.txt \\ + -S ${prefix}.results.txt \\ + $unaligned \\ + $aligned \\ + $sam_output \\ + $args + centrifuge-kreport -x $db_name ${prefix}.results.txt > ${prefix}.kreport.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/centrifuge/meta.yml b/modules/nf-core/modules/centrifuge/meta.yml new file mode 100644 index 0000000..3adf0e2 --- /dev/null +++ b/modules/nf-core/modules/centrifuge/meta.yml @@ -0,0 +1,73 @@ +name: centrifuge +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - centrifuge: + description: Centrifuge is a classifier for metagenomic sequences. + homepage: https://ccb.jhu.edu/software/centrifuge/ + documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml + doi: 10.1101/gr.210641.116 + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Centrifuge database in .tar.gz format + pattern: "*.tar.gz" + - save_unaligned: + type: value + description: If true unmapped fastq files are saved + - save_aligned: + type: value + description: If true mapped fastq files are saved +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - report: + type: file + description: | + File containing a classification summary + pattern: "*.{report.txt}" + - results: + type: file + description: | + File containing classification results + pattern: "*.{results.txt}" + - kreport: + type: file + description: | + File containing kraken-style report from centrifuge + out files. + pattern: "*.{kreport.txt}" + - fastq_unmapped: + type: file + description: Unmapped fastq files + pattern: "*.unmapped.fastq.gz" + - fastq_mapped: + type: file + description: Mapped fastq files + pattern: "*.mapped.fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@sofstam" + - "@jfy133" + - "@sateeshperi" diff --git a/nextflow.config b/nextflow.config index 5f7aec6..5bd8f39 100644 --- a/nextflow.config +++ b/nextflow.config @@ -56,7 +56,7 @@ params { // FASTQ preprocessing shortread_clipmerge = false - shortread_excludeunmerged = true + shortread_excludeunmerged = true longread_clip = false // MALT @@ -65,6 +65,12 @@ params { // kraken2 run_kraken2 = false + + // centrifuge + run_centrifuge = false + save_unaligned = false + save_aligned = false + sam_format = false } // Load base.config by default for all pipelines diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf index 890e373..28268c3 100644 --- a/subworkflows/local/db_check.nf +++ b/subworkflows/local/db_check.nf @@ -21,7 +21,7 @@ workflow DB_CHECK { ch_dbs_for_untar = parsed_samplesheet .branch { - untar: it[1].toString().endsWith(".tar.gz") + untar: it[1].toString().endsWith(".tar.gz") && it[0]['tool']!="centrifuge" skip: true } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 4501386..b64e31e 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -67,8 +67,9 @@ def create_fastq_channel(LinkedHashMap row) { if (!file(row.fastq_2).exists()) { exit 1, "ERROR: Please check 
input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] + fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] } + } return fastq_meta } diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 6fc5450..ea3ef18 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -58,7 +58,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/ include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' - +include { CENTRIFUGE } from '../modules/nf-core/modules/centrifuge/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -149,9 +149,10 @@ workflow TAXPROFILER { .combine(DB_CHECK.out.dbs) .dump(tag: "reads_plus_db") .branch { - malt: it[2]['tool'] == 'malt' - kraken2: it[2]['tool'] == 'kraken2' - unknown: true + malt: it[2]['tool'] == 'malt' + kraken2: it[2]['tool'] == 'kraken2' + centrifuge: it[2]['tool'] == 'centrifuge' + unknown: true } // @@ -184,6 +185,15 @@ workflow TAXPROFILER { db: it[3] } + // We can run centrifuge one-by-one sample-wise + ch_input_for_centrifuge = ch_input_for_profiling.centrifuge + .dump(tag: "input for centrifuge") + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + // // RUN PROFILING // @@ -195,6 +205,10 @@ workflow TAXPROFILER { KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) } + if ( params.run_centrifuge ) { + CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.save_unaligned, params.save_aligned, params.sam_format ) + } + // // MODULE: MultiQC // From 16bdc79cc08750e8df36acc8c14aa90e8c522f76 Mon Sep 17 00:00:00 2001 From: sofstam Date: Fri, 25 Mar 2022 16:30:26 +0100 Subject: [PATCH 02/19] Apply prettier --- 
modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 2dbde56..ffaff90 100644 --- a/modules.json +++ b/modules.json @@ -29,7 +29,7 @@ }, "porechop": { "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046" - } + }, "centrifuge": { "git_sha": "ea41a8a6f761b9993d857570e872abaae3fea555" } From 59d3f18a753bf5b06ab495c98f96df77a84513d5 Mon Sep 17 00:00:00 2001 From: sofstam Date: Fri, 25 Mar 2022 17:18:04 +0100 Subject: [PATCH 03/19] Update nextflow_schema.json --- nextflow_schema.json | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b8d5a1d..b4b3e07 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -293,20 +293,16 @@ "type": "boolean" }, "run_centrifuge": { - "type": "string", - "default": "false" + "type": "boolean" }, "centrifuge_save_unaligned": { - "type": "string", - "default": "false" + "type": "boolean" }, "centrifuge_save_aligned": { - "type": "string", - "default": "false" + "type": "boolean" }, "centrifuge_sam_format": { - "type": "string", - "default": "false" + "type": "boolean" } } -} \ No newline at end of file +} From 0e0e8128e868df6ea8a4f4c22d7841f518d84c9f Mon Sep 17 00:00:00 2001 From: sofstam Date: Mon, 28 Mar 2022 13:43:32 +0200 Subject: [PATCH 04/19] Prettier format --- nextflow_schema.json | 579 +++++++++++++++++++++---------------------- 1 file changed, 288 insertions(+), 291 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b4b3e07..b61a50e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,308 +1,305 @@ { - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/nf-core/taxprofiler/master/nextflow_schema.json", - "title": "nf-core/taxprofiler pipeline parameters", - "description": "Taxonomic profiling of shotgun metagenomic data", - "type": "object", - "definitions": { - "input_output_options": { - "title": 
"Input/output options", - "type": "object", - "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], - "properties": { - "input": { - "type": "string", - "format": "file-path", - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "schema": "assets/schema_input.json", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" - }, - "outdir": { - "type": "string", - "format": "directory-path", - "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", - "fa_icon": "fas fa-folder-open" - }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - }, - "multiqc_title": { - "type": "string", - "description": "MultiQC report title. 
Printed as page header, used for filename if not otherwise specified.", - "fa_icon": "fas fa-file-signature" - } - } + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/taxprofiler/master/nextflow_schema.json", + "title": "nf-core/taxprofiler pipeline parameters", + "description": "Taxonomic profiling of shotgun metagenomic data", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": ["input", "outdir"], + "properties": { + "input": { + "type": "string", + "format": "file-path", + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$", + "schema": "assets/schema_input.json", + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).", + "fa_icon": "fas fa-file-csv" }, - "reference_genome_options": { - "title": "Reference genome options", - "type": "object", - "fa_icon": "fas fa-dna", - "description": "Reference genome related files and options required for the workflow.", - "properties": { - "genome": { - "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." 
- }, - "igenomes_base": { - "type": "string", - "format": "directory-path", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true - }, - "igenomes_ignore": { - "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." - } - } + "outdir": { + "type": "string", + "format": "directory-path", + "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "fa_icon": "fas fa-folder-open" }, - "institutional_config_options": { - "title": "Institutional config options", - "type": "object", - "fa_icon": "fas fa-university", - "description": "Parameters used to describe centralised config profiles. These should not be edited.", - "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", - "properties": { - "custom_config_version": { - "type": "string", - "description": "Git commit id for Institutional configs.", - "default": "master", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "custom_config_base": { - "type": "string", - "description": "Base directory for Institutional configs.", - "default": "https://raw.githubusercontent.com/nf-core/configs/master", - "hidden": true, - "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. 
If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", - "fa_icon": "fas fa-users-cog" - }, - "config_profile_name": { - "type": "string", - "description": "Institutional config name.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_description": { - "type": "string", - "description": "Institutional config description.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_contact": { - "type": "string", - "description": "Institutional config contact information.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_url": { - "type": "string", - "description": "Institutional config URL link.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - } - } + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" }, - "max_job_request_options": { - "title": "Max job request options", - "type": "object", - "fa_icon": "fab fa-acquisitions-incorporated", - "description": "Set the top limit for requested resources for any single job.", - "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. 
See [the nf-core website](https://nf-co.re/usage/configuration) for details.", - "properties": { - "max_cpus": { - "type": "integer", - "description": "Maximum number of CPUs that can be requested for any single job.", - "default": 16, - "fa_icon": "fas fa-microchip", - "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" - }, - "max_memory": { - "type": "string", - "description": "Maximum amount of memory that can be requested for any single job.", - "default": "128.GB", - "fa_icon": "fas fa-memory", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" - }, - "max_time": { - "type": "string", - "description": "Maximum amount of time that can be requested for any single job.", - "default": "240.h", - "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", - "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_time '2.h'`" - } - } - }, - "generic_options": { - "title": "Generic options", - "type": "object", - "fa_icon": "fas fa-file-import", - "description": "Less common options for the pipeline, typically set in a config file.", - "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", - "properties": { - "help": { - "type": "boolean", - "description": "Display help text.", - "fa_icon": "fas fa-question-circle", - "hidden": true - }, - "publish_dir_mode": { - "type": "string", - "default": "copy", - "description": "Method used to save pipeline results to output directory.", - "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", - "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], - "hidden": true - }, - "email_on_fail": { - "type": "string", - "description": "Email address for completion summary, only when pipeline fails.", - "fa_icon": "fas fa-exclamation-triangle", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", - "help_text": "An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.", - "hidden": true - }, - "plaintext_email": { - "type": "boolean", - "description": "Send plain-text email instead of HTML.", - "fa_icon": "fas fa-remove-format", - "hidden": true - }, - "max_multiqc_email_size": { - "type": "string", - "description": "File size limit when attaching MultiQC reports to summary emails.", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "default": "25.MB", - "fa_icon": "fas fa-file-upload", - "hidden": true - }, - "monochrome_logs": { - "type": "boolean", - "description": "Do not use coloured log outputs.", - "fa_icon": "fas fa-palette", - "hidden": true - }, - "multiqc_config": { - "type": "string", - "description": "Custom config file to supply to MultiQC.", - "fa_icon": "fas fa-cog", - "hidden": true - }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, - "validate_params": { - "type": "boolean", - "description": "Boolean whether to validate parameters against the schema at runtime", - "default": true, - "fa_icon": "fas fa-check-square", - "hidden": true - }, - "show_hidden_params": { - "type": "boolean", - "fa_icon": "far fa-eye-slash", - "description": "Show all params when using `--help`", - "hidden": true, - "help_text": "By default, parameters 
set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." - }, - "enable_conda": { - "type": "boolean", - "description": "Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.", - "hidden": true, - "fa_icon": "fas fa-bacon" - } - } + "multiqc_title": { + "type": "string", + "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", + "fa_icon": "fas fa-file-signature" } + } }, - "allOf": [ - { - "$ref": "#/definitions/input_output_options" + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Reference genome related files and options required for the workflow.", + "properties": { + "genome": { + "type": "string", + "description": "Name of iGenomes reference.", + "fa_icon": "fas fa-book", + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." 
}, - { - "$ref": "#/definitions/reference_genome_options" + "igenomes_base": { + "type": "string", + "format": "directory-path", + "description": "Directory / URL base for iGenomes references.", + "default": "s3://ngi-igenomes/igenomes", + "fa_icon": "fas fa-cloud-download-alt", + "hidden": true }, - { - "$ref": "#/definitions/institutional_config_options" - }, - { - "$ref": "#/definitions/max_job_request_options" - }, - { - "$ref": "#/definitions/generic_options" + "igenomes_ignore": { + "type": "boolean", + "description": "Do not load the iGenomes reference config.", + "fa_icon": "fas fa-ban", + "hidden": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." } - ], - "properties": { - "databases": { - "type": "string", - "default": "None" + } + }, + "institutional_config_options": { + "title": "Institutional config options", + "type": "object", + "fa_icon": "fas fa-university", + "description": "Parameters used to describe centralised config profiles. These should not be edited.", + "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", + "properties": { + "custom_config_version": { + "type": "string", + "description": "Git commit id for Institutional configs.", + "default": "master", + "hidden": true, + "fa_icon": "fas fa-users-cog" }, - "shortread_clipmerge": { - "type": "boolean" + "custom_config_base": { + "type": "string", + "description": "Base directory for Institutional configs.", + "default": "https://raw.githubusercontent.com/nf-core/configs/master", + "hidden": true, + "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. 
If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", + "fa_icon": "fas fa-users-cog" }, - "shortread_excludeunmerged": { - "type": "boolean", - "default": true + "config_profile_name": { + "type": "string", + "description": "Institutional config name.", + "hidden": true, + "fa_icon": "fas fa-users-cog" }, - "longread_clip": { - "type": "boolean" + "config_profile_description": { + "type": "string", + "description": "Institutional config description.", + "hidden": true, + "fa_icon": "fas fa-users-cog" }, - "run_malt": { - "type": "boolean" + "config_profile_contact": { + "type": "string", + "description": "Institutional config contact information.", + "hidden": true, + "fa_icon": "fas fa-users-cog" }, - "malt_mode": { - "type": "string", - "default": "BlastN" - }, - "run_kraken2": { - "type": "boolean" - }, - "run_centrifuge": { - "type": "boolean" - }, - "centrifuge_save_unaligned": { - "type": "boolean" - }, - "centrifuge_save_aligned": { - "type": "boolean" - }, - "centrifuge_sam_format": { - "type": "boolean" + "config_profile_url": { + "type": "string", + "description": "Institutional config URL link.", + "hidden": true, + "fa_icon": "fas fa-users-cog" } + } + }, + "max_job_request_options": { + "title": "Max job request options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. 
See [the nf-core website](https://nf-co.re/usage/configuration) for details.", + "properties": { + "max_cpus": { + "type": "integer", + "description": "Maximum number of CPUs that can be requested for any single job.", + "default": 16, + "fa_icon": "fas fa-microchip", + "hidden": true, + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + }, + "max_memory": { + "type": "string", + "description": "Maximum amount of memory that can be requested for any single job.", + "default": "128.GB", + "fa_icon": "fas fa-memory", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "hidden": true, + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + }, + "max_time": { + "type": "string", + "description": "Maximum amount of time that can be requested for any single job.", + "default": "240.h", + "fa_icon": "far fa-clock", + "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "hidden": true, + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_time '2.h'`" + } + } + }, + "generic_options": { + "title": "Generic options", + "type": "object", + "fa_icon": "fas fa-file-import", + "description": "Less common options for the pipeline, typically set in a config file.", + "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", + "properties": { + "help": { + "type": "boolean", + "description": "Display help text.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "description": "Method used to save pipeline results to output directory.", + "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", + "fa_icon": "fas fa-copy", + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], + "hidden": true + }, + "email_on_fail": { + "type": "string", + "description": "Email address for completion summary, only when pipeline fails.", + "fa_icon": "fas fa-exclamation-triangle", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", + "help_text": "An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.", + "hidden": true + }, + "plaintext_email": { + "type": "boolean", + "description": "Send plain-text email instead of HTML.", + "fa_icon": "fas fa-remove-format", + "hidden": true + }, + "max_multiqc_email_size": { + "type": "string", + "description": "File size limit when attaching MultiQC reports to summary emails.", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "default": "25.MB", + "fa_icon": "fas fa-file-upload", + "hidden": true + }, + "monochrome_logs": { + "type": "boolean", + "description": "Do not use coloured log outputs.", + "fa_icon": "fas fa-palette", + "hidden": true + }, + "multiqc_config": { + "type": "string", + "description": "Custom config file to supply to MultiQC.", + "fa_icon": "fas fa-cog", + "hidden": true + }, + "tracedir": { + "type": "string", + "description": "Directory to keep pipeline Nextflow logs and reports.", + "default": "${params.outdir}/pipeline_info", + "fa_icon": "fas fa-cogs", + "hidden": true + }, + "validate_params": { + "type": "boolean", + "description": "Boolean whether to validate parameters against the schema at runtime", + "default": true, + "fa_icon": "fas fa-check-square", + "hidden": true + }, + "show_hidden_params": { + "type": "boolean", + "fa_icon": "far fa-eye-slash", + "description": "Show all params when using `--help`", + "hidden": true, + "help_text": "By default, parameters 
set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "enable_conda": { + "type": "boolean", + "description": "Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.", + "hidden": true, + "fa_icon": "fas fa-bacon" + } + } } + }, + "allOf": [ + { + "$ref": "#/definitions/input_output_options" + }, + { + "$ref": "#/definitions/reference_genome_options" + }, + { + "$ref": "#/definitions/institutional_config_options" + }, + { + "$ref": "#/definitions/max_job_request_options" + }, + { + "$ref": "#/definitions/generic_options" + } + ], + "properties": { + "databases": { + "type": "string", + "default": "None" + }, + "shortread_clipmerge": { + "type": "boolean" + }, + "shortread_excludeunmerged": { + "type": "boolean", + "default": true + }, + "longread_clip": { + "type": "boolean" + }, + "run_malt": { + "type": "boolean" + }, + "malt_mode": { + "type": "string", + "default": "BlastN" + }, + "run_kraken2": { + "type": "boolean" + }, + "run_centrifuge": { + "type": "boolean" + }, + "centrifuge_save_unaligned": { + "type": "boolean" + }, + "centrifuge_save_aligned": { + "type": "boolean" + }, + "centrifuge_sam_format": { + "type": "boolean" + } + } } From d25f97c5061ef7759196f04017ec2b7710075e48 Mon Sep 17 00:00:00 2001 From: sofstam Date: Mon, 28 Mar 2022 22:08:05 +0200 Subject: [PATCH 05/19] Prettier format --- nextflow_schema.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b61a50e..f1b1de0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", From 
c552819c725a51fe2f04af981ebd90c311b33c30 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Mon, 28 Mar 2022 22:15:31 +0200 Subject: [PATCH 06/19] Apply prettier again --- nextflow_schema.json | 572 +++++++++++++++++++++---------------------- 1 file changed, 281 insertions(+), 291 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index f1b1de0..0e52ee5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,308 +1,298 @@ { - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/nf-core/taxprofiler/master/nextflow_schema.json", - "title": "nf-core/taxprofiler pipeline parameters", - "description": "Taxonomic profiling of shotgun metagenomic data", - "type": "object", - "definitions": { - "input_output_options": { - "title": "Input/output options", - "type": "object", - "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], - "properties": { - "input": { - "type": "string", - "format": "file-path", - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "schema": "assets/schema_input.json", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. 
See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/taxprofiler/master/nextflow_schema.json", + "title": "nf-core/taxprofiler pipeline parameters", + "description": "Taxonomic profiling of shotgun metagenomic data", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": ["input", "outdir"], + "properties": { + "input": { + "type": "string", + "format": "file-path", + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$", + "schema": "assets/schema_input.json", + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).", + "fa_icon": "fas fa-file-csv" + }, + "outdir": { + "type": "string", + "format": "directory-path", + "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "fa_icon": "fas fa-folder-open" + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. 
If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, + "multiqc_title": { + "type": "string", + "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", + "fa_icon": "fas fa-file-signature" + } + } }, - "outdir": { - "type": "string", - "format": "directory-path", - "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", - "fa_icon": "fas fa-folder-open" + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Reference genome related files and options required for the workflow.", + "properties": { + "genome": { + "type": "string", + "description": "Name of iGenomes reference.", + "fa_icon": "fas fa-book", + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + }, + "igenomes_base": { + "type": "string", + "format": "directory-path", + "description": "Directory / URL base for iGenomes references.", + "default": "s3://ngi-igenomes/igenomes", + "fa_icon": "fas fa-cloud-download-alt", + "hidden": true + }, + "igenomes_ignore": { + "type": "boolean", + "description": "Do not load the iGenomes reference config.", + "fa_icon": "fas fa-ban", + "hidden": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." 
+ } + } }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + "institutional_config_options": { + "title": "Institutional config options", + "type": "object", + "fa_icon": "fas fa-university", + "description": "Parameters used to describe centralised config profiles. These should not be edited.", + "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", + "properties": { + "custom_config_version": { + "type": "string", + "description": "Git commit id for Institutional configs.", + "default": "master", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "custom_config_base": { + "type": "string", + "description": "Base directory for Institutional configs.", + "default": "https://raw.githubusercontent.com/nf-core/configs/master", + "hidden": true, + "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. 
If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", + "fa_icon": "fas fa-users-cog" + }, + "config_profile_name": { + "type": "string", + "description": "Institutional config name.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_description": { + "type": "string", + "description": "Institutional config description.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_contact": { + "type": "string", + "description": "Institutional config contact information.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_url": { + "type": "string", + "description": "Institutional config URL link.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + } + } }, - "multiqc_title": { - "type": "string", - "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", - "fa_icon": "fas fa-file-signature" + "max_job_request_options": { + "title": "Max job request options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. 
See [the nf-core website](https://nf-co.re/usage/configuration) for details.", + "properties": { + "max_cpus": { + "type": "integer", + "description": "Maximum number of CPUs that can be requested for any single job.", + "default": 16, + "fa_icon": "fas fa-microchip", + "hidden": true, + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + }, + "max_memory": { + "type": "string", + "description": "Maximum amount of memory that can be requested for any single job.", + "default": "128.GB", + "fa_icon": "fas fa-memory", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "hidden": true, + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + }, + "max_time": { + "type": "string", + "description": "Maximum amount of time that can be requested for any single job.", + "default": "240.h", + "fa_icon": "far fa-clock", + "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "hidden": true, + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_time '2.h'`" + } + } + }, + "generic_options": { + "title": "Generic options", + "type": "object", + "fa_icon": "fas fa-file-import", + "description": "Less common options for the pipeline, typically set in a config file.", + "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", + "properties": { + "help": { + "type": "boolean", + "description": "Display help text.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "description": "Method used to save pipeline results to output directory.", + "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", + "fa_icon": "fas fa-copy", + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "hidden": true + }, + "email_on_fail": { + "type": "string", + "description": "Email address for completion summary, only when pipeline fails.", + "fa_icon": "fas fa-exclamation-triangle", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", + "help_text": "An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.", + "hidden": true + }, + "plaintext_email": { + "type": "boolean", + "description": "Send plain-text email instead of HTML.", + "fa_icon": "fas fa-remove-format", + "hidden": true + }, + "max_multiqc_email_size": { + "type": "string", + "description": "File size limit when attaching MultiQC reports to summary emails.", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "default": "25.MB", + "fa_icon": "fas fa-file-upload", + "hidden": true + }, + "monochrome_logs": { + "type": "boolean", + "description": "Do not use coloured log outputs.", + "fa_icon": "fas fa-palette", + "hidden": true + }, + "multiqc_config": { + "type": "string", + "description": "Custom config file to supply to MultiQC.", + "fa_icon": "fas fa-cog", + "hidden": true + }, + "tracedir": { + "type": "string", + "description": "Directory to keep pipeline Nextflow logs and reports.", + "default": "${params.outdir}/pipeline_info", + "fa_icon": "fas fa-cogs", + "hidden": true + }, + "validate_params": { + "type": "boolean", + "description": "Boolean whether to validate parameters against the schema at runtime", + "default": true, + "fa_icon": "fas fa-check-square", + "hidden": true + }, + "show_hidden_params": { + "type": "boolean", + "fa_icon": "far fa-eye-slash", + "description": "Show all params when using `--help`", + "hidden": true, + "help_text": "By default, parameters set as _hidden_ in 
the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "enable_conda": { + "type": "boolean", + "description": "Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.", + "hidden": true, + "fa_icon": "fas fa-bacon" + } + } } - } }, - "reference_genome_options": { - "title": "Reference genome options", - "type": "object", - "fa_icon": "fas fa-dna", - "description": "Reference genome related files and options required for the workflow.", - "properties": { - "genome": { - "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + "allOf": [ + { + "$ref": "#/definitions/input_output_options" }, - "igenomes_base": { - "type": "string", - "format": "directory-path", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true + { + "$ref": "#/definitions/reference_genome_options" }, - "igenomes_ignore": { - "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." 
+ { + "$ref": "#/definitions/institutional_config_options" + }, + { + "$ref": "#/definitions/max_job_request_options" + }, + { + "$ref": "#/definitions/generic_options" } - } - }, - "institutional_config_options": { - "title": "Institutional config options", - "type": "object", - "fa_icon": "fas fa-university", - "description": "Parameters used to describe centralised config profiles. These should not be edited.", - "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", - "properties": { - "custom_config_version": { - "type": "string", - "description": "Git commit id for Institutional configs.", - "default": "master", - "hidden": true, - "fa_icon": "fas fa-users-cog" + ], + "properties": { + "databases": { + "type": "string", + "default": "None" }, - "custom_config_base": { - "type": "string", - "description": "Base directory for Institutional configs.", - "default": "https://raw.githubusercontent.com/nf-core/configs/master", - "hidden": true, - "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. 
If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", - "fa_icon": "fas fa-users-cog" + "shortread_clipmerge": { + "type": "boolean" }, - "config_profile_name": { - "type": "string", - "description": "Institutional config name.", - "hidden": true, - "fa_icon": "fas fa-users-cog" + "shortread_excludeunmerged": { + "type": "boolean", + "default": true }, - "config_profile_description": { - "type": "string", - "description": "Institutional config description.", - "hidden": true, - "fa_icon": "fas fa-users-cog" + "longread_clip": { + "type": "boolean" }, - "config_profile_contact": { - "type": "string", - "description": "Institutional config contact information.", - "hidden": true, - "fa_icon": "fas fa-users-cog" + "run_malt": { + "type": "boolean" }, - "config_profile_url": { - "type": "string", - "description": "Institutional config URL link.", - "hidden": true, - "fa_icon": "fas fa-users-cog" + "malt_mode": { + "type": "string", + "default": "BlastN" + }, + "run_kraken2": { + "type": "boolean" + }, + "run_centrifuge": { + "type": "boolean" + }, + "centrifuge_save_unaligned": { + "type": "boolean" + }, + "centrifuge_save_aligned": { + "type": "boolean" + }, + "centrifuge_sam_format": { + "type": "boolean" } - } - }, - "max_job_request_options": { - "title": "Max job request options", - "type": "object", - "fa_icon": "fab fa-acquisitions-incorporated", - "description": "Set the top limit for requested resources for any single job.", - "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. 
See [the nf-core website](https://nf-co.re/usage/configuration) for details.", - "properties": { - "max_cpus": { - "type": "integer", - "description": "Maximum number of CPUs that can be requested for any single job.", - "default": 16, - "fa_icon": "fas fa-microchip", - "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" - }, - "max_memory": { - "type": "string", - "description": "Maximum amount of memory that can be requested for any single job.", - "default": "128.GB", - "fa_icon": "fas fa-memory", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" - }, - "max_time": { - "type": "string", - "description": "Maximum amount of time that can be requested for any single job.", - "default": "240.h", - "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", - "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_time '2.h'`" - } - } - }, - "generic_options": { - "title": "Generic options", - "type": "object", - "fa_icon": "fas fa-file-import", - "description": "Less common options for the pipeline, typically set in a config file.", - "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", - "properties": { - "help": { - "type": "boolean", - "description": "Display help text.", - "fa_icon": "fas fa-question-circle", - "hidden": true - }, - "publish_dir_mode": { - "type": "string", - "default": "copy", - "description": "Method used to save pipeline results to output directory.", - "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", - "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], - "hidden": true - }, - "email_on_fail": { - "type": "string", - "description": "Email address for completion summary, only when pipeline fails.", - "fa_icon": "fas fa-exclamation-triangle", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", - "help_text": "An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.", - "hidden": true - }, - "plaintext_email": { - "type": "boolean", - "description": "Send plain-text email instead of HTML.", - "fa_icon": "fas fa-remove-format", - "hidden": true - }, - "max_multiqc_email_size": { - "type": "string", - "description": "File size limit when attaching MultiQC reports to summary emails.", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "default": "25.MB", - "fa_icon": "fas fa-file-upload", - "hidden": true - }, - "monochrome_logs": { - "type": "boolean", - "description": "Do not use coloured log outputs.", - "fa_icon": "fas fa-palette", - "hidden": true - }, - "multiqc_config": { - "type": "string", - "description": "Custom config file to supply to MultiQC.", - "fa_icon": "fas fa-cog", - "hidden": true - }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, - "validate_params": { - "type": "boolean", - "description": "Boolean whether to validate parameters against the schema at runtime", - "default": true, - "fa_icon": "fas fa-check-square", - "hidden": true - }, - "show_hidden_params": { - "type": "boolean", - "fa_icon": "far fa-eye-slash", - "description": "Show all params when using `--help`", - "hidden": true, - "help_text": "By default, parameters 
set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." - }, - "enable_conda": { - "type": "boolean", - "description": "Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.", - "hidden": true, - "fa_icon": "fas fa-bacon" - } - } } - }, - "allOf": [ - { - "$ref": "#/definitions/input_output_options" - }, - { - "$ref": "#/definitions/reference_genome_options" - }, - { - "$ref": "#/definitions/institutional_config_options" - }, - { - "$ref": "#/definitions/max_job_request_options" - }, - { - "$ref": "#/definitions/generic_options" - } - ], - "properties": { - "databases": { - "type": "string", - "default": "None" - }, - "shortread_clipmerge": { - "type": "boolean" - }, - "shortread_excludeunmerged": { - "type": "boolean", - "default": true - }, - "longread_clip": { - "type": "boolean" - }, - "run_malt": { - "type": "boolean" - }, - "malt_mode": { - "type": "string", - "default": "BlastN" - }, - "run_kraken2": { - "type": "boolean" - }, - "run_centrifuge": { - "type": "boolean" - }, - "centrifuge_save_unaligned": { - "type": "boolean" - }, - "centrifuge_save_aligned": { - "type": "boolean" - }, - "centrifuge_sam_format": { - "type": "boolean" - } - } } From 3a562065138a4ef9035e43b558c844f58a36611d Mon Sep 17 00:00:00 2001 From: sofstam Date: Mon, 4 Apr 2022 16:55:48 +0200 Subject: [PATCH 07/19] Centrifuge classification --- nextflow.config | 2 +- nextflow_schema.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 725f892..a99481c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -74,7 +74,7 @@ params { // centrifuge run_centrifuge = false - centrifuge_db_name = false + centrifuge_db_name = 'minigut_cf' centrifuge_save_unaligned = false centrifuge_save_aligned = false centrifuge_sam_format = false diff --git a/nextflow_schema.json 
b/nextflow_schema.json index 3ec2aae..777b82f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -336,7 +336,7 @@ }, "centrifuge_db_name": { "type": "string", - "default": "false" + "default": null } } } \ No newline at end of file From 26779a4420f5d71d55a7bb99ff85ee1863740401 Mon Sep 17 00:00:00 2001 From: sofstam Date: Thu, 7 Apr 2022 16:27:19 +0200 Subject: [PATCH 08/19] Remove db_name from nextflow.config --- conf/test.config | 4 ++-- nextflow.config | 1 - nextflow_schema.json | 7 +++++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/conf/test.config b/conf/test.config index 6e82300..d392306 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,10 +25,10 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' run_kraken2 = true - run_malt = true + run_malt = false run_metaphlan3 = true run_centrifuge = true shortread_clipmerge = true longread_clip = false - shortread_complexityfilter = true + shortread_complexityfilter = false } diff --git a/nextflow.config b/nextflow.config index 37f886f..b4a8d91 100644 --- a/nextflow.config +++ b/nextflow.config @@ -86,7 +86,6 @@ params { // centrifuge run_centrifuge = false - centrifuge_db_name = 'minigut_cf' centrifuge_save_unaligned = false centrifuge_save_aligned = false centrifuge_sam_format = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 2ed80ed..23eb83b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -358,7 +358,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -369,4 +372,4 @@ "default": false } } -} +} \ No newline at end of file From 3d45ac57aead82baa98a8f0ee8aa668775bf4021 Mon Sep 17 00:00:00 2001 From: 
Sofia Stamouli Date: Thu, 7 Apr 2022 16:42:22 +0200 Subject: [PATCH 09/19] Prettier --- modules.json | 2 +- nextflow_schema.json | 26 +++++--------------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/modules.json b/modules.json index 5abebf6..7fbc65c 100644 --- a/modules.json +++ b/modules.json @@ -47,4 +47,4 @@ } } } -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 23eb83b..2b115eb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -310,10 +300,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -358,10 +345,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -372,4 +356,4 @@ "default": false } } -} \ No newline at end of file +} From 2dfe3b3cc1d3a08aace2f521e84fa2c9820c3d98 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli <91951607+sofstam@users.noreply.github.com> Date: Fri, 8 Apr 2022 10:00:04 +0200 Subject: [PATCH 10/19] Update conf/test.config Co-authored-by: James A. Fellows Yates --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index d392306..270ad95 100644 --- a/conf/test.config +++ b/conf/test.config @@ -30,5 +30,5 @@ params { run_centrifuge = true shortread_clipmerge = true longread_clip = false - shortread_complexityfilter = false + shortread_complexityfilter = true } From 48d28cf8d4ac889b80736c86b3ed97eb46554af2 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli <91951607+sofstam@users.noreply.github.com> Date: Fri, 8 Apr 2022 10:00:10 +0200 Subject: [PATCH 11/19] Update conf/test.config Co-authored-by: James A. 
Fellows Yates --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index 270ad95..6e82300 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,7 +25,7 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' run_kraken2 = true - run_malt = false + run_malt = true run_metaphlan3 = true run_centrifuge = true shortread_clipmerge = true From fd5ebea9a697b7cc2e97cc2c056f131b64848bae Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Fri, 8 Apr 2022 10:57:58 +0200 Subject: [PATCH 12/19] Remove old centrifuge module --- modules.json | 2 +- .../nf-core/modules/centrifuge/main.nf | 63 ---------------- .../nf-core/modules/centrifuge/meta.yml | 73 ------------------- 3 files changed, 1 insertion(+), 137 deletions(-) delete mode 100644 modules/nf-core/modules/nf-core/modules/centrifuge/main.nf delete mode 100644 modules/nf-core/modules/nf-core/modules/centrifuge/meta.yml diff --git a/modules.json b/modules.json index 7fbc65c..5abebf6 100644 --- a/modules.json +++ b/modules.json @@ -47,4 +47,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/nf-core/modules/centrifuge/main.nf b/modules/nf-core/modules/nf-core/modules/centrifuge/main.nf deleted file mode 100644 index 7eb566d..0000000 --- a/modules/nf-core/modules/nf-core/modules/centrifuge/main.nf +++ /dev/null @@ -1,63 +0,0 @@ -process CENTRIFUGE { - tag "$meta.id" - label 'process_high' - - conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' : - 'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }" - - input: - tuple val(meta), path(reads) - path db - val save_unaligned - val save_aligned - val sam_format - - output: - tuple val(meta), path('*report.txt') , emit: report - tuple val(meta), path('*results.txt') , emit: results - tuple val(meta), path('*kreport.txt') , emit: kreport - tuple val(meta), path('*.sam') , optional: true, emit: sam - tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_mapped - tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" - def db_name = db.toString().replace(".tar.gz","") - def unaligned = '' - def aligned = '' - if (meta.single_end) { - unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' - aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : '' - } else { - unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' - aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : '' - } - def sam_output = sam_format ? 
"--out-fmt 'sam'" : '' - """ - tar -xf $db - centrifuge \\ - -x $db_name \\ - -p $task.cpus \\ - $paired \\ - --report-file ${prefix}.report.txt \\ - -S ${prefix}.results.txt \\ - $unaligned \\ - $aligned \\ - $sam_output \\ - $args - centrifuge-kreport -x $db_name ${prefix}.results.txt > ${prefix}.kreport.txt - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') - END_VERSIONS - """ -} diff --git a/modules/nf-core/modules/nf-core/modules/centrifuge/meta.yml b/modules/nf-core/modules/nf-core/modules/centrifuge/meta.yml deleted file mode 100644 index 3adf0e2..0000000 --- a/modules/nf-core/modules/nf-core/modules/centrifuge/meta.yml +++ /dev/null @@ -1,73 +0,0 @@ -name: centrifuge -description: Classifies metagenomic sequence data -keywords: - - classify - - metagenomics - - fastq - - db -tools: - - centrifuge: - description: Centrifuge is a classifier for metagenomic sequences. - homepage: https://ccb.jhu.edu/software/centrifuge/ - documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml - doi: 10.1101/gr.210641.116 - licence: ["GPL v3"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. - - db: - type: directory - description: Centrifuge database in .tar.gz format - pattern: "*.tar.gz" - - save_unaligned: - type: value - description: If true unmapped fastq files are saved - - save_aligned: - type: value - description: If true mapped fastq files are saved -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - report: - type: file - description: | - File containing a classification summary - pattern: "*.{report.txt}" - - results: - type: file - description: | - File containing classification results - pattern: "*.{results.txt}" - - kreport: - type: file - description: | - File containing kraken-style report from centrifuge - out files. - pattern: "*.{kreport.txt}" - - fastq_unmapped: - type: file - description: Unmapped fastq files - pattern: "*.unmapped.fastq.gz" - - fastq_mapped: - type: file - description: Mapped fastq files - pattern: "*.mapped.fastq.gz" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@sofstam" - - "@jfy133" - - "@sateeshperi" From 63bc597daf008a4de9ed5e34c0b8d682f349a34a Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Fri, 8 Apr 2022 11:01:45 +0200 Subject: [PATCH 13/19] Prettier --- modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 5abebf6..7fbc65c 100644 --- a/modules.json +++ b/modules.json @@ -47,4 +47,4 @@ } } } -} \ No newline at end of file +} From ecf0eea4f99f0601124661557b41c5092d507ca6 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 10 Apr 2022 06:43:30 +0200 Subject: [PATCH 14/19] Move profiling to subworkflow and standardise outputs --- conf/modules.config | 24 +++---- subworkflows/local/profiling.nf | 120 ++++++++++++++++++++++++++++++++ workflows/taxprofiler.nf | 97 ++------------------------ 3 files changed, 136 insertions(+), 105 deletions(-) create mode 100644 subworkflows/local/profiling.nf diff --git a/conf/modules.config b/conf/modules.config index de41e69..531bd5a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -167,7 +167,7 @@ process { publishDir = [ path: { "${params.outdir}/malt/${meta.db_name}" }, mode: params.publish_dir_mode, - pattern: '*.{rma6,tab,text,sam,log}' + pattern: '*.{log}' ] } @@ -177,7 +177,7 @@ process { 
publishDir = [ path: { "${params.outdir}/kraken2/${meta.db_name}" }, mode: params.publish_dir_mode, - pattern: '*.{fastq.gz,txt}' + pattern: '*.{txt}' ] } @@ -190,6 +190,16 @@ process { ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } + withName: CENTRIFUGE_CENTRIFUGE { + publishDir = [ + path: { "${params.outdir}/centrifuge/${meta.db_name}" }, + mode: params.publish_dir_mode, + pattern: '*.txt' + ] + ext.args = { "${meta.db_params}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -198,14 +208,4 @@ process { ] } - withName: CENTRIFUGE_CENTRIFUGE { - publishDir = [ - path: { "${params.outdir}/centrifuge/${meta.db_name}" }, - mode: params.publish_dir_mode, - pattern: '*.{fastq.gz,txt}' - ] - ext.args = { "${meta.db_params}" } - ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } - } - } diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf new file mode 100644 index 0000000..ac48d77 --- /dev/null +++ b/subworkflows/local/profiling.nf @@ -0,0 +1,120 @@ +// +// Run profiling +// + +include { MALT_RUN } from '../../modules/nf-core/modules/malt/run/main' +include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/modules/kraken2/kraken2/main' +include { CENTRIFUGE_CENTRIFUGE } from '../../modules/nf-core/modules/centrifuge/centrifuge/main' +include { METAPHLAN3 } from '../../modules/nf-core/modules/metaphlan3/main' + +workflow PROFILING { + take: + shortreads // [ [ meta ], [ reads ] ] + longreads // [ [ meta ], [ reads ] ] + databases // [ [ meta ], path ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + +/* + COMBINE READS WITH POSSIBLE DATABASES + */ + + // e.g. 
output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] + ch_input_for_profiling = shortreads + .mix( longreads ) + .combine(databases) + .branch { + malt: it[2]['tool'] == 'malt' + kraken2: it[2]['tool'] == 'kraken2' + metaphlan3: it[2]['tool'] == 'metaphlan3' + centrifuge: it[2]['tool'] == 'centrifuge' + unknown: true + } + + /* + PREPARE PROFILER INPUT CHANNELS + */ + + // Each tool as a slightly different input structure and generally separate + // input channels for reads vs databases. We restructure the channel tuple + // for each tool and make liberal use of multiMap to keep reads/databases + // channel element order in sync with each other + + // MALT: We groupTuple to have all samples in one channel for MALT as database + // loading takes a long time, so we only want to run it once per database + // TODO document somewhere we only accept illumina short reads for MALT? 
+ ch_input_for_malt = ch_input_for_profiling.malt + .filter { it[0]['instrument_platform'] == 'ILLUMINA' } + .map { + it -> + def temp_meta = [ id: it[2]['db_name']] + it[2] + def db = it[3] + [ temp_meta, it[1], db ] + } + .groupTuple(by: [0,2]) + .multiMap { + it -> + reads: [ it[0], it[1].flatten() ] + db: it[2] + } + + // All subsequent tools can easily run on a per-sample basis + + ch_input_for_kraken2 = ch_input_for_profiling.kraken2 + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + ch_input_for_centrifuge = ch_input_for_profiling.centrifuge + .dump(tag: "input for centrifuge") + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + + /* + RUN PROFILING + */ + + if ( params.run_malt ) { + MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) + } + + if ( params.run_kraken2 ) { + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + } + + if ( params.run_centrifuge ) { + CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format ) + ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) + } + + if ( params.run_metaphlan3 ) { + METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) + ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() ) + } + + + emit: + // TODO work out if there is enough standardisation of output to export as one? 
+ //output = ch_filtered_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 6b89c66..38afdda 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -44,6 +44,7 @@ include { DB_CHECK } from '../subworkflows/local/db_check' include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering' +include { PROFILING } from '../subworkflows/local/profiling' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -59,10 +60,6 @@ include { MULTIQC } from '../modules/nf-core/modules/multiqc include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' -include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' -include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' -include { CENTRIFUGE_CENTRIFUGE } from '../modules/nf-core/modules/centrifuge/centrifuge/main' -include { METAPHLAN3 } from '../modules/nf-core/modules/metaphlan3/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -127,88 +124,10 @@ workflow TAXPROFILER { } /* - COMBINE READS WITH POSSIBLE DATABASES + SUBWORKFLOW: PROFILING */ - // e.g. 
output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] - ch_input_for_profiling = ch_shortreads_filtered - .mix( ch_longreads_preprocessed ) - .combine(DB_CHECK.out.dbs) - .branch { - malt: it[2]['tool'] == 'malt' - kraken2: it[2]['tool'] == 'kraken2' - metaphlan3: it[2]['tool'] == 'metaphlan3' - centrifuge: it[2]['tool'] == 'centrifuge' - unknown: true - } - - /* - PREPARE PROFILER INPUT CHANNELS - */ - - // We groupTuple to have all samples in one channel for MALT as database - // loading takes a long time, so we only want to run it once per database - // TODO document somewhere we only accept illumina short reads for MALT? - ch_input_for_malt = ch_input_for_profiling.malt - .filter { it[0]['instrument_platform'] == 'ILLUMINA' } - .map { - it -> - def temp_meta = [ id: it[2]['db_name']] + it[2] - def db = it[3] - [ temp_meta, it[1], db ] - } - .groupTuple(by: [0,2]) - .multiMap { - it -> - reads: [ it[0], it[1].flatten() ] - db: it[2] - } - - // We can run Kraken2 one-by-one sample-wise - ch_input_for_kraken2 = ch_input_for_profiling.kraken2 - .multiMap { - it -> - reads: [ it[0] + it[2], it[1] ] - db: it[3] - } - - // We can run centrifuge one-by-one sample-wise - ch_input_for_centrifuge = ch_input_for_profiling.centrifuge - .dump(tag: "input for centrifuge") - .multiMap { - it -> - reads: [ it[0] + it[2], it[1] ] - db: it[3] - } - - // - // RUN PROFILING - // - ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 - .multiMap { - it -> - reads: [it[0] + it[2], it[1]] - db: it[3] - } - - /* - MODULE: RUN PROFILING - */ - if ( params.run_malt ) { - MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) - } - - if ( params.run_kraken2 ) { - KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) - } - - if ( params.run_centrifuge ) { - CENTRIFUGE_CENTRIFUGE ( 
ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format ) - } - - if ( params.run_metaphlan3 ) { - METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) - } + PROFILING ( ch_shortreads_filtered, ch_longreads_preprocessed, DB_CHECK.out.dbs ) /* MODULE: MultiQC @@ -244,17 +163,9 @@ workflow TAXPROFILER { ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } - if (params.run_kraken2) { - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) - } + ch_multiqc_files = ch_multiqc_files.mix( PROFILING.out.mqc ) - if (params.run_malt) { - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) - } - // TODO Versions for Karken/MALT not report? // TODO create multiQC module for metaphlan MULTIQC ( ch_multiqc_files.collect() From 082093f3dee43655b4911e5d81ba0eb0c693454c Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 10 Apr 2022 06:44:10 +0200 Subject: [PATCH 15/19] Prettier --- conf/modules.config | 8 ++++++++ workflows/taxprofiler.nf | 1 - 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 531bd5a..2f5710e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -208,4 +208,12 @@ process { ] } + withName: MULTIQC { + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + } diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 38afdda..4afdd6c 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -165,7 +165,6 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix( PROFILING.out.mqc ) - // TODO create multiQC module for metaphlan MULTIQC ( ch_multiqc_files.collect() From 80c08af11b28f89289bc0739fbd13da661b45644 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 10 Apr 2022 06:48:25 +0200 Subject: [PATCH 16/19] Mix in profiling versions into mainline versons channel --- workflows/taxprofiler.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 4afdd6c..237f1ea 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -128,6 +128,7 @@ workflow TAXPROFILER { */ PROFILING ( ch_shortreads_filtered, ch_longreads_preprocessed, DB_CHECK.out.dbs ) + ch_versions = ch_versions.mix( PROFILING.out.versions ) /* MODULE: MultiQC From 030099c559a5eeeee2a65c523a9e946748327b45 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 10 Apr 2022 07:26:20 +0200 Subject: [PATCH 17/19] A bit of clean up --- subworkflows/local/profiling.nf | 1 - subworkflows/local/shortread_fastp.nf | 2 +- workflows/taxprofiler.nf | 2 ++ 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index ac48d77..c74c583 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -70,7 +70,6 @@ workflow PROFILING { } ch_input_for_centrifuge = ch_input_for_profiling.centrifuge - .dump(tag: "input for centrifuge") .multiMap { it -> reads: [ it[0] + it[2], it[1] ] diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index 18baf17..9fb9425 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -10,7 +10,7 @@ workflow SHORTREAD_FASTP { reads // 
[[meta], [reads]] main: - ch_versions = Channel.empty() + ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() ch_input_for_fastp = reads diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 237f1ea..4b9f927 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -85,6 +85,7 @@ workflow TAXPROFILER { DB_CHECK ( ch_databases ) + ch_versions = ch_versions.mix(DB_CHECK.out.versions) /* MODULE: Run FastQC @@ -101,6 +102,7 @@ workflow TAXPROFILER { SUBWORKFLOW: PERFORM PREPROCESSING */ if ( params.shortread_clipmerge ) { + ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads } else { ch_shortreads_preprocessed = INPUT_CHECK.out.fastq From b892fd0be5b754cc361ab6d9c71f15b732353057 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 10 Apr 2022 22:17:44 +0200 Subject: [PATCH 18/19] Initial attempt at docs --- README.md | 32 ++++++++-------- assets/samplesheet.csv | 9 +++-- docs/usage.md | 84 +++++++++++++++++++++++++++++------------- 3 files changed, 80 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index e976f73..88f643b 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ -**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling against multiple profiling tools and databases and produces standardised output tables. +**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling with multiple profiling tools against multiple databases, and produces standardised output tables. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible.
The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! @@ -32,20 +32,20 @@ On release, automated continuous integration tests run the pipeline on a full-si 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) 2. Performs optional read pre-processing - - Adapter clipping and merging (short, and nanopore reads) - - Low complexity filtering - - Host read removal + - Adapter clipping and merging (short read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long read: [porechop](https://github.com/rrwick/Porechop)) + - Low complexity filtering ([bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus)) + - Host read removal ([BowTie2](http://bowtie-bio.sourceforge.net/bowtie2/)) - Run merging -3. Performs taxonomic profiling a choice of: - - Kraken2 - - MetaPhlAn3 - - MALT - - DIAMOND - - Centrifuge - - Kaiju - - mOTUs +3. Performs taxonomic profiling via a choice of any or all of: + - [Kraken2](https://ccb.jhu.edu/software/kraken2/) + - [MetaPhlAn3](https://huttenhower.sph.harvard.edu/metaphlan/) + - [MALT](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/malt/) + - [DIAMOND](https://github.com/bbuchfink/diamond) + - [Centrifuge](https://ccb.jhu.edu/software/centrifuge/) + - [Kaiju](https://kaiju.binf.ku.dk/) + - [mOTUs](https://motu-tool.org/) 4. 
Perform optional post-processing with: - - bracken + - [bracken](https://ccb.jhu.edu/software/bracken/) 5. Standardises output tables 6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) @@ -70,10 +70,8 @@ On release, automated continuous integration tests run the pipeline on a full-si 4. Start running your own analysis! - - ```console - nextflow run nf-core/taxprofiler --input samplesheet.csv --outdir --genome GRCh37 -profile + nextflow run nf-core/taxprofiler --input samplesheet.csv --databases database.csv --outdir --run_ --run_ -profile ``` ## Documentation @@ -86,7 +84,7 @@ nf-core/taxprofiler was originally written by nf-core community. We thank the following people for their extensive assistance in the development of this pipeline: - +[James A. Fellows Yates](https://github.com/jfy133), [Moritz Beber](https://github.com/Midnighter), [Lauri Mesilaakso](https://github.com/ljmesi), [Sofia Stamouli](https://github.com/sofstam), [Maxime Borry](https://github.com/maxibor). ## Contributions and Support diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab..82565b1 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,6 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta +2611,ERR5766174,ILLUMINA,,,///fasta/ERX5474930_ERR5766174_1.fa.gz +2612,ERR5766176,ILLUMINA,///fastq/ERX5474932_ERR5766176_1.fastq.gz,///fastq/ERX5474932_ERR5766176_2.fastq.gz, +2612,ERR5766180,ILLUMINA,///fastq/ERX5474936_ERR5766180_1.fastq.gz,, +2613,ERR5766181,ILLUMINA,///fastq/ERX5474937_ERR5766181_1.fastq.gz,///fastq/ERX5474937_ERR5766181_2.fastq.gz, +ERR3201952,ERR3201952,OXFORD_NANOPORE,///fastq/ERR3201952.fastq.gz,, diff --git a/docs/usage.md b/docs/usage.md index bd840a4..0091d57 100644 ---
a/docs/usage.md +++ b/docs/usage.md @@ -8,56 +8,90 @@ -## Samplesheet input +## Samplesheet inputs -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below. Furthermore, nf-core/taxprofiler also requires a second comma-separated file of 4 columns with a header row as in the examples below. + +This samplesheet is then specified on the command line as follows: ```console ---input '[path to samplesheet file]' +--input '[path to samplesheet file]' --databases '[path to database sheet file]' ``` ### Multiple runs of the same sample -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: +The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will processed reads before performing profiling.
Below is an example for the same sample sequenced across 3 lanes: ```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz +sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta +2612,run1,ILLUMINA,2612_run1_R1.fq.gz,, +2612,run2,ILLUMINA,2612_run2_R1.fq.gz,, +2612,run3,ILLUMINA,2612_run3_R1.fq.gz,2612_run3_R2.fq.gz, + ``` +> ⚠️ Runs of the sample sample sequenced on Illumina platforms with a combination of single and paired-end data will **not** be run-wise concatenated, unless pair-merging is specified. In the example above, `run3` will be profiled independently of `run1` and `run2` if pairs not merged. + ### Full samplesheet -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 6 columns to match those defined in the table below. -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. +A final samplesheet file consisting of both single- and paired-end data, as well as long-read FASTA files may look something like the one below. This is for 4 samples, where `2612` has been sequenced twice.
```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +2611,ERR5766174,ILLUMINA,,,///fasta/ERX5474930_ERR5766174_1.fa.gz +2612,ERR5766176,ILLUMINA,///fastq/ERX5474932_ERR5766176_1.fastq.gz,///fastq/ERX5474932_ERR5766176_2.fastq.gz, +2612,ERR5766180,ILLUMINA,///fastq/ERX5474936_ERR5766180_1.fastq.gz,, +2613,ERR5766181,ILLUMINA,///fastq/ERX5474937_ERR5766181_1.fastq.gz,///fastq/ERX5474937_ERR5766181_2.fastq.gz, +ERR3201952,ERR3201952,OXFORD_NANOPORE,///fastq/ERR3201952.fastq.gz,, ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1 or Nanopore reads. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| Column | Description | +| --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Unique sample name [required]. 
| +| `run_accession` | Run ID or name unique for each (pairs of) file(s). Can also supply sample name again here, if only a single run was generated [required]. | +| `instrument_platform` | Sequencing platform reads generated on, selected from the EBI ENA [controlled vocabulary](https://www.ebi.ac.uk/ena/portal/api/controlledVocab?field=instrument_platform) [required]. | +| `fastq_1` | Path or URL to sequencing reads or for Illumina R1 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if data in FASTA is specified. Cannot be combined with `fasta`. | +| `fastq_2` | Path or URL to Illumina R2 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if single end data. Cannot be combined with `fasta`. | +| `fasta` | Path or URL to long-reads or contigs in FASTA format. GZipped compressed files accepted. Can be left empty if data in FASTQ is specified. Cannot be combined with `fastq_1` or `fastq_2`. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +### Full database sheet + +nf-core/taxprofiler supports multiple databases being profiled in parallel for each tool. These databases, and specific parameters for each, can be specified in a 4 column comma-separated sheet. + +> ⚠️ nf-core/taxprofiler does not provide any databases by default, nor currently generates them for you. This must be performed manually by the user. + +An example database sheet can look as follows, where 4 tools are being used, and `malt` and `kraken2` will be used against two databases each.
+ +```console +tool,db_name,db_params,db_path +malt,malt85,-id 85,///malt/testdb-malt/ +malt,malt95,-id 90,///malt/testdb-malt.tar.gz +kraken2,db1,,///kraken2/testdb-kraken2.tar.gz +kraken2,db2,--quick,///kraken2/testdb-kraken2.tar.gz +centrifuge,db1,,///centrifuge/minigut_cf.tar.gz +metaphlan3,db1,,///metaphlan3/metaphlan_database/ +``` + +Column specifications are as follows: + +| Column | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tool` | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. | +| `db_name` | A unique name of the particular database [required]. | +| `db_params` | Any parameters of the given taxonomic profiler that you wish to specify that the taxonomic profiling tool should use when profiling against this specific database. Can be empty to use taxonomic profiler defaults. Must not be surrounded by quotes [required]. | +| `db_path` | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required]. | + +> 💡 You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during profiling. + ## Running the pipeline The typical command for running the pipeline is as follows: ```console -nextflow run nf-core/taxprofiler --input samplesheet.csv --outdir --genome GRCh37 -profile docker +nextflow run nf-core/taxprofiler --input samplesheet.csv --databases databases.csv --outdir -profile docker --run_ --run_ ``` This will launch the pipeline with the `docker` configuration profile.
See below for more information about profiles. @@ -66,7 +100,7 @@ Note that the pipeline will create the following files in your working directory ```console work # Directory containing the nextflow working files -<OUTDIR> # Finished results in specified location (defined with --outdir) +<OUTDIR> # Finished results in specified location (defined with --outdir) .nextflow_log # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` From b8b11fd065ee04ced28f586c505c2e180109b30e Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Mon, 11 Apr 2022 10:30:29 +0200 Subject: [PATCH 19/19] Apply suggestions from code review Co-authored-by: Moritz E. Beber --- README.md | 2 +- docs/usage.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 88f643b..f1d59d5 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ On release, automated continuous integration tests run the pipeline on a full-si - Low complexity filtering ([bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus)) - Host read removal ([BowTie2](http://bowtie-bio.sourceforge.net/bowtie2/)) - Run merging -3. Performs taxonomic profiling via a choice of any or all of: +3. Performs taxonomic profiling using one or more of: - [Kraken2](https://ccb.jhu.edu/software/kraken2/) - [MetaPhlAn3](https://huttenhower.sph.harvard.edu/metaphlan/) - [MALT](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/malt/) diff --git a/docs/usage.md b/docs/usage.md index 0091d57..239a55d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -20,7 +20,7 @@ This samplesheet is then specified on the command line as follows: ### Multiple runs of the same sample -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g.
to increase sequencing depth. The pipeline will processed reads before performing profiling. Below is an example for the same sample sequenced across 3 lanes: +The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will process reads before performing profiling. Below is an example for the same sample sequenced across 3 lanes: ```console sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta @@ -30,7 +30,7 @@ sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta ``` -> ⚠️ Runs of the sample sample sequenced on Illumina platforms with a combination of single and paired-end data will **not** be run-wise concatenated, unless pair-merging is specified. In the example above, `run3` will be profiled independently of `run1` and `run2` if pairs not merged. +> ⚠️ Runs of the same sample sequenced on Illumina platforms with a combination of single and paired-end data will **not** be run-wise concatenated, unless pair-merging is specified. In the example above, `run3` will be profiled independently of `run1` and `run2` if pairs are not merged. ### Full samplesheet @@ -61,7 +61,7 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p nf-core/taxprofiler supports multiple databases being profiled in parallel for each tool. These databases, and specific parameters for each, can be specified in a 4 column comma-separated sheet. -> ⚠️ nf-core/taxprofiler does not provide any databases by default, nor currently generates them for you. This must be performed manually by the user. +> ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user. An example database sheet can look as follows, where 4 tools are being used, and `malt` and `kraken2` will be used against two databases each.