From 21ef25be4372be989736da7889048bc8c8451a75 Mon Sep 17 00:00:00 2001
From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com>
Date: Wed, 18 Oct 2023 21:06:48 -0500
Subject: [PATCH 01/11] feat: Add nf-validation plugin

---
Review notes (ignored by git am):
- The plugin is now pinned to 1.1.3. PATCH 11 imports fromSamplesheet from
  'plugin/nf-validation'; leaving the id unversioned lets Nextflow pick up a
  newer release whose API may differ. Bump deliberately when upgrading.
- Blob hashes on "index" lines were not regenerated for edited patches in
  this series (01, 04, 09, 10); re-export with git format-patch after amending.

 nextflow.config | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index dab2fc4..d1ac49c 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -150,6 +150,10 @@ env {
     R_ENVIRON_USER = "/.Renviron"
 }
 
+plugins {
+    id 'nf-validation@1.1.3'
+}
+
 def check_max(obj, type) {
     if (type == 'memory') {
         try {
From 41ac5852bec99780ab3ec823e42487b85c0b9a4d Mon Sep 17 00:00:00 2001
From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com>
Date: Wed, 18 Oct 2023 21:08:40 -0500
Subject: [PATCH 02/11] feat: Add parameters for input and output directories

---
 nextflow.config | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index d1ac49c..cf6f3c6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,5 +1,7 @@
 params {
+    input = null
     reference = null
+    outdir = null
 
     // Config options
     config_profile_name = null
From a5e25287815ad45b40e560ad48efb4cd12a55767 Mon Sep 17 00:00:00 2001
From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com>
Date: Wed, 18 Oct 2023 21:11:55 -0500
Subject: [PATCH 03/11] chore: Add pre-commit config

In order to use nf-validation infrastructure, we have to have pre-commit
hooks installed, so add some basic hooks
---
 .pre-commit-config.yaml | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .pre-commit-config.yaml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..0c31cdb
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,5 @@
+repos:
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: "v2.7.1"
+    hooks:
+      - id: prettier
From b415c29ec0cbf881497c5c99c649f8d8d5c2ae25 Mon Sep 17 00:00:00 2001
From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com>
Date: Wed, 18 Oct 2023 21:33:58 -0500
Subject: [PATCH 04/11] feat: Add parameter schema

---
Review notes (ignored by git am):
- The "input" help_text now says 2 columns, matching the two required
  properties defined by assets/schema_input.json later in this series.
- NOTE(review): "$id" and "title" contain a bare "." where an organisation/
  repository name would normally appear - looks like an unfilled template
  placeholder; confirm the intended value.
- NOTE(review): the "reference" pattern uses [:upper:] / [:digit:] outside a
  bracket expression; it is removed again in PATCH 06, so it is left as-is here.

 nextflow_schema.json | 166 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 nextflow_schema.json

diff --git a/nextflow_schema.json b/nextflow_schema.json
new file mode 100644
index 0000000..d4c5c27
--- /dev/null
+++ b/nextflow_schema.json
@@ -0,0 +1,166 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema",
+  "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json",
+  "title": ". pipeline parameters",
+  "description": "",
+  "type": "object",
+  "definitions": {
+    "input_output_options": {
+      "title": "Input/output options",
+      "type": "object",
+      "fa_icon": "fas fa-terminal",
+      "description": "Define where the pipeline should find input data and save output data.",
+      "required": ["input", "reference", "outdir"],
+      "properties": {
+        "input": {
+          "type": "string",
+          "format": "file-path",
+          "exists": true,
+          "mimetype": "text/csv",
+          "pattern": "^\\S+\\.csv$",
+          "description": "Path to comma-separated file containing information about the samples in the experiment.",
+          "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 2 columns, and a header row.",
+          "fa_icon": "fas fa-file-csv"
+        },
+        "reference": {
+          "type": "string",
+          "fa_icon": "fas fa-dna",
+          "description": "NCBI accession number of the reference genome to align reads against",
+          "pattern": "^[:upper:]{2}_[:digit:]{6}\\.[:digit:]+$"
+        },
+        "outdir": {
+          "type": "string",
+          "format": "directory-path",
+          "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.",
+          "fa_icon": "fas fa-folder-open"
+        }
+      }
+    },
+    "institutional_config_options": {
+      "title": "Institutional config options",
+      "type": "object",
+      "fa_icon": "fas fa-university",
+      "description": "Parameters used to describe centralised config profiles. These should not be edited.",
+      "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.",
+      "properties": {
+        "custom_config_version": {
+          "type": "string",
+          "description": "Git commit id for Institutional configs.",
+          "default": "master",
+          "hidden": true,
+          "fa_icon": "fas fa-users-cog"
+        },
+        "custom_config_base": {
+          "type": "string",
+          "description": "Base directory for Institutional configs.",
+          "default": "https://raw.githubusercontent.com/nf-core/configs/master",
+          "hidden": true,
+          "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.",
+          "fa_icon": "fas fa-users-cog"
+        },
+        "config_profile_name": {
+          "type": "string",
+          "description": "Institutional config name.",
+          "hidden": true,
+          "fa_icon": "fas fa-users-cog"
+        },
+        "config_profile_description": {
+          "type": "string",
+          "description": "Institutional config description.",
+          "hidden": true,
+          "fa_icon": "fas fa-users-cog"
+        },
+        "config_profile_contact": {
+          "type": "string",
+          "description": "Institutional config contact information.",
+          "hidden": true,
+          "fa_icon": "fas fa-users-cog"
+        },
+        "config_profile_url": {
+          "type": "string",
+          "description": "Institutional config URL link.",
+          "hidden": true,
+          "fa_icon": "fas fa-users-cog"
+        }
+      }
+    },
+    "max_job_request_options": {
+      "title": "Max job request options",
+      "type": "object",
+      "fa_icon": "fab fa-acquisitions-incorporated",
+      "description": "Set the top limit for requested resources for any single job.",
+      "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.",
+      "properties": {
+        "max_cpus": {
+          "type": "integer",
+          "description": "Maximum number of CPUs that can be requested for any single job.",
+          "default": 16,
+          "fa_icon": "fas fa-microchip",
+          "hidden": true,
+          "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`"
+        },
+        "max_memory": {
+          "type": "string",
+          "description": "Maximum amount of memory that can be requested for any single job.",
+          "default": "128.GB",
+          "fa_icon": "fas fa-memory",
+          "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$",
+          "hidden": true,
+          "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`"
+        },
+        "max_time": {
+          "type": "string",
+          "description": "Maximum amount of time that can be requested for any single job.",
+          "default": "240.h",
+          "fa_icon": "far fa-clock",
+          "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$",
+          "hidden": true,
+          "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`"
+        }
+      }
+    },
+    "control_flow": {
+      "title": "Control Flow",
+      "type": "object",
+      "description": "",
+      "default": "",
+      "fa_icon": "fas fa-toggle-on",
+      "properties": {
+        "cliquesnv": {
+          "type": "boolean",
+          "default": true
+        },
+        "haplink": {
+          "type": "boolean",
+          "default": true
+        },
+        "quasirecomb": {
+          "type": "boolean",
+          "default": true
+        },
+        "shorah": {
+          "type": "boolean",
+          "default": true
+        },
+        "viquas": {
+          "type": "boolean",
+          "default": true
+        }
+      }
+    }
+  },
+  "allOf": [
+    {
+      "$ref": "#/definitions/input_output_options"
+    },
+    {
+      "$ref": "#/definitions/institutional_config_options"
+    },
+    {
+      "$ref": "#/definitions/max_job_request_options"
+    },
+    {
+      "$ref": "#/definitions/control_flow"
+    }
+  ]
+}
From 93bd5cedec451ec13286a226141bf207eb7de57f Mon Sep 17 00:00:00 2001
From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com>
Date: Wed, 18 Oct 2023 21:40:08 -0500
Subject: [PATCH 05/11] chore: Replace results dir with parameterized outdir

---
 modules/cliquesnv/main.nf          | 2 +-
 modules/efetch/main.nf             | 2 +-
 modules/haplink/haplotypes/main.nf | 2 +-
 modules/haplink/sequences/main.nf  | 2 +-
 modules/haplink/variants/main.nf   | 2 +-
 modules/minimap2/main.nf           | 2 +-
 modules/quasirecomb/main.nf        | 2 +-
 modules/shorah/amplicon/main.nf    | 2 +-
 modules/shorah/shotgun/main.nf     | 2 +-
 modules/viquas/main.nf             | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/modules/cliquesnv/main.nf b/modules/cliquesnv/main.nf
index da197e7..bcccabf 100644
--- a/modules/cliquesnv/main.nf
+++ b/modules/cliquesnv/main.nf
@@ -13,7 +13,7 @@ process CLIQUESNV {
     tuple val(prefix), path("*.json")
     tuple val(prefix), path("*.fasta")
 
-    publishDir "results/${task.process}", mode: 'copy'
+    publishDir "${params.outdir}/${task.process}", mode: 'copy'
 
     when:
     task.ext.when == null || task.ext.when
diff --git a/modules/efetch/main.nf b/modules/efetch/main.nf
index f3ce0b3..3fb7e9a 100644
--- a/modules/efetch/main.nf
+++ b/modules/efetch/main.nf
@@ -7,7 +7,7 @@ process EFETCH {
     input:
     val(genome)
 
-    publishDir "results", mode: 'copy'
+    publishDir "${params.outdir}", mode: 'copy'
 
     output:
     path 'reference.fasta'
diff --git a/modules/haplink/haplotypes/main.nf b/modules/haplink/haplotypes/main.nf
index dc84ef8..dedc30b 100644
--- a/modules/haplink/haplotypes/main.nf
+++ b/modules/haplink/haplotypes/main.nf
@@ -12,7 +12,7 @@ process HAPLINK_HAPLOTYPES {
     output:
     tuple val(prefix), path("*.yaml")
 
-    publishDir "results/${task.process}", mode: 'copy'
+    publishDir "${params.outdir}/${task.process}", mode: 'copy'
 
     when:
     task.ext.when == null || task.ext.when
diff --git a/modules/haplink/sequences/main.nf b/modules/haplink/sequences/main.nf
index ed6f6a6..77801cd 100644
--- a/modules/haplink/sequences/main.nf
+++ b/modules/haplink/sequences/main.nf
@@ -12,7 +12,7 @@ process HAPLINK_SEQUENCES {
     output:
     tuple val(prefix), val(method), path("*.fasta")
 
-    publishDir "results/${method}", mode: 'copy'
+    publishDir "${params.outdir}/${method}", mode: 'copy'
 
     when:
     task.ext.when == null || task.ext.when
diff --git a/modules/haplink/variants/main.nf b/modules/haplink/variants/main.nf
index f467f77..995edfa 100644
--- a/modules/haplink/variants/main.nf
+++ b/modules/haplink/variants/main.nf
@@ -12,7 +12,7 @@ process HAPLINK_VARIANTS {
     output:
     tuple val(prefix), path("*.vcf")
 
-    publishDir "results/${task.process}", mode: 'copy'
+    publishDir "${params.outdir}/${task.process}", mode: 'copy'
 
     when:
     task.ext.when == null || task.ext.when
diff --git a/modules/minimap2/main.nf b/modules/minimap2/main.nf
index 7d3c2aa..4b39349 100644
--- a/modules/minimap2/main.nf
+++ b/modules/minimap2/main.nf
@@ -8,7 +8,7 @@ process MINIMAP2 {
     tuple val(prefix), path(reads)
     path reference
 
-    publishDir "results", mode: 'copy'
+    publishDir "${params.outdir}", mode: 'copy'
 
     output:
     tuple val(prefix), path("*.bam"), path("*.bam.bai")
diff --git a/modules/quasirecomb/main.nf b/modules/quasirecomb/main.nf
index 8e054cf..824a47a 100644
--- a/modules/quasirecomb/main.nf
+++ b/modules/quasirecomb/main.nf
@@ -11,7 +11,7 @@ process QUASIRECOMB {
     output:
     tuple val(prefix), path("*.fasta")
 
-    publishDir "results/${task.process}", mode: 'copy'
+    publishDir "${params.outdir}/${task.process}", mode: 'copy'
 
     when:
     task.ext.when == null || task.ext.when
diff --git a/modules/shorah/amplicon/main.nf b/modules/shorah/amplicon/main.nf
index f37d70c..3c8e373 100644
--- a/modules/shorah/amplicon/main.nf
+++ b/modules/shorah/amplicon/main.nf
@@ -13,7 +13,7 @@ process SHORAH_AMPLICON {
     tuple val(prefix), path("*.vcf")
     tuple val(prefix), path("*support.fas")
 
-    publishDir "results/${task.process}", mode: 'copy'
+    publishDir "${params.outdir}/${task.process}", mode: 'copy'
 
     when:
     task.ext.when == null || task.ext.when
diff --git a/modules/shorah/shotgun/main.nf b/modules/shorah/shotgun/main.nf
index 2b7aa86..0fb73cb 100644
--- a/modules/shorah/shotgun/main.nf
+++ b/modules/shorah/shotgun/main.nf
@@ -13,7 +13,7 @@ process SHORAH_SHOTGUN {
     tuple val(prefix), path("*.vcf")
     tuple val(prefix), path("*support.fas")
 
-    publishDir "results/${task.process}", mode: 'copy'
+    publishDir "${params.outdir}/${task.process}", mode: 'copy'
 
     when:
     task.ext.when == null || task.ext.when
diff --git a/modules/viquas/main.nf b/modules/viquas/main.nf
index ad19953..440161b 100644
--- a/modules/viquas/main.nf
+++ b/modules/viquas/main.nf
@@ -12,7 +12,7 @@ process VIQUAS {
     output:
     tuple val(prefix), path("*.fa")
 
-    publishDir "results/${task.process}", mode: 'copy'
+    publishDir "${params.outdir}/${task.process}", mode: 'copy'
 
     when:
     task.ext.when == null || task.ext.when
From c55a854345b1d2f1b8f3f4dbd5a09e9fb5e2a122 Mon Sep 17 00:00:00 2001
From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com>
Date: Wed, 18 Oct 2023 21:45:52 -0500
Subject: [PATCH 06/11] feat: Add input schema

---
Review notes (ignored by git am):
- NOTE(review): "$id" in assets/schema_input.json points at
  nf-core/taxprofiler while the title names the "haplotyper battle royale
  pipeline" - looks copied from another project; confirm the intended URL.
- This patch also drops the "pattern" on the "reference" parameter in
  nextflow_schema.json, which the subject line does not mention.

 assets/schema_input.json | 23 +++++++++++++++++++++++
 nextflow_schema.json     |  3 +--
 2 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 assets/schema_input.json

diff --git a/assets/schema_input.json b/assets/schema_input.json
new file mode 100644
index 0000000..80cf68b
--- /dev/null
+++ b/assets/schema_input.json
@@ -0,0 +1,23 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema",
+  "$id": "https://raw.githubusercontent.com/nf-core/taxprofiler/master/assets/schema_input.json",
+  "title": "haplotyper battle royale pipeline - params.input schema",
+  "description": "Schema for the file provided with params.input",
+  "type": "array",
+  "items": {
+    "type": "object",
+    "properties": {
+      "sample": {
+        "type": "string",
+        "pattern": "^\\S+$",
+        "errorMessage": "Sample name must be provided and cannot contain spaces"
+      },
+      "fastq_1": {
+        "type": "string",
+        "pattern": "^\\S+\\.f(ast)?q\\.gz$",
+        "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+      }
+    },
+    "required": ["sample", "fastq_1"]
+  }
+}
diff --git a/nextflow_schema.json b/nextflow_schema.json
index d4c5c27..9871a3d 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -25,8 +25,7 @@
         "reference": {
           "type": "string",
           "fa_icon": "fas fa-dna",
-          "description": "NCBI accession number of the reference genome to align reads against",
-          "pattern": "^[:upper:]{2}_[:digit:]{6}\\.[:digit:]+$"
+          "description": "NCBI accession number of the reference genome to align reads against"
         },
         "outdir": {
           "type": "string",
From eddf9d21f6817f9ea9b96a3c77d1067bbac4afe7 Mon Sep 17 00:00:00 2001
From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com>
Date: Wed, 18 Oct 2023 21:56:44 -0500
Subject: [PATCH 07/11] refactor: Change hard-coded reference into parameter

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 045da7b..81346f4 100755
--- a/main.nf
+++ b/main.nf
@@ -20,7 +20,7 @@ workflow {
         .map { file -> tuple(file.simpleName, file) }
         .set { ch_input }
 
-    EFETCH('NC_036618.1')
+    EFETCH("${params.reference}")
     EFETCH
         .out
         .set { ch_reference }
From 77535ee726806e82e6547f2bc1db52ce9df9a87f Mon Sep 17 00:00:00 2001
From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com>
Date: Wed, 18 Oct 2023 22:08:27 -0500
Subject: [PATCH 08/11] feat: Add parameter validation

---
 main.nf | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/main.nf b/main.nf
index 81346f4..17096eb 100755
--- a/main.nf
+++ b/main.nf
@@ -1,5 +1,7 @@
 #!/usr/bin/env nextflow
 
+include { validateParameters } from 'plugin/nf-validation'
+
 include { CLIQUESNV } from './modules/cliquesnv'
 include { EFETCH } from './modules/efetch'
 include { HAPLINK_HAPLOTYPES as HAPLINK_ML_HAPLOTYPES } from './modules/haplink/haplotypes'
@@ -15,6 +17,8 @@ include { VIQUAS } from './modules/viquas'
 
 workflow {
 
+    validateParameters()
+
     Channel
         .fromPath("*.fastq.gz")
         .map { file -> tuple(file.simpleName, file) }
From 7e0f64f78b3620dd0f6542aaeda69781ce664cce Mon Sep 17 00:00:00 2001
From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com>
Date: Wed, 18 Oct 2023 22:11:47 -0500
Subject: [PATCH 09/11] feat: Add input schema to parameter schema

---
Review notes (ignored by git am):
- The help_text context line reads "2 columns" so that this hunk still
  applies on top of the corrected PATCH 04.

 nextflow_schema.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 9871a3d..ddc1c15 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -20,7 +20,8 @@
           "pattern": "^\\S+\\.csv$",
           "description": "Path to comma-separated file containing information about the samples in the experiment.",
           "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 2 columns, and a header row.",
-          "fa_icon": "fas fa-file-csv"
+          "fa_icon": "fas fa-file-csv",
+          "schema": "assets/schema_input.json"
         },
         "reference": {
           "type": "string",
From 9f70b504e29deca2c1524ce4da810f60906945e5 Mon Sep 17 00:00:00 2001
From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com>
Date: Thu, 19 Oct 2023 11:23:41 -0500
Subject: [PATCH 10/11] feat: Add fasterq-dump module

---
Review notes (ignored by git am):
- Added "set -o pipefail" as the first script line. The command pipes
  fasterq-dump into gzip, and without pipefail the pipeline's exit status is
  gzip's alone, so a failed download could be published as a valid (empty or
  truncated) .fastq.gz. Harmless if the process shell already enables it.
- Hunk header and diffstat updated from 25 to 26 added lines.

 modules/fasterq_dump/main.nf | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 modules/fasterq_dump/main.nf

diff --git a/modules/fasterq_dump/main.nf b/modules/fasterq_dump/main.nf
new file mode 100644
index 0000000..560d113
--- /dev/null
+++ b/modules/fasterq_dump/main.nf
@@ -0,0 +1,26 @@
+process FASTERQ_DUMP {
+    tag "${prefix} (${sra})"
+    label 'process_single'
+
+    container 'quay.io/biocontainers/sra-tools:3.0.8--h9f5acd7_0'
+
+    input:
+    tuple val(prefix), val(sra)
+
+    output:
+    tuple val(prefix), path("*.fastq.gz")
+
+    publishDir "${params.outdir}/${task.process}", mode: 'copy'
+
+    script:
+    """
+    set -o pipefail
+    fasterq-dump \\
+        --verbose \\
+        --concatenate-reads \\
+        --stdout \\
+        ${sra} \\
+        | gzip \\
+        > "${prefix}.fastq.gz"
+    """
+}
From 3e411cc6888cd926f8f2fe6c3c46128d284c5e04 Mon Sep 17 00:00:00 2001
From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com>
Date: Thu, 19 Oct 2023 11:38:43 -0500
Subject: [PATCH 11/11] refactor: Switch to SRA direct download for samples

---
 assets/schema_input.json |  8 ++++----
 main.nf                  | 11 ++++++++---
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/assets/schema_input.json b/assets/schema_input.json
index 80cf68b..fbcd887 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -12,12 +12,12 @@
         "pattern": "^\\S+$",
         "errorMessage": "Sample name must be provided and cannot contain spaces"
       },
-      "fastq_1": {
+      "sra": {
         "type": "string",
-        "pattern": "^\\S+\\.f(ast)?q\\.gz$",
-        "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+        "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(GS[EM])|(syn))(\\d+)$",
+        "errorMessage": "Please provide a valid SRA, ENA, DDBJ or GEO identifier"
       }
     },
-    "required": ["sample", "fastq_1"]
+    "required": ["sample", "sra"]
   }
 }
diff --git a/main.nf b/main.nf
index 17096eb..1510ff6 100755
--- a/main.nf
+++ b/main.nf
@@ -1,9 +1,10 @@
 #!/usr/bin/env nextflow
 
-include { validateParameters } from 'plugin/nf-validation'
+include { validateParameters; fromSamplesheet } from 'plugin/nf-validation'
 
 include { CLIQUESNV } from './modules/cliquesnv'
 include { EFETCH } from './modules/efetch'
+include { FASTERQ_DUMP } from './modules/fasterq_dump'
 include { HAPLINK_HAPLOTYPES as HAPLINK_ML_HAPLOTYPES } from './modules/haplink/haplotypes'
 include { HAPLINK_HAPLOTYPES as HAPLINK_RAW_HAPLOTYPES } from './modules/haplink/haplotypes'
 include { HAPLINK_SEQUENCES } from './modules/haplink/sequences'
@@ -20,8 +21,12 @@ workflow {
     validateParameters()
 
     Channel
-        .fromPath("*.fastq.gz")
-        .map { file -> tuple(file.simpleName, file) }
+        .fromSamplesheet("input")
+        .set { ch_sras }
+
+    FASTERQ_DUMP( ch_sras )
+    FASTERQ_DUMP
+        .out
         .set { ch_input }
 
     EFETCH("${params.reference}")