From 5f3eee9a4a2115c5b7e55fdddedad39c540d2cfb Mon Sep 17 00:00:00 2001
From: Rafal Stepien
Date: Tue, 16 Aug 2022 15:46:22 -0400
Subject: [PATCH] Add working version of PEP-nf-core integration

---
 README.md                                     |   6 +-
 assets/samplesheet_schema.yaml                |  55 ++++
 bin/check_samplesheet.py                      | 236 ------------------
 bin/detect_reads.py                           | 125 ++++++++++
 conf/test.config                              |   6 +
 conf/test_pep.config                          |  50 ++++
 docs/usage.md                                 |   3 +
 lib/WorkflowMain.groovy                       |   2 +-
 modules.json                                  |   8 +
 modules/local/samplesheet_check.nf            |   6 +-
 modules/nf-core/modules/eido/convert/main.nf  |  37 +++
 modules/nf-core/modules/eido/convert/meta.yml |  36 +++
 modules/nf-core/modules/eido/validate/main.nf |  32 +++
 .../nf-core/modules/eido/validate/meta.yml    |  38 +++
 nextflow.config                               |   2 +
 nextflow_schema.json                          |   7 +-
 subworkflows/local/input_check.nf             |   9 +-
 workflows/taxprofiler.nf                      |  23 +-
 18 files changed, 434 insertions(+), 247 deletions(-)
 create mode 100644 assets/samplesheet_schema.yaml
 delete mode 100755 bin/check_samplesheet.py
 create mode 100644 bin/detect_reads.py
 create mode 100644 conf/test_pep.config
 create mode 100644 modules/nf-core/modules/eido/convert/main.nf
 create mode 100644 modules/nf-core/modules/eido/convert/meta.yml
 create mode 100644 modules/nf-core/modules/eido/validate/main.nf
 create mode 100644 modules/nf-core/modules/eido/validate/meta.yml

diff --git a/README.md b/README.md
index 672c2a9..5f4c886 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,11 @@ On release, automated continuous integration tests run the pipeline on a full-si
 >   - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs.
 >   - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs.
 
-4. Start running your own analysis!
+4. You can also run the pipeline with a PEP file as input by running the following command:
+
+   ```console
+   nextflow run main.nf -profile test_pep,docker --outdir <OUTDIR>
+   ```
+
+5. Start running your own analysis!
 
    ```console
    nextflow run nf-core/taxprofiler --input samplesheet.csv --databases database.csv --outdir <OUTDIR> --run_<TOOL1> --run_<TOOL2> -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
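For readers unfamiliar with PEP (Portable Encapsulated Projects): the `--pep` input is not the samplesheet CSV itself but a small YAML project configuration that points at a sample table. A minimal sketch of such a configuration (file names hypothetical; the version pin matches the `2.1.0` PEP schema imported by `assets/samplesheet_schema.yaml` below):

```yaml
# config.yaml -- a minimal PEP project configuration
pep_version: 2.1.0
sample_table: samplesheet.csv # one row per sample/run; columns validated by the schema below
```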
+ pattern: "^[\\S]+.(fq\\.gz|fastq\\.gz)$" + fastq2: + type: ["string", "null"] + description: "FASTQ file for read 2." + pattern: "^[\\S]+.(fq\\.gz|fastq\\.gz)$" + fasta: + type: ["string", "null"] + description: "Path to FASTA file." + pattern: "^[\\S]+.(fa\\.gz|fasta\\.gz)$" + required: + - sample + - run_accession + - instrument_platform + files: + - fastq1 + - fastq2 + - fasta +required: + - samples diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py deleted file mode 100755 index ca54ed9..0000000 --- a/bin/check_samplesheet.py +++ /dev/null @@ -1,236 +0,0 @@ -#!/usr/bin/env python - -from distutils import extension -import os -import sys -import errno -import argparse - - -def parse_args(args=None): - Description = "Reformat nf-core/taxprofiler samplesheet file and check its contents." - - Epilog = "Example usage: python check_samplesheet.py " - - parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument("FILE_IN", help="Input samplesheet file.") - parser.add_argument("FILE_OUT", help="Output file.") - return parser.parse_args(args) - - -def make_dir(path): - if len(path) > 0: - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise exception - - -def print_error(error, context="Line", context_str=""): - error_str = "ERROR: Please check samplesheet -> {}".format(error) - if context != "" and context_str != "": - error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( - error, context.strip(), context_str.strip() - ) - print(error_str) - sys.exit(1) - - -def check_samplesheet(file_in, file_out): - """ - This function checks that the samplesheet follows the following structure: - - sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta - 2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz - 2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz, - 2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,, - 2613,ERR5766181,ILLUMINA,ERX5474937_ERR5766181_1.fastq.gz,ERX5474937_ERR5766181_2.fastq.gz, - """ - - FQ_EXTENSIONS = (".fq.gz", ".fastq.gz") - FA_EXTENSIONS = ( - ".fa", - ".fa.gz", - ".fasta", - ".fasta.gz", - ".fna", - ".fna.gz", - ".fas", - ".fas.gz", - ) - INSTRUMENT_PLATFORMS = [ - "ABI_SOLID", - "BGISEQ", - "CAPILLARY", - "COMPLETE_GENOMICS", - "DNBSEQ", - "HELICOS", - "ILLUMINA", - "ION_TORRENT", - "LS454", - "OXFORD_NANOPORE", - "PACBIO_SMRT", - ] - - sample_mapping_dict = {} - with open(file_in, "r") as fin: - - ## Check header - MIN_COLS = 4 - HEADER = [ - "sample", - "run_accession", - "instrument_platform", - "fastq_1", - "fastq_2", - "fasta", - ] - header = [x.strip('"') for x in fin.readline().strip().split(",")] - - ## Check for missing mandatory columns - missing_columns = list(set(HEADER) - set(header)) - if len(missing_columns) > 0: - print( - "ERROR: Missing required column header -> {}. Note some columns can otherwise be empty. 
See pipeline documentation (https://nf-co.re/taxprofiler/usage).".format( - ",".join(missing_columns) - ) - ) - sys.exit(1) - - ## Find locations of mandatory columns - header_locs = {} - for i in HEADER: - header_locs[i] = header.index(i) - - ## Check sample entries - for line in fin: - - ## Pull out only relevant columns for downstream checking - line_parsed = [x.strip().strip('"') for x in line.strip().split(",")] - lspl = [line_parsed[i] for i in header_locs.values()] - - # Check valid number of columns per row - if len(lspl) < len(HEADER): - print_error( - "Invalid number of columns (minimum = {})!".format(len(HEADER)), - "Line", - line, - ) - num_cols = len([x for x in lspl if x]) - if num_cols < MIN_COLS: - print_error( - "Invalid number of populated columns (minimum = {})!".format(MIN_COLS), - "Line", - line, - ) - - ## Check sample name entries - - ( - sample, - run_accession, - instrument_platform, - fastq_1, - fastq_2, - fasta, - ) = lspl[: len(HEADER)] - sample = sample.replace(" ", "_") - if not sample: - print_error("Sample entry has not been specified!", "Line", line) - - ## Check FastQ file extension - for fastq in [fastq_1, fastq_2]: - if fastq: - if fastq.find(" ") != -1: - print_error("FastQ file contains spaces!", "Line", line) - if not fastq.endswith(FQ_EXTENSIONS): - print_error( - f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !", - "Line", - line, - ) - if fasta: - if fasta.find(" ") != -1: - print_error("FastA file contains spaces!", "Line", line) - if not fasta.endswith(FA_EXTENSIONS): - print_error( - f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!", - "Line", - line, - ) - sample_info = [] - - # Check run_accession - if not run_accession: - print_error("Run accession has not been specified!", "Line", line) - else: - sample_info.append(run_accession) - - # Check instrument_platform - if not instrument_platform: - print_error("Instrument platform has not been specified!", "Line", line) - else: - if instrument_platform not in INSTRUMENT_PLATFORMS: - print_error( - f"Instrument platform {instrument_platform} is not supported!", - f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}", - "Line", - line, - ) - sample_info.append(instrument_platform) - - ## Auto-detect paired-end/single-end - if sample and fastq_1 and fastq_2: ## Paired-end short reads - sample_info.extend(["0", fastq_1, fastq_2, fasta]) - elif sample and fastq_1 and not fastq_2: ## Single-end short/long fastq reads - sample_info.extend(["1", fastq_1, fastq_2, fasta]) - elif sample and fasta and not fastq_1 and not fastq_2: ## Single-end long reads - sample_info.extend(["1", fastq_1, fastq_2, fasta]) - elif fasta and (fastq_1 or fastq_2): - print_error( - "FastQ and FastA files cannot be specified together in the same library!", - "Line", - line, - ) - else: - print_error("Invalid combination of columns provided!", "Line", line) - - ## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, single_end, fastq_1, fastq_2 , fasta ] } - if sample not in sample_mapping_dict: - sample_mapping_dict[sample] = [sample_info] - else: - if sample_info in sample_mapping_dict[sample]: - print_error("Samplesheet contains duplicate rows!", "Line", line) - else: - sample_mapping_dict[sample].append(sample_info) - - ## Write validated samplesheet with appropriate columns - HEADER_OUT = [ - "sample", - "run_accession", - "instrument_platform", - "single_end", - "fastq_1", - "fastq_2", - "fasta", - ] - if len(sample_mapping_dict) > 0: - 
out_dir = os.path.dirname(file_out)
-        make_dir(out_dir)
-        with open(file_out, "w") as fout:
-            fout.write(",".join(HEADER_OUT) + "\n")
-            for sample in sorted(sample_mapping_dict.keys()):
-                for idx, val in enumerate(sample_mapping_dict[sample]):
-                    fout.write(f"{sample},{','.join(val)}\n")
-    else:
-        print_error("No entries to process!", "Samplesheet: {}".format(file_in))
-
-
-def main(args=None):
-    args = parse_args(args)
-    check_samplesheet(args.FILE_IN, args.FILE_OUT)
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/bin/detect_reads.py b/bin/detect_reads.py
new file mode 100644
index 0000000..8a1430e
--- /dev/null
+++ b/bin/detect_reads.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+
+import argparse
+import csv
+import sys
+from typing import List
+
+
+def parse_args(args=None) -> argparse.Namespace:
+    """
+    Reformatting is based on detecting whether the reads are paired- or single-end.
+    The script appends the appropriate column to the samplesheet.csv file.
+    """
+    Description = "Reformat nf-core/taxprofiler samplesheet file."
+    Epilog = "Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>"
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("FILE_IN", help="Input samplesheet file.")
+    parser.add_argument("FILE_OUT", help="Output file.")
+    return parser.parse_args(args)
+
+
+class ReadsModifier:
+    def __init__(self):
+        self.headers = None
+        self.sample_index = None
+        self.fastq_1_index = None
+        self.fastq_2_index = None
+        self.fasta_index = None
+
+    def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> None:
+        NEW_COLUMN_NAME = "single_end"
+        new_file_rows = []
+
+        with open(input_file_path, "r") as input_file:
+            csv_reader = csv.reader(input_file, delimiter=",")
+            self.headers = next(csv_reader)
+            self.headers.append(NEW_COLUMN_NAME)
+
+            self._infer_column_indexes()
+
+            for samplesheet_row in csv_reader:
+
+                if self._is_paired_end_short_read(samplesheet_row):
+                    new_file_rows.append([*samplesheet_row, "0"])
+
+                elif self._is_single_end_short_long_read(samplesheet_row):
+                    new_file_rows.append([*samplesheet_row, "1"])
+
+                elif self._is_single_end_long_read(samplesheet_row):
+                    new_file_rows.append([*samplesheet_row, "1"])
+
+                elif self._is_error_row(samplesheet_row):
+                    self.print_error(
+                        "FastQ and FastA files cannot be specified together in the same library!",
+                        "Line",
+                        ",".join(samplesheet_row),
+                    )
+                else:
+                    self.print_error("Invalid combination of columns provided!", "Line", ",".join(samplesheet_row))
+
+        self.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)
+
+    def _get_row_values(self, samplesheet_row):
+        """
+        This method extracts data from the columns for a given row of the samplesheet
+        table, based on previously inferred column indexes.
+        """
+        sample = samplesheet_row[self.sample_index]
+        fastq_1 = samplesheet_row[self.fastq_1_index] if self.fastq_1_index is not None else None
+        fastq_2 = samplesheet_row[self.fastq_2_index] if self.fastq_2_index is not None else None
+        fasta = samplesheet_row[self.fasta_index] if self.fasta_index is not None else None
+        return sample, fastq_1, fastq_2, fasta
+
+    def _infer_column_indexes(self):
+        """
+        This method infers the indexes of the necessary columns from the samplesheet header.
+        """
+        self.sample_index = self.headers.index("sample")
+        self.fastq_1_index = self.headers.index("fastq_1") if "fastq_1" in self.headers else None
+        self.fastq_2_index = self.headers.index("fastq_2") if "fastq_2" in self.headers else None
+        self.fasta_index = self.headers.index("fasta") if "fasta" in self.headers else None
+
+    def _is_paired_end_short_read(self, samplesheet_row: List) -> bool:
+        sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
+        return bool(sample and fastq_1 and fastq_2)
+
+    def _is_single_end_short_long_read(self, samplesheet_row: List) -> bool:
+        sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
+        return bool(sample and fastq_1 and not fastq_2)
+
+    def _is_single_end_long_read(self, samplesheet_row: List) -> bool:
+        sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
+        return bool(sample and fasta and not fastq_1 and not fastq_2)
+
+    def _is_error_row(self, samplesheet_row: List) -> bool:
+        sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
+        return bool(fasta and (fastq_1 or fastq_2))
+
+    @staticmethod
+    def print_error(error: str, context: str = "Line", context_str: str = ""):
+        error_str = "ERROR: Please check samplesheet -> {}".format(error)
+        if context != "" and context_str != "":
+            error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
+                error, context.strip(), context_str.strip()
+            )
+        print(error_str)
+        sys.exit(1)
+
+    @staticmethod
+    def save_reformatted_samplesheet(new_file_rows: List[List], output_file_path: str) -> None:
+        """
+        Write the new samplesheet.
+        """
+        with open(output_file_path, "w", newline="") as output_file:
+            csv.writer(output_file).writerows(new_file_rows)
+
+
+def main(args=None):
+    args = parse_args(args)
+    ReadsModifier().detect_reads_and_reformat(args.FILE_IN, args.FILE_OUT)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
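For context, this is the transformation the script performs on the eido-converted samplesheet, sketched with hypothetical file names (run accessions reused from the docstring examples above):

```
# input (as produced by EIDO_CONVERT)
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
2612,ERR5766176,ILLUMINA,s1_R1.fastq.gz,s1_R2.fastq.gz,
2612,ERR5766180,ILLUMINA,s2_R1.fastq.gz,,

# output (single_end appended: "0" = paired-end, "1" = single-end)
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta,single_end
2612,ERR5766176,ILLUMINA,s1_R1.fastq.gz,s1_R2.fastq.gz,,0
2612,ERR5766180,ILLUMINA,s2_R1.fastq.gz,,,1
```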
+ """ + sample = samplesheet_row[self.sample_index] + fastq_1 = samplesheet_row[self.fastq_1_index] if self.fastq_1_index else None + fastq_2 = samplesheet_row[self.fastq_2_index] if self.fastq_2_index else None + fasta = samplesheet_row[self.fasta_index] if self.fasta_index else None + return sample, fastq_1, fastq_2, fasta + + def _infer_column_indexes(self): + """ + This method infers indexes of necessary columns from samplesheet table + """ + self.sample_index = self.headers.index("sample") + self.fastq_1_index = self.headers.index("fastq_1") if "fastq_1" in self.headers else None + self.fastq_2_index = self.headers.index("fastq_2") if "fastq_2" in self.headers else None + self.fasta_index = self.headers.index("fasta") if "fasta" in self.headers else None + + def _is_paired_end_short_read(self, samplesheet_row: List) -> bool: + sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row) + return sample and fastq_1 and fastq_2 + + def _is_single_end_short_long_read(self, samplesheet_row: List) -> bool: + sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row) + return sample and fastq_1 and not fastq_2 + + def _is_single_end_long_read(self, samplesheet_row: List) -> bool: + sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row) + return sample and fasta and not fastq_1 and not fastq_2 + + def _is_error_row(self, samplesheet_row: List) -> bool: + sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row) + return fasta and (fastq_1 or fastq_2) + + @staticmethod + def print_error(error: str, context: str = "Line", context_str: str = ""): + error_str = "ERROR: Please check samplesheet -> {}".format(error) + if context != "" and context_str != "": + error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + print(error_str) + sys.exit(1) + + @staticmethod + def save_reformatted_samplesheet(new_file_rows: List[List], output_file_path: str) -> NoReturn: + """ + Write new samplesheet. + """ + with open(output_file_path, "w") as output_file: + csv.writer(output_file).writerows(new_file_rows) + + +def main(args=None): + args = parse_args(args) + ReadsModifier().detect_reads_and_reformat(args.FILE_IN, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/conf/test.config b/conf/test.config index a39a107..d5dcd67 100644 --- a/conf/test.config +++ b/conf/test.config @@ -57,4 +57,10 @@ process { withName: MEGAN_RMA2INFO_KRONA { maxForks = 1 } + withName: 'EIDO_VALIDATE' { + ext.args = '--st-index sample' + } + withName: 'EIDO_CONVERT' { + ext.args = '--st-index sample' + } } diff --git a/conf/test_pep.config b/conf/test_pep.config new file mode 100644 index 0000000..e3428fd --- /dev/null +++ b/conf/test_pep.config @@ -0,0 +1,50 @@ +params { + config_profile_name = 'Test PEP profile' + config_profile_description = 'Minimal test dataset to check pipeline function with PEP file as an input.' 
diff --git a/conf/test_pep.config b/conf/test_pep.config
new file mode 100644
index 0000000..e3428fd
--- /dev/null
+++ b/conf/test_pep.config
@@ -0,0 +1,50 @@
+params {
+    config_profile_name        = 'Test PEP profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function with a PEP file as input.'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input                              = null
+    pep                                = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/pep/test_pep_format_files/config.yaml'
+    databases                          = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
+    perform_shortread_qc               = true
+    perform_longread_qc                = true
+    perform_shortread_complexityfilter = true
+    perform_shortread_hostremoval      = true
+    perform_longread_hostremoval       = true
+    perform_runmerging                 = true
+    hostremoval_reference              = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
+    run_kaiju                          = true
+    run_kraken2                        = true
+    run_malt                           = true
+    run_metaphlan3                     = true
+    run_centrifuge                     = true
+    run_diamond                        = true
+    run_motus                          = false
+    run_krona                          = true
+    krona_taxonomy_directory           = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab'
+    malt_save_reads                    = true
+    kraken2_save_reads                 = true
+    centrifuge_save_reads              = true
+    diamond_save_reads                 = true
+}
+
+
+process {
+    withName: MALT_RUN {
+        maxForks = 1
+    }
+    withName: MEGAN_RMA2INFO {
+        maxForks = 1
+    }
+    withName: 'EIDO_VALIDATE' {
+        ext.args = '--st-index sample'
+    }
+    withName: 'EIDO_CONVERT' {
+        ext.args = '--st-index sample'
+    }
+}
diff --git a/docs/usage.md b/docs/usage.md
index 8ae5257..cd4b749 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -277,6 +277,9 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof
 - `test`
   - A profile with a complete configuration for automated testing
   - Includes links to test data so needs no other parameters
+- `test_pep`
+  - A profile with a complete configuration for running the pipeline with a PEP file as input
+  - Includes links to test data so needs no other parameters
 
 ### `-resume`
 
diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy
index c90cf49..b9dd514 100755
--- a/lib/WorkflowMain.groovy
+++ b/lib/WorkflowMain.groovy
@@ -74,7 +74,7 @@ class WorkflowMain {
         NfcoreTemplate.awsBatch(workflow, params)
 
         // Check input has been provided
-        if (!params.input) {
-            log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'"
+        if (!params.input && !params.pep) {
+            log.error "Please provide an input samplesheet to the pipeline, e.g. '--input samplesheet.csv', or a PEP project with '--pep'"
             System.exit(1)
         }
     }
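With this guard, either parameter satisfies the input check, so both invocation styles are accepted (paths illustrative):

```console
nextflow run nf-core/taxprofiler --input samplesheet.csv --databases database.csv --outdir <OUTDIR> -profile docker
nextflow run nf-core/taxprofiler --pep project/config.yaml --databases database.csv --outdir <OUTDIR> -profile docker
```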
'--input samplesheet.csv'" System.exit(1) } diff --git a/modules.json b/modules.json index 385149a..feed11f 100644 --- a/modules.json +++ b/modules.json @@ -41,6 +41,14 @@ "branch": "master", "git_sha": "3531824af826c16cd252bc5aa82ae169b244ebaa" }, + "eido/convert": { + "branch": "master", + "git_sha": "c9b29c76869d9713130a13a418c1e8b5aecfb80d" + }, + "eido/validate": { + "branch": "master", + "git_sha": "8c0127e071711cb0a2648a6bdf881637a9d7eadc" + }, "fastp": { "branch": "master", "git_sha": "2c70c1c1951aaf884d2e8d8d9c871db79f7b35aa" diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index dea4362..f048351 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -13,11 +13,9 @@ process SAMPLESHEET_CHECK { path '*.csv' , emit: csv path "versions.yml", emit: versions - script: // This script is bundled with the pipeline, in nf-core/taxprofiler/bin/ + script: // detect_reads.py script is bundled with the pipeline, in nf-core/taxprofiler/bin/ """ - check_samplesheet.py \\ - $samplesheet \\ - samplesheet.valid.csv + python3 $projectDir/bin/detect_reads.py $samplesheet samplesheet_validated.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/modules/eido/convert/main.nf b/modules/nf-core/modules/eido/convert/main.nf new file mode 100644 index 0000000..40cd57a --- /dev/null +++ b/modules/nf-core/modules/eido/convert/main.nf @@ -0,0 +1,37 @@ +process EIDO_CONVERT { + tag '$samplesheet' + label 'process_single' + + conda (params.enable_conda ? "conda-forge::eido=0.1.9" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/eido/0.1.9_cv1/eido_0.1.9_cv1.sif' : + 'biocontainers/eido:0.1.9_cv1' }" + + input: + path samplesheet + val format + + output: + path "versions.yml" , emit: versions + path "${prefix}.${format}" , emit: samplesheet_converted + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "samplesheet_converted" + """ + eido \\ + convert \\ + -f $format \\ + $samplesheet \\ + $args \\ + -p samples=${prefix}.${format} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + eido: \$(echo \$(eido --version 2>&1) | sed 's/^.*eido //;s/ .*//' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/eido/convert/meta.yml b/modules/nf-core/modules/eido/convert/meta.yml new file mode 100644 index 0000000..0cf354a --- /dev/null +++ b/modules/nf-core/modules/eido/convert/meta.yml @@ -0,0 +1,36 @@ +name: "eido_convert" +description: Convert any PEP project or Nextflow samplesheet to any format +keywords: + - eido + - convert + - PEP + - format + - samplesheet +tools: + - "eido": + description: "Convert any PEP project or Nextflow samplesheet to any format" + homepage: "http://eido.databio.org/en/latest/" + documentation: "http://eido.databio.org/en/latest/" + doi: "10.1093/gigascience/giab077" + licence: "BSD-2-Clause" + +input: + - samplesheet: + type: file + description: Nextflow samplesheet or PEP project + pattern: "*.{yaml,yml,csv}" + - format: + type: value + description: Extension of an output file + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - samplesheet_converted: + type: file + description: PEP project or samplesheet converted to csv file + +authors: + - "@rafalstepien" diff --git 
diff --git a/modules/nf-core/modules/eido/validate/main.nf b/modules/nf-core/modules/eido/validate/main.nf
new file mode 100644
index 0000000..bc6a111
--- /dev/null
+++ b/modules/nf-core/modules/eido/validate/main.nf
@@ -0,0 +1,32 @@
+process EIDO_VALIDATE {
+    tag "$samplesheet"
+    label 'process_single'
+
+    conda (params.enable_conda ? "conda-forge::eido=0.1.9" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://containers.biocontainers.pro/s3/SingImgsRepo/eido/0.1.9_cv2/eido_0.1.9_cv2.sif' :
+        'biocontainers/eido:0.1.9_cv2' }"
+
+    input:
+    path samplesheet
+    path schema
+
+    output:
+    path "versions.yml", emit: versions
+    path "*.log"       , emit: log
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args   = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "validation"
+    """
+    eido validate $args $samplesheet -s $schema -e > ${prefix}.log
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        eido: \$(echo \$(eido --version 2>&1) | sed 's/^.*eido //;s/ .*//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/modules/eido/validate/meta.yml b/modules/nf-core/modules/eido/validate/meta.yml
new file mode 100644
index 0000000..962f59e
--- /dev/null
+++ b/modules/nf-core/modules/eido/validate/meta.yml
@@ -0,0 +1,38 @@
+name: "eido_validate"
+description: Validate samplesheet or PEP config against a schema
+keywords:
+  - eido
+  - validate
+  - schema
+  - format
+  - pep
+tools:
+  - "validate":
+      description: "Validate samplesheet or PEP config against a schema."
+      homepage: "http://eido.databio.org/en/latest/"
+      documentation: "http://eido.databio.org/en/latest/"
+      doi: "10.1093/gigascience/giab077"
+      licence: "BSD-2-Clause"
+
+input:
+  - samplesheet:
+      type: file
+      description: Samplesheet or PEP file to be validated
+      pattern: "*.{yaml,yml,csv}"
+  - schema:
+      type: file
+      description: Schema that the samplesheet will be validated against
+      pattern: "*.{yaml,yml}"
+
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - log:
+      type: file
+      description: File containing the validation log.
+      pattern: "*.log"
+
+authors:
+  - "@rafalstepien"
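Analogously, with the default `validation` prefix the validate process resolves to roughly the following call, writing the report to `validation.log` (input file name illustrative):

```console
eido validate --st-index sample config.yaml -s samplesheet_schema.yaml -e > validation.log
```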
+ pattern: "*.log" + +authors: + - "@rafalstepien" diff --git a/nextflow.config b/nextflow.config index df5b90b..f3c7756 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,6 +12,7 @@ params { // TODO nf-core: Specify your pipeline's command line flags // Input options input = null + pep = null // References @@ -227,6 +228,7 @@ profiles { test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' } test_nothing { includeConfig 'conf/test_nothing.config' } test_motus { includeConfig 'conf/test_motus.config' } + test_pep { includeConfig 'conf/test_pep.config' } } diff --git a/nextflow_schema.json b/nextflow_schema.json index eb839ec..e8690f2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,8 +10,13 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "databases", "outdir"], + "required": ["outdir", "databases"], "properties": { + "pep": { + "type": "string", + "format": "file-path", + "pattern": "^\\S+\\.yaml$" + }, "input": { "type": "string", "format": "file-path", diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index eb21b9d..5db1520 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -3,13 +3,18 @@ // include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' +include { EIDO_VALIDATE } from '../../modules/nf-core/modules/eido/validate/main' +include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main' workflow INPUT_CHECK { take: - samplesheet // file: /path/to/samplesheet.csv + samplesheet_or_pep_config // file: /path/to/samplesheet.csv or /path/to/pep/config.yaml + base_dir // file: path to PEP directory main: - parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet ) + EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml") ) + converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv" ) + parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted ) .csv .splitCsv ( header:true, sep:',' ) .branch { diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 223ba15..1f38214 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -17,7 +17,26 @@ def checkPathParamList = [ params.input, params.databases, params.hostremoval_re for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters -if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.input) { + ch_input = file(params.input) + ch_input_basedir = [] + +} else if (params.pep) { + + if ( params.pep.startsWith("http://") || params.pep.startsWith("https://") ) { + ch_input = file(params.pep) + ch_input_basedir = [] + } + + else { + ch_input = file(params.pep) + ch_input_basedir = new File(params.pep).getParent() + } + +} else { + exit 1, 'Input samplesheet or PEP config not specified!' +} + if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } if (params.shortread_qc_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." 
@@ -98,7 +117,7 @@ workflow TAXPROFILER {
         SUBWORKFLOW: Read in samplesheet, validate and stage input files
     */
     INPUT_CHECK (
-        ch_input
+        ch_input, ch_input_basedir
     )
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
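Taken together, a local PEP project for this pipeline is a directory containing the YAML configuration plus the sample table required by `assets/samplesheet_schema.yaml`, launched via `--pep` like any other input. A sketch (layout and names hypothetical):

```console
# pep/
# ├── config.yaml       # pep_version, sample_table, optional sample_modifiers
# └── samplesheet.csv   # sample, run_accession, instrument_platform, fastq1, fastq2, fasta

nextflow run nf-core/taxprofiler --pep pep/config.yaml --databases database.csv --outdir <OUTDIR> -profile docker
```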