From 5f3eee9a4a2115c5b7e55fdddedad39c540d2cfb Mon Sep 17 00:00:00 2001
From: Rafal Stepien
Date: Tue, 16 Aug 2022 15:46:22 -0400
Subject: [PATCH] Add working version of PEP-nf-core integration

---
 README.md                                     |   6 +-
 assets/samplesheet_schema.yaml                |  55 ++++
 bin/check_samplesheet.py                      | 236 ------------------
 bin/detect_reads.py                           | 125 ++++++++++
 conf/test.config                              |   6 +
 conf/test_pep.config                          |  50 ++++
 docs/usage.md                                 |   3 +
 lib/WorkflowMain.groovy                       |   2 +-
 modules.json                                  |   8 +
 modules/local/samplesheet_check.nf            |   6 +-
 modules/nf-core/modules/eido/convert/main.nf  |  37 +++
 modules/nf-core/modules/eido/convert/meta.yml |  36 +++
 modules/nf-core/modules/eido/validate/main.nf |  32 +++
 .../nf-core/modules/eido/validate/meta.yml    |  38 +++
 nextflow.config                               |   2 +
 nextflow_schema.json                          |   7 +-
 subworkflows/local/input_check.nf             |   9 +-
 workflows/taxprofiler.nf                      |  23 +-
 18 files changed, 434 insertions(+), 247 deletions(-)
 create mode 100644 assets/samplesheet_schema.yaml
 delete mode 100755 bin/check_samplesheet.py
 create mode 100644 bin/detect_reads.py
 create mode 100644 conf/test_pep.config
 create mode 100644 modules/nf-core/modules/eido/convert/main.nf
 create mode 100644 modules/nf-core/modules/eido/convert/meta.yml
 create mode 100644 modules/nf-core/modules/eido/validate/main.nf
 create mode 100644 modules/nf-core/modules/eido/validate/meta.yml

diff --git a/README.md b/README.md
index 672c2a9..5f4c886 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,11 @@ On release, automated continuous integration tests run the pipeline on a full-si
 >   - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs.
 >   - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs.
 
-4. Start running your own analysis!
+4. You can also run the pipeline with a PEP file as input by running the following command:
+
+   ```console
+   nextflow run main.nf -profile test_pep,docker --outdir <OUTDIR>
+   ```
+
+5. Start running your own analysis!
 
    ```console
    nextflow run nf-core/taxprofiler --input samplesheet.csv --databases database.csv --outdir <OUTDIR> --run_<TOOL1> --run_<TOOL2> -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
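For readers unfamiliar with PEP (Portable Encapsulated Projects): the `--pep` input is not the samplesheet CSV itself but a small YAML project configuration that points at a sample table. A minimal sketch of such a configuration (file names hypothetical; the version pin matches the `2.1.0` PEP schema imported by `assets/samplesheet_schema.yaml` below):

```yaml
# config.yaml -- a minimal PEP project configuration
pep_version: 2.1.0
sample_table: samplesheet.csv # one row per sample/run; columns validated by the schema below
```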
+ pattern: "^[\\S]+.(fq\\.gz|fastq\\.gz)$" + fastq2: + type: ["string", "null"] + description: "FASTQ file for read 2." + pattern: "^[\\S]+.(fq\\.gz|fastq\\.gz)$" + fasta: + type: ["string", "null"] + description: "Path to FASTA file." + pattern: "^[\\S]+.(fa\\.gz|fasta\\.gz)$" + required: + - sample + - run_accession + - instrument_platform + files: + - fastq1 + - fastq2 + - fasta +required: + - samples diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py deleted file mode 100755 index ca54ed9..0000000 --- a/bin/check_samplesheet.py +++ /dev/null @@ -1,236 +0,0 @@ -#!/usr/bin/env python - -from distutils import extension -import os -import sys -import errno -import argparse - - -def parse_args(args=None): - Description = "Reformat nf-core/taxprofiler samplesheet file and check its contents." - - Epilog = "Example usage: python check_samplesheet.py " - - parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument("FILE_IN", help="Input samplesheet file.") - parser.add_argument("FILE_OUT", help="Output file.") - return parser.parse_args(args) - - -def make_dir(path): - if len(path) > 0: - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise exception - - -def print_error(error, context="Line", context_str=""): - error_str = "ERROR: Please check samplesheet -> {}".format(error) - if context != "" and context_str != "": - error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( - error, context.strip(), context_str.strip() - ) - print(error_str) - sys.exit(1) - - -def check_samplesheet(file_in, file_out): - """ - This function checks that the samplesheet follows the following structure: - - sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta - 2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz - 2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz, - 2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,, - 2613,ERR5766181,ILLUMINA,ERX5474937_ERR5766181_1.fastq.gz,ERX5474937_ERR5766181_2.fastq.gz, - """ - - FQ_EXTENSIONS = (".fq.gz", ".fastq.gz") - FA_EXTENSIONS = ( - ".fa", - ".fa.gz", - ".fasta", - ".fasta.gz", - ".fna", - ".fna.gz", - ".fas", - ".fas.gz", - ) - INSTRUMENT_PLATFORMS = [ - "ABI_SOLID", - "BGISEQ", - "CAPILLARY", - "COMPLETE_GENOMICS", - "DNBSEQ", - "HELICOS", - "ILLUMINA", - "ION_TORRENT", - "LS454", - "OXFORD_NANOPORE", - "PACBIO_SMRT", - ] - - sample_mapping_dict = {} - with open(file_in, "r") as fin: - - ## Check header - MIN_COLS = 4 - HEADER = [ - "sample", - "run_accession", - "instrument_platform", - "fastq_1", - "fastq_2", - "fasta", - ] - header = [x.strip('"') for x in fin.readline().strip().split(",")] - - ## Check for missing mandatory columns - missing_columns = list(set(HEADER) - set(header)) - if len(missing_columns) > 0: - print( - "ERROR: Missing required column header -> {}. Note some columns can otherwise be empty. 
See pipeline documentation (https://nf-co.re/taxprofiler/usage).".format( - ",".join(missing_columns) - ) - ) - sys.exit(1) - - ## Find locations of mandatory columns - header_locs = {} - for i in HEADER: - header_locs[i] = header.index(i) - - ## Check sample entries - for line in fin: - - ## Pull out only relevant columns for downstream checking - line_parsed = [x.strip().strip('"') for x in line.strip().split(",")] - lspl = [line_parsed[i] for i in header_locs.values()] - - # Check valid number of columns per row - if len(lspl) < len(HEADER): - print_error( - "Invalid number of columns (minimum = {})!".format(len(HEADER)), - "Line", - line, - ) - num_cols = len([x for x in lspl if x]) - if num_cols < MIN_COLS: - print_error( - "Invalid number of populated columns (minimum = {})!".format(MIN_COLS), - "Line", - line, - ) - - ## Check sample name entries - - ( - sample, - run_accession, - instrument_platform, - fastq_1, - fastq_2, - fasta, - ) = lspl[: len(HEADER)] - sample = sample.replace(" ", "_") - if not sample: - print_error("Sample entry has not been specified!", "Line", line) - - ## Check FastQ file extension - for fastq in [fastq_1, fastq_2]: - if fastq: - if fastq.find(" ") != -1: - print_error("FastQ file contains spaces!", "Line", line) - if not fastq.endswith(FQ_EXTENSIONS): - print_error( - f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !", - "Line", - line, - ) - if fasta: - if fasta.find(" ") != -1: - print_error("FastA file contains spaces!", "Line", line) - if not fasta.endswith(FA_EXTENSIONS): - print_error( - f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!", - "Line", - line, - ) - sample_info = [] - - # Check run_accession - if not run_accession: - print_error("Run accession has not been specified!", "Line", line) - else: - sample_info.append(run_accession) - - # Check instrument_platform - if not instrument_platform: - print_error("Instrument platform has not been specified!", "Line", line) - else: - if instrument_platform not in INSTRUMENT_PLATFORMS: - print_error( - f"Instrument platform {instrument_platform} is not supported!", - f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}", - "Line", - line, - ) - sample_info.append(instrument_platform) - - ## Auto-detect paired-end/single-end - if sample and fastq_1 and fastq_2: ## Paired-end short reads - sample_info.extend(["0", fastq_1, fastq_2, fasta]) - elif sample and fastq_1 and not fastq_2: ## Single-end short/long fastq reads - sample_info.extend(["1", fastq_1, fastq_2, fasta]) - elif sample and fasta and not fastq_1 and not fastq_2: ## Single-end long reads - sample_info.extend(["1", fastq_1, fastq_2, fasta]) - elif fasta and (fastq_1 or fastq_2): - print_error( - "FastQ and FastA files cannot be specified together in the same library!", - "Line", - line, - ) - else: - print_error("Invalid combination of columns provided!", "Line", line) - - ## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, single_end, fastq_1, fastq_2 , fasta ] } - if sample not in sample_mapping_dict: - sample_mapping_dict[sample] = [sample_info] - else: - if sample_info in sample_mapping_dict[sample]: - print_error("Samplesheet contains duplicate rows!", "Line", line) - else: - sample_mapping_dict[sample].append(sample_info) - - ## Write validated samplesheet with appropriate columns - HEADER_OUT = [ - "sample", - "run_accession", - "instrument_platform", - "single_end", - "fastq_1", - "fastq_2", - "fasta", - ] - if len(sample_mapping_dict) > 0: - 
out_dir = os.path.dirname(file_out)
-        make_dir(out_dir)
-        with open(file_out, "w") as fout:
-            fout.write(",".join(HEADER_OUT) + "\n")
-            for sample in sorted(sample_mapping_dict.keys()):
-                for idx, val in enumerate(sample_mapping_dict[sample]):
-                    fout.write(f"{sample},{','.join(val)}\n")
-    else:
-        print_error("No entries to process!", "Samplesheet: {}".format(file_in))
-
-
-def main(args=None):
-    args = parse_args(args)
-    check_samplesheet(args.FILE_IN, args.FILE_OUT)
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/bin/detect_reads.py b/bin/detect_reads.py
new file mode 100644
index 0000000..8a1430e
--- /dev/null
+++ b/bin/detect_reads.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+
+import argparse
+import csv
+import sys
+from typing import List
+
+
+def parse_args(args=None) -> argparse.Namespace:
+    """
+    Reformatting is based on detecting whether the reads are paired- or single-end.
+    The script appends the appropriate column to the samplesheet.csv file.
+    """
+    Description = "Reformat nf-core/taxprofiler samplesheet file."
+    Epilog = "Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>"
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("FILE_IN", help="Input samplesheet file.")
+    parser.add_argument("FILE_OUT", help="Output file.")
+    return parser.parse_args(args)
+
+
+class ReadsModifier:
+    def __init__(self):
+        self.headers = None
+        self.sample_index = None
+        self.fastq_1_index = None
+        self.fastq_2_index = None
+        self.fasta_index = None
+
+    def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> None:
+        NEW_COLUMN_NAME = "single_end"
+        new_file_rows = []
+
+        with open(input_file_path, "r") as input_file:
+            csv_reader = csv.reader(input_file, delimiter=",")
+            self.headers = next(csv_reader)
+            self.headers.append(NEW_COLUMN_NAME)
+
+            self._infer_column_indexes()
+
+            for samplesheet_row in csv_reader:
+
+                if self._is_paired_end_short_read(samplesheet_row):
+                    new_file_rows.append([*samplesheet_row, "0"])
+
+                elif self._is_single_end_short_long_read(samplesheet_row):
+                    new_file_rows.append([*samplesheet_row, "1"])
+
+                elif self._is_single_end_long_read(samplesheet_row):
+                    new_file_rows.append([*samplesheet_row, "1"])
+
+                elif self._is_error_row(samplesheet_row):
+                    self.print_error(
+                        "FastQ and FastA files cannot be specified together in the same library!",
+                        "Line",
+                        ",".join(samplesheet_row),
+                    )
+                else:
+                    self.print_error("Invalid combination of columns provided!", "Line", ",".join(samplesheet_row))
+
+        self.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)
+
+    def _get_row_values(self, samplesheet_row):
+        """
+        This method extracts data from the columns for a given row of the samplesheet
+        table, based on previously inferred column indexes.
+        """
+        sample = samplesheet_row[self.sample_index]
+        fastq_1 = samplesheet_row[self.fastq_1_index] if self.fastq_1_index is not None else None
+        fastq_2 = samplesheet_row[self.fastq_2_index] if self.fastq_2_index is not None else None
+        fasta = samplesheet_row[self.fasta_index] if self.fasta_index is not None else None
+        return sample, fastq_1, fastq_2, fasta
+
+    def _infer_column_indexes(self):
+        """
+        This method infers the indexes of the necessary columns from the samplesheet header.
+        """
+        self.sample_index = self.headers.index("sample")
+        self.fastq_1_index = self.headers.index("fastq_1") if "fastq_1" in self.headers else None
+        self.fastq_2_index = self.headers.index("fastq_2") if "fastq_2" in self.headers else None
+        self.fasta_index = self.headers.index("fasta") if "fasta" in self.headers else None
+
+    def _is_paired_end_short_read(self, samplesheet_row: List) -> bool:
+        sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
+        return bool(sample and fastq_1 and fastq_2)
+
+    def _is_single_end_short_long_read(self, samplesheet_row: List) -> bool:
+        sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
+        return bool(sample and fastq_1 and not fastq_2)
+
+    def _is_single_end_long_read(self, samplesheet_row: List) -> bool:
+        sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
+        return bool(sample and fasta and not fastq_1 and not fastq_2)
+
+    def _is_error_row(self, samplesheet_row: List) -> bool:
+        sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
+        return bool(fasta and (fastq_1 or fastq_2))
+
+    @staticmethod
+    def print_error(error: str, context: str = "Line", context_str: str = ""):
+        error_str = "ERROR: Please check samplesheet -> {}".format(error)
+        if context != "" and context_str != "":
+            error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
+                error, context.strip(), context_str.strip()
+            )
+        print(error_str)
+        sys.exit(1)
+
+    @staticmethod
+    def save_reformatted_samplesheet(new_file_rows: List[List], output_file_path: str) -> None:
+        """
+        Write the new samplesheet.
+        """
+        with open(output_file_path, "w", newline="") as output_file:
+            csv.writer(output_file).writerows(new_file_rows)
+
+
+def main(args=None):
+    args = parse_args(args)
+    ReadsModifier().detect_reads_and_reformat(args.FILE_IN, args.FILE_OUT)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
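For context, this is the transformation the script performs on the eido-converted samplesheet, sketched with hypothetical file names (run accessions reused from the docstring examples above):

```
# input (as produced by EIDO_CONVERT)
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
2612,ERR5766176,ILLUMINA,s1_R1.fastq.gz,s1_R2.fastq.gz,
2612,ERR5766180,ILLUMINA,s2_R1.fastq.gz,,

# output (single_end appended: "0" = paired-end, "1" = single-end)
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta,single_end
2612,ERR5766176,ILLUMINA,s1_R1.fastq.gz,s1_R2.fastq.gz,,0
2612,ERR5766180,ILLUMINA,s2_R1.fastq.gz,,,1
```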
+ """ + sample = samplesheet_row[self.sample_index] + fastq_1 = samplesheet_row[self.fastq_1_index] if self.fastq_1_index else None + fastq_2 = samplesheet_row[self.fastq_2_index] if self.fastq_2_index else None + fasta = samplesheet_row[self.fasta_index] if self.fasta_index else None + return sample, fastq_1, fastq_2, fasta + + def _infer_column_indexes(self): + """ + This method infers indexes of necessary columns from samplesheet table + """ + self.sample_index = self.headers.index("sample") + self.fastq_1_index = self.headers.index("fastq_1") if "fastq_1" in self.headers else None + self.fastq_2_index = self.headers.index("fastq_2") if "fastq_2" in self.headers else None + self.fasta_index = self.headers.index("fasta") if "fasta" in self.headers else None + + def _is_paired_end_short_read(self, samplesheet_row: List) -> bool: + sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row) + return sample and fastq_1 and fastq_2 + + def _is_single_end_short_long_read(self, samplesheet_row: List) -> bool: + sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row) + return sample and fastq_1 and not fastq_2 + + def _is_single_end_long_read(self, samplesheet_row: List) -> bool: + sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row) + return sample and fasta and not fastq_1 and not fastq_2 + + def _is_error_row(self, samplesheet_row: List) -> bool: + sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row) + return fasta and (fastq_1 or fastq_2) + + @staticmethod + def print_error(error: str, context: str = "Line", context_str: str = ""): + error_str = "ERROR: Please check samplesheet -> {}".format(error) + if context != "" and context_str != "": + error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + print(error_str) + sys.exit(1) + + @staticmethod + def save_reformatted_samplesheet(new_file_rows: List[List], output_file_path: str) -> NoReturn: + """ + Write new samplesheet. + """ + with open(output_file_path, "w") as output_file: + csv.writer(output_file).writerows(new_file_rows) + + +def main(args=None): + args = parse_args(args) + ReadsModifier().detect_reads_and_reformat(args.FILE_IN, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/conf/test.config b/conf/test.config index a39a107..d5dcd67 100644 --- a/conf/test.config +++ b/conf/test.config @@ -57,4 +57,10 @@ process { withName: MEGAN_RMA2INFO_KRONA { maxForks = 1 } + withName: 'EIDO_VALIDATE' { + ext.args = '--st-index sample' + } + withName: 'EIDO_CONVERT' { + ext.args = '--st-index sample' + } } diff --git a/conf/test_pep.config b/conf/test_pep.config new file mode 100644 index 0000000..e3428fd --- /dev/null +++ b/conf/test_pep.config @@ -0,0 +1,50 @@ +params { + config_profile_name = 'Test PEP profile' + config_profile_description = 'Minimal test dataset to check pipeline function with PEP file as an input.' 
diff --git a/conf/test_pep.config b/conf/test_pep.config
new file mode 100644
index 0000000..e3428fd
--- /dev/null
+++ b/conf/test_pep.config
@@ -0,0 +1,50 @@
+params {
+    config_profile_name        = 'Test PEP profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function with a PEP file as input.'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input                              = null
+    pep                                = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/pep/test_pep_format_files/config.yaml'
+    databases                          = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
+    perform_shortread_qc               = true
+    perform_longread_qc                = true
+    perform_shortread_complexityfilter = true
+    perform_shortread_hostremoval      = true
+    perform_longread_hostremoval       = true
+    perform_runmerging                 = true
+    hostremoval_reference              = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
+    run_kaiju                          = true
+    run_kraken2                        = true
+    run_malt                           = true
+    run_metaphlan3                     = true
+    run_centrifuge                     = true
+    run_diamond                        = true
+    run_motus                          = false
+    run_krona                          = true
+    krona_taxonomy_directory           = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab'
+    malt_save_reads                    = true
+    kraken2_save_reads                 = true
+    centrifuge_save_reads              = true
+    diamond_save_reads                 = true
+}
+
+
+process {
+    withName: MALT_RUN {
+        maxForks = 1
+    }
+    withName: MEGAN_RMA2INFO {
+        maxForks = 1
+    }
+    withName: 'EIDO_VALIDATE' {
+        ext.args = '--st-index sample'
+    }
+    withName: 'EIDO_CONVERT' {
+        ext.args = '--st-index sample'
+    }
+}
diff --git a/docs/usage.md b/docs/usage.md
index 8ae5257..cd4b749 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -277,6 +277,9 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof
 - `test`
   - A profile with a complete configuration for automated testing
   - Includes links to test data so needs no other parameters
+- `test_pep`
+  - A profile with a complete configuration for running the pipeline with a PEP file as input
+  - Includes links to test data so needs no other parameters
 
 ### `-resume`
 
diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy
index c90cf49..b9dd514 100755
--- a/lib/WorkflowMain.groovy
+++ b/lib/WorkflowMain.groovy
@@ -74,7 +74,7 @@ class WorkflowMain {
         NfcoreTemplate.awsBatch(workflow, params)
 
         // Check input has been provided
-        if (!params.input) {
-            log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'"
+        if (!params.input && !params.pep) {
+            log.error "Please provide an input samplesheet to the pipeline, e.g. '--input samplesheet.csv', or a PEP project with '--pep'"
             System.exit(1)
         }
     }
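With this guard, either parameter satisfies the input check, so both invocation styles are accepted (paths illustrative):

```console
nextflow run nf-core/taxprofiler --input samplesheet.csv --databases database.csv --outdir <OUTDIR> -profile docker
nextflow run nf-core/taxprofiler --pep project/config.yaml --databases database.csv --outdir <OUTDIR> -profile docker
```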
'--input samplesheet.csv'" System.exit(1) } diff --git a/modules.json b/modules.json index 385149a..feed11f 100644 --- a/modules.json +++ b/modules.json @@ -41,6 +41,14 @@ "branch": "master", "git_sha": "3531824af826c16cd252bc5aa82ae169b244ebaa" }, + "eido/convert": { + "branch": "master", + "git_sha": "c9b29c76869d9713130a13a418c1e8b5aecfb80d" + }, + "eido/validate": { + "branch": "master", + "git_sha": "8c0127e071711cb0a2648a6bdf881637a9d7eadc" + }, "fastp": { "branch": "master", "git_sha": "2c70c1c1951aaf884d2e8d8d9c871db79f7b35aa" diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index dea4362..f048351 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -13,11 +13,9 @@ process SAMPLESHEET_CHECK { path '*.csv' , emit: csv path "versions.yml", emit: versions - script: // This script is bundled with the pipeline, in nf-core/taxprofiler/bin/ + script: // detect_reads.py script is bundled with the pipeline, in nf-core/taxprofiler/bin/ """ - check_samplesheet.py \\ - $samplesheet \\ - samplesheet.valid.csv + python3 $projectDir/bin/detect_reads.py $samplesheet samplesheet_validated.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/modules/eido/convert/main.nf b/modules/nf-core/modules/eido/convert/main.nf new file mode 100644 index 0000000..40cd57a --- /dev/null +++ b/modules/nf-core/modules/eido/convert/main.nf @@ -0,0 +1,37 @@ +process EIDO_CONVERT { + tag '$samplesheet' + label 'process_single' + + conda (params.enable_conda ? "conda-forge::eido=0.1.9" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/eido/0.1.9_cv1/eido_0.1.9_cv1.sif' : + 'biocontainers/eido:0.1.9_cv1' }" + + input: + path samplesheet + val format + + output: + path "versions.yml" , emit: versions + path "${prefix}.${format}" , emit: samplesheet_converted + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "samplesheet_converted" + """ + eido \\ + convert \\ + -f $format \\ + $samplesheet \\ + $args \\ + -p samples=${prefix}.${format} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + eido: \$(echo \$(eido --version 2>&1) | sed 's/^.*eido //;s/ .*//' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/eido/convert/meta.yml b/modules/nf-core/modules/eido/convert/meta.yml new file mode 100644 index 0000000..0cf354a --- /dev/null +++ b/modules/nf-core/modules/eido/convert/meta.yml @@ -0,0 +1,36 @@ +name: "eido_convert" +description: Convert any PEP project or Nextflow samplesheet to any format +keywords: + - eido + - convert + - PEP + - format + - samplesheet +tools: + - "eido": + description: "Convert any PEP project or Nextflow samplesheet to any format" + homepage: "http://eido.databio.org/en/latest/" + documentation: "http://eido.databio.org/en/latest/" + doi: "10.1093/gigascience/giab077" + licence: "BSD-2-Clause" + +input: + - samplesheet: + type: file + description: Nextflow samplesheet or PEP project + pattern: "*.{yaml,yml,csv}" + - format: + type: value + description: Extension of an output file + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - samplesheet_converted: + type: file + description: PEP project or samplesheet converted to csv file + +authors: + - "@rafalstepien" diff --git 
diff --git a/modules/nf-core/modules/eido/validate/main.nf b/modules/nf-core/modules/eido/validate/main.nf
new file mode 100644
index 0000000..bc6a111
--- /dev/null
+++ b/modules/nf-core/modules/eido/validate/main.nf
@@ -0,0 +1,32 @@
+process EIDO_VALIDATE {
+    tag "$samplesheet"
+    label 'process_single'
+
+    conda (params.enable_conda ? "conda-forge::eido=0.1.9" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://containers.biocontainers.pro/s3/SingImgsRepo/eido/0.1.9_cv2/eido_0.1.9_cv2.sif' :
+        'biocontainers/eido:0.1.9_cv2' }"
+
+    input:
+    path samplesheet
+    path schema
+
+    output:
+    path "versions.yml", emit: versions
+    path "*.log"       , emit: log
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args   = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "validation"
+    """
+    eido validate $args $samplesheet -s $schema -e > ${prefix}.log
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        eido: \$(echo \$(eido --version 2>&1) | sed 's/^.*eido //;s/ .*//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/modules/eido/validate/meta.yml b/modules/nf-core/modules/eido/validate/meta.yml
new file mode 100644
index 0000000..962f59e
--- /dev/null
+++ b/modules/nf-core/modules/eido/validate/meta.yml
@@ -0,0 +1,38 @@
+name: "eido_validate"
+description: Validate samplesheet or PEP config against a schema
+keywords:
+  - eido
+  - validate
+  - schema
+  - format
+  - pep
+tools:
+  - "validate":
+      description: "Validate samplesheet or PEP config against a schema."
+      homepage: "http://eido.databio.org/en/latest/"
+      documentation: "http://eido.databio.org/en/latest/"
+      doi: "10.1093/gigascience/giab077"
+      licence: "BSD-2-Clause"
+
+input:
+  - samplesheet:
+      type: file
+      description: Samplesheet or PEP file to be validated
+      pattern: "*.{yaml,yml,csv}"
+  - schema:
+      type: file
+      description: Schema that the samplesheet will be validated against
+      pattern: "*.{yaml,yml}"
+
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - log:
+      type: file
+      description: File containing the validation log.
+      pattern: "*.log"
+
+authors:
+  - "@rafalstepien"
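Analogously, with the default `validation` prefix the validate process resolves to roughly the following call, writing the report to `validation.log` (input file name illustrative):

```console
eido validate --st-index sample config.yaml -s samplesheet_schema.yaml -e > validation.log
```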
+ pattern: "*.log" + +authors: + - "@rafalstepien" diff --git a/nextflow.config b/nextflow.config index df5b90b..f3c7756 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,6 +12,7 @@ params { // TODO nf-core: Specify your pipeline's command line flags // Input options input = null + pep = null // References @@ -227,6 +228,7 @@ profiles { test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' } test_nothing { includeConfig 'conf/test_nothing.config' } test_motus { includeConfig 'conf/test_motus.config' } + test_pep { includeConfig 'conf/test_pep.config' } } diff --git a/nextflow_schema.json b/nextflow_schema.json index eb839ec..e8690f2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,8 +10,13 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "databases", "outdir"], + "required": ["outdir", "databases"], "properties": { + "pep": { + "type": "string", + "format": "file-path", + "pattern": "^\\S+\\.yaml$" + }, "input": { "type": "string", "format": "file-path", diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index eb21b9d..5db1520 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -3,13 +3,18 @@ // include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' +include { EIDO_VALIDATE } from '../../modules/nf-core/modules/eido/validate/main' +include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main' workflow INPUT_CHECK { take: - samplesheet // file: /path/to/samplesheet.csv + samplesheet_or_pep_config // file: /path/to/samplesheet.csv or /path/to/pep/config.yaml + base_dir // file: path to PEP directory main: - parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet ) + EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml") ) + converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv" ) + parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted ) .csv .splitCsv ( header:true, sep:',' ) .branch { diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 223ba15..1f38214 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -17,7 +17,26 @@ def checkPathParamList = [ params.input, params.databases, params.hostremoval_re for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters -if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.input) { + ch_input = file(params.input) + ch_input_basedir = [] + +} else if (params.pep) { + + if ( params.pep.startsWith("http://") || params.pep.startsWith("https://") ) { + ch_input = file(params.pep) + ch_input_basedir = [] + } + + else { + ch_input = file(params.pep) + ch_input_basedir = new File(params.pep).getParent() + } + +} else { + exit 1, 'Input samplesheet or PEP config not specified!' +} + if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } if (params.shortread_qc_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." 
@@ -98,7 +117,7 @@ workflow TAXPROFILER {
         SUBWORKFLOW: Read in samplesheet, validate and stage input files
     */
     INPUT_CHECK (
-        ch_input
+        ch_input, ch_input_basedir
     )
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
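Taken together, a local PEP project for this pipeline is a directory containing the YAML configuration plus the sample table required by `assets/samplesheet_schema.yaml`, launched via `--pep` like any other input. A sketch (layout and names hypothetical):

```console
# pep/
# ├── config.yaml       # pep_version, sample_table, optional sample_modifiers
# └── samplesheet.csv   # sample, run_accession, instrument_platform, fastq1, fastq2, fasta

nextflow run nf-core/taxprofiler --pep pep/config.yaml --databases database.csv --outdir <OUTDIR> -profile docker
```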