From 1584d6fc517a34acf19bfd29352b0645480f95a7 Mon Sep 17 00:00:00 2001
From: Rafal Stepien <43926522+rafalstepien@users.noreply.github.com>
Date: Mon, 19 Sep 2022 09:51:25 -0400
Subject: [PATCH 1/3] Update workflows/taxprofiler.nf

Co-authored-by: James A. Fellows Yates
---
 nextflow_schema.json              |  4 ++--
 subworkflows/local/input_check.nf |  6 +++---
 workflows/taxprofiler.nf          | 22 +++++-----------------
 3 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index d5309ed..4a9237d 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,13 +10,13 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": [ "input", "outdir", "databases"],
+            "required": ["input", "outdir", "databases"],
             "properties": {
                 "input": {
                     "type": "string",
                     "format": "file-path",
                     "mimetype": "text/csv",
-                    "pattern": "^\\S+\\.(csv|yaml)$",
+                    "pattern": "^\\S+\\.(csv|yaml|yml)$",
                     "schema": "assets/schema_input.json",
                     "description": "Path to comma-separated file containing information about the samples and libraries/runs.",
                     "help_text": "You will need to create a design file with information about the samples and libraries/runs you want to running in your pipeline run. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).",
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 0a07538..447eb15 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -9,11 +9,11 @@ include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main'
 workflow INPUT_CHECK {
     take:
     samplesheet_or_pep_config // file: /path/to/samplesheet.csv or /path/to/pep/config.yaml
-    ch_pep_input_base_dir
+    pep_input_base_dir

     main:
-    EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), ch_pep_input_base_dir )
-    converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", ch_pep_input_base_dir )
+    EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir )
+    converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
     parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted )
         .csv
         .splitCsv ( header:true, sep:',' )
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index b88b286..6f7becf 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -17,23 +17,11 @@ def checkPathParamList = [ params.input, params.databases, params.hostremoval_re
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }

 // Check mandatory parameters
-if ( params.input.endsWith(".yaml") ) {
-
-    if ( params.input.startsWith("http://") || params.input.startsWith("https://") ) {
-        ch_input = file(params.input)
-        ch_pep_input_base_dir = []
-    }
-    else {
-        ch_input = file(params.input)
-        ch_pep_input_base_dir = new File(params.input).getParent()
-    }
-
-} else if ( params.input.endsWith(".csv") ) {
-    ch_input = file(params.input)
-    ch_pep_input_base_dir = []
-
+if ( params.input ) {
+    ch_input = file(params.input, checkIfExists: true)
+    pep_input_base_dir = file(params.input).extension.matches("yaml|yml") ? file(file(params.input).getParent(), checkIfExists: true) : []
 } else {
-    exit 1, 'Input samplesheet or PEP config not specified!'
+    exit 1, "Input samplesheet, or PEP config and base directory not specified"
 }

 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
@@ -116,7 +104,7 @@ workflow TAXPROFILER {
         SUBWORKFLOW: Read in samplesheet, validate and stage input files
     */
     INPUT_CHECK (
-        ch_input, ch_pep_input_base_dir
+        ch_input, pep_input_base_dir
     )
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)

From 43a8aa4405e67e0616d63f71ef217ecb7b2e350b Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Tue, 27 Sep 2022 15:32:51 +0200
Subject: [PATCH 2/3] Remove detect_reads.py and replace remaining checks with
 nextflow code instead

---
 assets/samplesheet_schema.yaml     |   2 +
 bin/detect_reads.py                | 125 -----------------------------
 conf/modules.config                |   8 --
 modules/local/samplesheet_check.nf |  27 -------
 subworkflows/local/input_check.nf  |  34 +++++---
 5 files changed, 27 insertions(+), 169 deletions(-)
 delete mode 100644 bin/detect_reads.py
 delete mode 100644 modules/local/samplesheet_check.nf

diff --git a/assets/samplesheet_schema.yaml b/assets/samplesheet_schema.yaml
index 88ff451..366ee93 100644
--- a/assets/samplesheet_schema.yaml
+++ b/assets/samplesheet_schema.yaml
@@ -53,3 +53,5 @@ properties:
       - fasta
 required:
   - samples
+  - run_accession
+  - instrument_platform
diff --git a/bin/detect_reads.py b/bin/detect_reads.py
deleted file mode 100644
index 8a1430e..0000000
--- a/bin/detect_reads.py
+++ /dev/null
@@ -1,125 +0,0 @@
-#!/usr/bin/env python
-
-import argparse
-import csv
-import sys
-from typing import List, NoReturn
-
-
-def parse_args(args=None) -> argparse.Namespace:
-    """
-    Reformatting is based on detecting whether the reads are paired or single end.
-    Script appends appropriate column to samplesheet.csv file.
-    """
-    Description = "Reformat nf-core/taxprofiler samplesheet file."
- Epilog = "Example usage: python detect_reads.py " - - parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument("FILE_IN", help="Input samplesheet file.") - parser.add_argument("FILE_OUT", help="Output file.") - return parser.parse_args(args) - - -class ReadsModifier: - def __init__(self): - self.headers = None - self.sample_index = None - self.fastq_1_index = None - self.fastq_2_index = None - self.fasta_index = None - - def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> NoReturn: - NEW_COLUMN_NAME = "single_end" - new_file_rows = [] - - with open(input_file_path, "r") as input_file: - csv_reader = csv.reader(input_file, delimiter=",") - self.headers = next(csv_reader) - self.headers.append(NEW_COLUMN_NAME) - - self._infer_column_indexes() - - for samplesheet_row in csv_reader: - - if self._is_paired_end_short_read(samplesheet_row): - new_file_rows.append([*samplesheet_row, "0"]) - - elif self._is_single_end_short_long_read(samplesheet_row): - new_file_rows.append([*samplesheet_row, "1"]) - - elif self._is_single_end_long_read(samplesheet_row): - new_file_rows.append([*samplesheet_row, "1"]) - - elif self._is_error_row(samplesheet_row): - self.print_error( - "FastQ and FastA files cannot be specified together in the same library!", - "Line", - ",".join(samplesheet_row), - ) - else: - self.print_error("Invalid combination of columns provided!", "Line", ",".join(samplesheet_row)) - - self.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path) - - def _get_row_values(self, samplesheet_row): - """ - This method extracts data from the columns for given row of samplesheet table, based on - previously infered column indexes. - """ - sample = samplesheet_row[self.sample_index] - fastq_1 = samplesheet_row[self.fastq_1_index] if self.fastq_1_index else None - fastq_2 = samplesheet_row[self.fastq_2_index] if self.fastq_2_index else None - fasta = samplesheet_row[self.fasta_index] if self.fasta_index else None - return sample, fastq_1, fastq_2, fasta - - def _infer_column_indexes(self): - """ - This method infers indexes of necessary columns from samplesheet table - """ - self.sample_index = self.headers.index("sample") - self.fastq_1_index = self.headers.index("fastq_1") if "fastq_1" in self.headers else None - self.fastq_2_index = self.headers.index("fastq_2") if "fastq_2" in self.headers else None - self.fasta_index = self.headers.index("fasta") if "fasta" in self.headers else None - - def _is_paired_end_short_read(self, samplesheet_row: List) -> bool: - sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row) - return sample and fastq_1 and fastq_2 - - def _is_single_end_short_long_read(self, samplesheet_row: List) -> bool: - sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row) - return sample and fastq_1 and not fastq_2 - - def _is_single_end_long_read(self, samplesheet_row: List) -> bool: - sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row) - return sample and fasta and not fastq_1 and not fastq_2 - - def _is_error_row(self, samplesheet_row: List) -> bool: - sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row) - return fasta and (fastq_1 or fastq_2) - - @staticmethod - def print_error(error: str, context: str = "Line", context_str: str = ""): - error_str = "ERROR: Please check samplesheet -> {}".format(error) - if context != "" and context_str != "": - error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( - error, 
context.strip(), context_str.strip() - ) - print(error_str) - sys.exit(1) - - @staticmethod - def save_reformatted_samplesheet(new_file_rows: List[List], output_file_path: str) -> NoReturn: - """ - Write new samplesheet. - """ - with open(output_file_path, "w") as output_file: - csv.writer(output_file).writerows(new_file_rows) - - -def main(args=None): - args = parse_args(args) - ReadsModifier().detect_reads_and_reformat(args.FILE_IN, args.FILE_OUT) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/conf/modules.config b/conf/modules.config index 0efd251..73cc042 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,14 +12,6 @@ process { - withName: SAMPLESHEET_CHECK { - publishDir = [ - path: { "${params.outdir}/pipeline_info" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: DATABASE_CHECK { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf deleted file mode 100644 index 91e0f04..0000000 --- a/modules/local/samplesheet_check.nf +++ /dev/null @@ -1,27 +0,0 @@ -process SAMPLESHEET_CHECK { - tag "$samplesheet" - - conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'quay.io/biocontainers/python:3.8.3' }" - - input: - path samplesheet - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - script: // detect_reads.py script is bundled with the pipeline, in nf-core/taxprofiler/bin/ - """ - python3 $projectDir/bin/detect_reads.py \\ - $samplesheet \\ - samplesheet_validated.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 447eb15..d54d2ad 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -2,7 +2,6 @@ // Check input samplesheet and get read channels // -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' include { EIDO_VALIDATE } from '../../modules/nf-core/modules/eido/validate/main' include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main' @@ -12,26 +11,43 @@ workflow INPUT_CHECK { pep_input_base_dir main: + ch_versions = Channel.empty() + EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir ) - converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir ) - parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted ) - .csv + ch_versions = ch_versions.mix(EIDO_VALIDATE.out.versions) + + EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir ) + ch_versions = ch_versions.mix(EIDO_CONVERT.out.versions) + + ch_parsed_samplesheet = EIDO_CONVERT.out.samplesheet_converted .splitCsv ( header:true, sep:',' ) + .map{ + + // Checks not supported by EIDO(?) + if ( ( it['fastq_1'] != "" || it['fastq_2'] != "" ) && it['fasta'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: FastQ and FastA files cannot be specified together in the same library. Check input samplesheet! 
Check sample: ${it['sample']}" } + if ( it['fastq_1'] == "" && it['fastq_2'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: Input samplesheet has a missing fastq_1 when fastq_2 is specified. Check sample: ${it['sample']}" } + + single_end = it['fastq_2'] == "" ? true : false + it['single_end'] = single_end + + [ it ] + } + .flatten() .branch { fasta: it['fasta'] != '' nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE' fastq: true } - parsed_samplesheet.fastq + ch_parsed_samplesheet.fastq .map { create_fastq_channel(it) } .set { fastq } - parsed_samplesheet.nanopore + ch_parsed_samplesheet.nanopore .map { create_fastq_channel(it) } .set { nanopore } - parsed_samplesheet.fasta + ch_parsed_samplesheet.fasta .map { create_fasta_channel(it) } .set { fasta } @@ -39,7 +55,7 @@ workflow INPUT_CHECK { fastq = fastq ?: [] // channel: [ val(meta), [ reads ] ] nanopore = nanopore ?: [] // channel: [ val(meta), [ reads ] ] fasta = fasta ?: [] // channel: [ val(meta), fasta ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] @@ -69,7 +85,7 @@ def create_fastq_channel(LinkedHashMap row) { if (!file(row.fastq_2).exists()) { exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] + fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] } } From 87edc4569cef3290531ac0dbe56b1c407de26381 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 27 Sep 2022 15:46:17 +0200 Subject: [PATCH 3/3] Move to a function --- subworkflows/local/input_check.nf | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index d54d2ad..e8d5e7a 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -21,18 +21,7 @@ workflow INPUT_CHECK { ch_parsed_samplesheet = EIDO_CONVERT.out.samplesheet_converted .splitCsv ( header:true, sep:',' ) - .map{ - - // Checks not supported by EIDO(?) - if ( ( it['fastq_1'] != "" || it['fastq_2'] != "" ) && it['fasta'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: FastQ and FastA files cannot be specified together in the same library. Check input samplesheet! Check sample: ${it['sample']}" } - if ( it['fastq_1'] == "" && it['fastq_2'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: Input samplesheet has a missing fastq_1 when fastq_2 is specified. Check sample: ${it['sample']}" } - - single_end = it['fastq_2'] == "" ? true : false - it['single_end'] = single_end - - [ it ] - } - .flatten() + .map { check_missing_and_singleend_autodetect(it) } .branch { fasta: it['fasta'] != '' nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE' @@ -58,6 +47,19 @@ workflow INPUT_CHECK { versions = ch_versions // channel: [ versions.yml ] } +// Function to validate input sheet and auto-detect R1/R2 +def check_missing_and_singleend_autodetect(LinkedHashMap row) { + + // Checks not supported by EIDO(?) + if ( ( row['fastq_1'] != "" || row['fastq_2'] != "" ) && row['fasta'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: FastQ and FastA files cannot be specified together in the same library. Check input samplesheet! Check sample: ${row['sample']}" } + if ( row['fastq_1'] == "" && row['fastq_2'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: Input samplesheet has a missing fastq_1 when fastq_2 is specified. 
Check sample: ${row['sample']}" } + + single_end = row['fastq_2'] == "" ? true : false + row['single_end'] = single_end + + return row +} + // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] def create_fastq_channel(LinkedHashMap row) { // create meta map @@ -90,6 +92,7 @@ def create_fastq_channel(LinkedHashMap row) { } return fastq_meta + }// Function to get list of [ meta, fasta ] def create_fasta_channel(LinkedHashMap row) { def meta = [:]
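
Note for reviewers: the row-level logic that replaces detect_reads.py can be exercised outside the pipeline. Below is a minimal, hypothetical sketch (a standalone test.nf; the sample names, platforms, and file paths are invented, and it assumes Nextflow DSL2) of the same single_end auto-detection and branch routing that INPUT_CHECK now performs natively:

#!/usr/bin/env nextflow
// Hypothetical standalone sketch, not part of the patches above: it mimics
// the splitCsv -> map -> branch chain on three invented samplesheet rows.
nextflow.enable.dsl = 2

workflow {
    Channel
        .of(
            [ sample: 'sampleA', instrument_platform: 'ILLUMINA',        fastq_1: 'a_R1.fastq.gz', fastq_2: 'a_R2.fastq.gz', fasta: '' ],
            [ sample: 'sampleB', instrument_platform: 'OXFORD_NANOPORE', fastq_1: 'b.fastq.gz',    fastq_2: '',              fasta: '' ],
            [ sample: 'sampleC', instrument_platform: 'ILLUMINA',        fastq_1: '',              fastq_2: '',              fasta: 'c.fasta' ]
        )
        .map { row ->
            // same rule as check_missing_and_singleend_autodetect():
            // an empty fastq_2 column means the library is single-end
            row['single_end'] = row['fastq_2'] == ''
            row
        }
        .branch {
            fasta:    it['fasta'] != ''
            nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE'
            fastq:    true
        }
        .set { rows }

    // a row is claimed by the first branch whose condition is true
    rows.fasta.view    { "fasta:    ${it.sample}" }
    rows.nanopore.view { "nanopore: ${it.sample}" }
    rows.fastq.view    { "fastq:    ${it.sample} (single_end=${it.single_end})" }
}

Because the branch conditions are evaluated top to bottom, running this with `nextflow run test.nf` should report sampleC under fasta, sampleB under nanopore, and sampleA falling through to fastq with single_end=false, mirroring how the subworkflow hands libraries to the downstream profiling steps.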