Update workflows/taxprofiler.nf

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>
2024-11-22 00:26:03 +00:00 · 2022-09-19 09:51:25 -04:00 · 2022-09-19 09:51:25 -04:00 · bfd260e9c8
commit bfd260e9c8
parent 0ccbf50938
4 changed files with 56 additions and 76 deletions
--- a/bin/detect_reads.py
+++ b/bin/detect_reads.py
@@ -1,20 +1,29 @@
 #!/usr/bin/env python
-
 import argparse
 import csv
+import logging
 import sys
-from typing import List, NoReturn
+from enum import Enum
+from typing import List, NoReturn, Optional


-def parse_args(args=None) -> argparse.Namespace:
+class ColumnNames(str, Enum):
+    SAMPLE = "sample"
+    FASTQ_1 = "fastq_1"
+    FASTQ_2 = "fastq_2"
+    FASTA = "fasta"
+    SINGLE_END = "single_end"
+
+
+def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
    """
    Reformatting is based on detecting whether the reads are paired or single end.
    Script appends appropriate column to samplesheet.csv file.
    """
-    Description = "Reformat nf-core/taxprofiler samplesheet file."
-    Epilog = "Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>"
-
-    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser = argparse.ArgumentParser(
+        description="Reformat nf-core/taxprofiler samplesheet file.",
+        epilog="Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>",
+    )
    parser.add_argument("FILE_IN", help="Input samplesheet file.")
    parser.add_argument("FILE_OUT", help="Output file.")
    return parser.parse_args(args)
@@ -29,86 +38,69 @@ class ReadsModifier:
        self.fasta_index = None

    def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> NoReturn:
-        NEW_COLUMN_NAME = "single_end"
        new_file_rows = []

-        with open(input_file_path, "r") as input_file:
-            csv_reader = csv.reader(input_file, delimiter=",")
-            self.headers = next(csv_reader)
-            self.headers.append(NEW_COLUMN_NAME)
-
-            self._infer_column_indexes()
+        with open(input_file_path, "r", newline="") as input_file:
+            csv_reader = csv.DictReader(input_file, delimiter=",")
+            self.headers = csv_reader.fieldnames
+            self.headers.append("single_end")

            for samplesheet_row in csv_reader:

                if self._is_paired_end_short_read(samplesheet_row):
-                    new_file_rows.append([*samplesheet_row, "0"])
+                    samplesheet_row[ColumnNames.SINGLE_END] = "0"
+                    new_file_rows.append(samplesheet_row.values())

                elif self._is_single_end_short_long_read(samplesheet_row):
-                    new_file_rows.append([*samplesheet_row, "1"])
+                    samplesheet_row[ColumnNames.SINGLE_END] = "1"
+                    new_file_rows.append(samplesheet_row.values())

                elif self._is_single_end_long_read(samplesheet_row):
-                    new_file_rows.append([*samplesheet_row, "1"])
+                    samplesheet_row[ColumnNames.SINGLE_END] = "1"
+                    new_file_rows.append(samplesheet_row.values())

                elif self._is_error_row(samplesheet_row):
-                    self.print_error(
+                    logging.error(
                        "FastQ and FastA files cannot be specified together in the same library!",
                        "Line",
-                        ",".join(samplesheet_row),
+                        ",".join(samplesheet_row.values()),
                    )
                else:
-                    self.print_error("Invalid combination of columns provided!", "Line", ",".join(samplesheet_row))
+                    logging.error(
+                        "Invalid combination of columns provided!", "Line", ",".join(samplesheet_row.values())
+                    )

-        self.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)
+        ReadsModifier.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)

-    def _get_row_values(self, samplesheet_row):
+    def _get_row_values(self, samplesheet_row: dict):
        """
-        This method extracts data from the columns for given row of samplesheet table, based on
-        previously infered column indexes.
+        This method extracts data from the columns for given row of samplesheet table.
        """
-        sample = samplesheet_row[self.sample_index]
-        fastq_1 = samplesheet_row[self.fastq_1_index] if self.fastq_1_index else None
-        fastq_2 = samplesheet_row[self.fastq_2_index] if self.fastq_2_index else None
-        fasta = samplesheet_row[self.fasta_index] if self.fasta_index else None
-        return sample, fastq_1, fastq_2, fasta
+        return (
+            samplesheet_row.get(ColumnNames.SAMPLE),
+            samplesheet_row.get(ColumnNames.FASTQ_1),
+            samplesheet_row.get(ColumnNames.FASTQ_2),
+            samplesheet_row.get(ColumnNames.FASTA),
+        )

-    def _infer_column_indexes(self):
-        """
-        This method infers indexes of necessary columns from samplesheet table
-        """
-        self.sample_index = self.headers.index("sample")
-        self.fastq_1_index = self.headers.index("fastq_1") if "fastq_1" in self.headers else None
-        self.fastq_2_index = self.headers.index("fastq_2") if "fastq_2" in self.headers else None
-        self.fasta_index = self.headers.index("fasta") if "fasta" in self.headers else None
-
-    def _is_paired_end_short_read(self, samplesheet_row: List) -> bool:
+    def _is_paired_end_short_read(self, samplesheet_row: dict) -> bool:
        sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
        return sample and fastq_1 and fastq_2

-    def _is_single_end_short_long_read(self, samplesheet_row: List) -> bool:
+    def _is_single_end_short_long_read(self, samplesheet_row: dict) -> bool:
        sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
        return sample and fastq_1 and not fastq_2

-    def _is_single_end_long_read(self, samplesheet_row: List) -> bool:
+    def _is_single_end_long_read(self, samplesheet_row: dict) -> bool:
        sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
        return sample and fasta and not fastq_1 and not fastq_2

-    def _is_error_row(self, samplesheet_row: List) -> bool:
+    def _is_error_row(self, samplesheet_row: dict) -> bool:
        sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
        return fasta and (fastq_1 or fastq_2)

-    @staticmethod
-    def print_error(error: str, context: str = "Line", context_str: str = ""):
-        error_str = "ERROR: Please check samplesheet -> {}".format(error)
-        if context != "" and context_str != "":
-            error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
-                error, context.strip(), context_str.strip()
-            )
-        print(error_str)
-        sys.exit(1)
-
-    @staticmethod
-    def save_reformatted_samplesheet(new_file_rows: List[List], output_file_path: str) -> NoReturn:
+    @classmethod
+    def save_reformatted_samplesheet(cls, new_file_rows: List[List], output_file_path: str) -> NoReturn:
        """
        Write new samplesheet.
        """
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,13 +10,13 @@
            "type": "object",
            "fa_icon": "fas fa-terminal",
            "description": "Define where the pipeline should find input data and save output data.",
-            "required": [ "input", "outdir", "databases"],
+            "required": ["input", "outdir", "databases"],
            "properties": {
                "input": {
                    "type": "string",
                    "format": "file-path",
                    "mimetype": "text/csv",
-                    "pattern": "^\\S+\\.(csv|yaml)$",
+                    "pattern": "^\\S+\\.(csv|yaml|yml)$",
                    "schema": "assets/schema_input.json",
                    "description": "Path to comma-separated file containing information about the samples and libraries/runs.",
                    "help_text": "You will need to create a design file with information about the samples and libraries/runs you want to running in your pipeline run. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).",
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -9,11 +9,11 @@ include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main'
 workflow INPUT_CHECK {
    take:
    samplesheet_or_pep_config // file: /path/to/samplesheet.csv or /path/to/pep/config.yaml
-    ch_pep_input_base_dir
+    pep_input_base_dir

    main:
-    EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), ch_pep_input_base_dir )
-    converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", ch_pep_input_base_dir )
+    EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir )
+    converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
    parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted )
        .csv
        .splitCsv ( header:true, sep:',' )
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -17,23 +17,11 @@ def checkPathParamList = [ params.input, params.databases, params.hostremoval_re
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }

 // Check mandatory parameters
-if ( params.input.endsWith(".yaml") ) {
-
-    if ( params.input.startsWith("http://") || params.input.startsWith("https://") ) {
-        ch_input = file(params.input)
-        ch_pep_input_base_dir = []
-    }
-    else {
-        ch_input = file(params.input)
-        ch_pep_input_base_dir = new File(params.input).getParent()
-    }
-
-} else if ( params.input.endsWith(".csv") ) {
-    ch_input = file(params.input)
-    ch_pep_input_base_dir = []
-
+if ( params.input ) {
+    ch_input              = file(params.input, checkIfExists: true)
+    pep_input_base_dir    = file(params.input).extension.matches("yaml|yml") ? file(file(params.input).getParent(), checkIfExists: true) :  []
 } else {
-    exit 1, 'Input samplesheet or PEP config not specified!'
+    exit 1, "Input samplesheet, or PEP config and base directory not specified"
 }

 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
@@ -116,7 +104,7 @@ workflow TAXPROFILER {
        SUBWORKFLOW: Read in samplesheet, validate and stage input files
    */
    INPUT_CHECK (
-        ch_input, ch_pep_input_base_dir
+        ch_input, pep_input_base_dir
    )
    ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)