diff --git a/bin/detect_reads.py b/bin/detect_reads.py
index 8a1430e..14f6a07 100644
--- a/bin/detect_reads.py
+++ b/bin/detect_reads.py
@@ -1,20 +1,29 @@
 #!/usr/bin/env python
-
 import argparse
 import csv
+import logging
 import sys
-from typing import List, NoReturn
+from enum import Enum
+from typing import List, NoReturn, Optional
 
 
-def parse_args(args=None) -> argparse.Namespace:
+class ColumnNames(str, Enum):
+    SAMPLE = "sample"
+    FASTQ_1 = "fastq_1"
+    FASTQ_2 = "fastq_2"
+    FASTA = "fasta"
+    SINGLE_END = "single_end"
+
+
+def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
     """
     Reformatting is based on detecting whether the reads are paired or single end.
     Script appends appropriate column to samplesheet.csv file.
     """
-    Description = "Reformat nf-core/taxprofiler samplesheet file."
-    Epilog = "Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>"
-
-    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser = argparse.ArgumentParser(
+        description="Reformat nf-core/taxprofiler samplesheet file.",
+        epilog="Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>",
+    )
     parser.add_argument("FILE_IN", help="Input samplesheet file.")
     parser.add_argument("FILE_OUT", help="Output file.")
     return parser.parse_args(args)
@@ -29,86 +38,69 @@ class ReadsModifier:
         self.fasta_index = None
 
     def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> NoReturn:
-        NEW_COLUMN_NAME = "single_end"
         new_file_rows = []
 
-        with open(input_file_path, "r") as input_file:
-            csv_reader = csv.reader(input_file, delimiter=",")
-            self.headers = next(csv_reader)
-            self.headers.append(NEW_COLUMN_NAME)
-
-            self._infer_column_indexes()
+        with open(input_file_path, "r", newline="") as input_file:
+            csv_reader = csv.DictReader(input_file, delimiter=",")
+            self.headers = list(csv_reader.fieldnames)
+            self.headers.append(ColumnNames.SINGLE_END.value)
 
             for samplesheet_row in csv_reader:
                 if self._is_paired_end_short_read(samplesheet_row):
-                    new_file_rows.append([*samplesheet_row, "0"])
+                    samplesheet_row[ColumnNames.SINGLE_END] = "0"
+                    new_file_rows.append(list(samplesheet_row.values()))
 
                 elif self._is_single_end_short_long_read(samplesheet_row):
-                    new_file_rows.append([*samplesheet_row, "1"])
+                    samplesheet_row[ColumnNames.SINGLE_END] = "1"
+                    new_file_rows.append(list(samplesheet_row.values()))
 
                 elif self._is_single_end_long_read(samplesheet_row):
-                    new_file_rows.append([*samplesheet_row, "1"])
+                    samplesheet_row[ColumnNames.SINGLE_END] = "1"
+                    new_file_rows.append(list(samplesheet_row.values()))
 
                 elif self._is_error_row(samplesheet_row):
-                    self.print_error(
-                        "FastQ and FastA files cannot be specified together in the same library!",
-                        "Line",
-                        ",".join(samplesheet_row),
-                    )
+                    logging.error(
+                        "FastQ and FastA files cannot be specified together in the same library! Line: '%s'",
+                        ",".join(samplesheet_row.values()),
+                    )
+                    sys.exit(1)
                 else:
-                    self.print_error("Invalid combination of columns provided!", "Line", ",".join(samplesheet_row))
+                    logging.error(
+                        "Invalid combination of columns provided! Line: '%s'", ",".join(samplesheet_row.values())
+                    )
+                    sys.exit(1)
 
-        self.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)
+        ReadsModifier.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)
 
-    def _get_row_values(self, samplesheet_row):
+    def _get_row_values(self, samplesheet_row: dict):
         """
-        This method extracts data from the columns for given row of samplesheet table, based on
-        previously infered column indexes.
+        This method extracts data from the columns for a given row of the samplesheet table.
""" - sample = samplesheet_row[self.sample_index] - fastq_1 = samplesheet_row[self.fastq_1_index] if self.fastq_1_index else None - fastq_2 = samplesheet_row[self.fastq_2_index] if self.fastq_2_index else None - fasta = samplesheet_row[self.fasta_index] if self.fasta_index else None - return sample, fastq_1, fastq_2, fasta + return ( + samplesheet_row.get(ColumnNames.SAMPLE), + samplesheet_row.get(ColumnNames.FASTQ_1), + samplesheet_row.get(ColumnNames.FASTQ_2), + samplesheet_row.get(ColumnNames.FASTA), + ) - def _infer_column_indexes(self): - """ - This method infers indexes of necessary columns from samplesheet table - """ - self.sample_index = self.headers.index("sample") - self.fastq_1_index = self.headers.index("fastq_1") if "fastq_1" in self.headers else None - self.fastq_2_index = self.headers.index("fastq_2") if "fastq_2" in self.headers else None - self.fasta_index = self.headers.index("fasta") if "fasta" in self.headers else None - - def _is_paired_end_short_read(self, samplesheet_row: List) -> bool: + def _is_paired_end_short_read(self, samplesheet_row: dict) -> bool: sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row) return sample and fastq_1 and fastq_2 - def _is_single_end_short_long_read(self, samplesheet_row: List) -> bool: + def _is_single_end_short_long_read(self, samplesheet_row: dict) -> bool: sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row) return sample and fastq_1 and not fastq_2 - def _is_single_end_long_read(self, samplesheet_row: List) -> bool: + def _is_single_end_long_read(self, samplesheet_row: dict) -> bool: sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row) return sample and fasta and not fastq_1 and not fastq_2 - def _is_error_row(self, samplesheet_row: List) -> bool: + def _is_error_row(self, samplesheet_row: dict) -> bool: sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row) return fasta and (fastq_1 or fastq_2) - @staticmethod - def print_error(error: str, context: str = "Line", context_str: str = ""): - error_str = "ERROR: Please check samplesheet -> {}".format(error) - if context != "" and context_str != "": - error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( - error, context.strip(), context_str.strip() - ) - print(error_str) - sys.exit(1) - - @staticmethod - def save_reformatted_samplesheet(new_file_rows: List[List], output_file_path: str) -> NoReturn: + @classmethod + def save_reformatted_samplesheet(cls, new_file_rows: List[List], output_file_path: str) -> NoReturn: """ Write new samplesheet. """ diff --git a/nextflow_schema.json b/nextflow_schema.json index d5309ed..4a9237d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,13 +10,13 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ "input", "outdir", "databases"], + "required": ["input", "outdir", "databases"], "properties": { "input": { "type": "string", "format": "file-path", "mimetype": "text/csv", - "pattern": "^\\S+\\.(csv|yaml)$", + "pattern": "^\\S+\\.(csv|yaml|yml)$", "schema": "assets/schema_input.json", "description": "Path to comma-separated file containing information about the samples and libraries/runs.", "help_text": "You will need to create a design file with information about the samples and libraries/runs you want to running in your pipeline run. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row. 
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 0a07538..447eb15 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -9,11 +9,11 @@ include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main'
 workflow INPUT_CHECK {
     take:
     samplesheet_or_pep_config // file: /path/to/samplesheet.csv or /path/to/pep/config.yaml
-    ch_pep_input_base_dir
+    pep_input_base_dir
 
     main:
-    EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), ch_pep_input_base_dir )
-    converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", ch_pep_input_base_dir )
+    EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir )
+    converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
     parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted )
         .csv
         .splitCsv ( header:true, sep:',' )
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index b88b286..6f7becf 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -17,23 +17,11 @@ def checkPathParamList = [ params.input, params.databases, params.hostremoval_re
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
 // Check mandatory parameters
-if ( params.input.endsWith(".yaml") ) {
-
-    if ( params.input.startsWith("http://") || params.input.startsWith("https://") ) {
-        ch_input = file(params.input)
-        ch_pep_input_base_dir = []
-    }
-    else {
-        ch_input = file(params.input)
-        ch_pep_input_base_dir = new File(params.input).getParent()
-    }
-
-} else if ( params.input.endsWith(".csv") ) {
-    ch_input = file(params.input)
-    ch_pep_input_base_dir = []
-
+if ( params.input ) {
+    ch_input = file(params.input, checkIfExists: true)
+    pep_input_base_dir = file(params.input).extension.matches("yaml|yml") ? file(file(params.input).getParent(), checkIfExists: true) : []
 } else {
-    exit 1, 'Input samplesheet or PEP config not specified!'
+    exit 1, "Input samplesheet, or PEP config and base directory, not specified!"
 }
 
 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
@@ -116,7 +104,7 @@ workflow TAXPROFILER {
     /*
         SUBWORKFLOW: Read in samplesheet, validate and stage input files
     */
     INPUT_CHECK (
-        ch_input, ch_pep_input_base_dir
+        ch_input, pep_input_base_dir
     )
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)