1
0
Fork 0
mirror of https://github.com/MillironX/taxprofiler.git synced 2024-11-22 12:19:54 +00:00

Update workflows/taxprofiler.nf

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>
This commit is contained in:
Rafal Stepien 2022-09-19 09:51:25 -04:00 committed by Rafal Stepien
parent 0ccbf50938
commit bfd260e9c8
4 changed files with 56 additions and 76 deletions

View file

@ -1,20 +1,29 @@
#!/usr/bin/env python #!/usr/bin/env python
import argparse import argparse
import csv import csv
import logging
import sys import sys
from typing import List, NoReturn from enum import Enum
from typing import List, NoReturn, Optional
def parse_args(args=None) -> argparse.Namespace: class ColumnNames(str, Enum):
SAMPLE = "sample"
FASTQ_1 = "fastq_1"
FASTQ_2 = "fastq_2"
FASTA = "fasta"
SINGLE_END = "single_end"
def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
""" """
Reformatting is based on detecting whether the reads are paired or single end. Reformatting is based on detecting whether the reads are paired or single end.
Script appends appropriate column to samplesheet.csv file. Script appends appropriate column to samplesheet.csv file.
""" """
Description = "Reformat nf-core/taxprofiler samplesheet file." parser = argparse.ArgumentParser(
Epilog = "Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>" description="Reformat nf-core/taxprofiler samplesheet file.",
epilog="Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>",
parser = argparse.ArgumentParser(description=Description, epilog=Epilog) )
parser.add_argument("FILE_IN", help="Input samplesheet file.") parser.add_argument("FILE_IN", help="Input samplesheet file.")
parser.add_argument("FILE_OUT", help="Output file.") parser.add_argument("FILE_OUT", help="Output file.")
return parser.parse_args(args) return parser.parse_args(args)
@ -29,86 +38,69 @@ class ReadsModifier:
self.fasta_index = None self.fasta_index = None
def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> NoReturn: def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> NoReturn:
NEW_COLUMN_NAME = "single_end"
new_file_rows = [] new_file_rows = []
with open(input_file_path, "r") as input_file: with open(input_file_path, "r", newline="") as input_file:
csv_reader = csv.reader(input_file, delimiter=",") csv_reader = csv.DictReader(input_file, delimiter=",")
self.headers = next(csv_reader) self.headers = csv_reader.fieldnames
self.headers.append(NEW_COLUMN_NAME) self.headers.append("single_end")
self._infer_column_indexes()
for samplesheet_row in csv_reader: for samplesheet_row in csv_reader:
if self._is_paired_end_short_read(samplesheet_row): if self._is_paired_end_short_read(samplesheet_row):
new_file_rows.append([*samplesheet_row, "0"]) samplesheet_row[ColumnNames.SINGLE_END] = "0"
new_file_rows.append(samplesheet_row.values())
elif self._is_single_end_short_long_read(samplesheet_row): elif self._is_single_end_short_long_read(samplesheet_row):
new_file_rows.append([*samplesheet_row, "1"]) samplesheet_row[ColumnNames.SINGLE_END] = "1"
new_file_rows.append(samplesheet_row.values())
elif self._is_single_end_long_read(samplesheet_row): elif self._is_single_end_long_read(samplesheet_row):
new_file_rows.append([*samplesheet_row, "1"]) samplesheet_row[ColumnNames.SINGLE_END] = "1"
new_file_rows.append(samplesheet_row.values())
elif self._is_error_row(samplesheet_row): elif self._is_error_row(samplesheet_row):
self.print_error( logging.error(
"FastQ and FastA files cannot be specified together in the same library!", "FastQ and FastA files cannot be specified together in the same library!",
"Line", "Line",
",".join(samplesheet_row), ",".join(samplesheet_row.values()),
) )
else: else:
self.print_error("Invalid combination of columns provided!", "Line", ",".join(samplesheet_row)) logging.error(
"Invalid combination of columns provided!", "Line", ",".join(samplesheet_row.values())
)
self.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path) ReadsModifier.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)
def _get_row_values(self, samplesheet_row): def _get_row_values(self, samplesheet_row: dict):
""" """
This method extracts data from the columns for given row of samplesheet table, based on This method extracts data from the columns for given row of samplesheet table.
previously inferred column indexes.
""" """
sample = samplesheet_row[self.sample_index] return (
fastq_1 = samplesheet_row[self.fastq_1_index] if self.fastq_1_index else None samplesheet_row.get(ColumnNames.SAMPLE),
fastq_2 = samplesheet_row[self.fastq_2_index] if self.fastq_2_index else None samplesheet_row.get(ColumnNames.FASTQ_1),
fasta = samplesheet_row[self.fasta_index] if self.fasta_index else None samplesheet_row.get(ColumnNames.FASTQ_2),
return sample, fastq_1, fastq_2, fasta samplesheet_row.get(ColumnNames.FASTA),
)
def _infer_column_indexes(self): def _is_paired_end_short_read(self, samplesheet_row: dict) -> bool:
"""
This method infers indexes of necessary columns from samplesheet table
"""
self.sample_index = self.headers.index("sample")
self.fastq_1_index = self.headers.index("fastq_1") if "fastq_1" in self.headers else None
self.fastq_2_index = self.headers.index("fastq_2") if "fastq_2" in self.headers else None
self.fasta_index = self.headers.index("fasta") if "fasta" in self.headers else None
def _is_paired_end_short_read(self, samplesheet_row: List) -> bool:
sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row) sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
return sample and fastq_1 and fastq_2 return sample and fastq_1 and fastq_2
def _is_single_end_short_long_read(self, samplesheet_row: List) -> bool: def _is_single_end_short_long_read(self, samplesheet_row: dict) -> bool:
sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row) sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
return sample and fastq_1 and not fastq_2 return sample and fastq_1 and not fastq_2
def _is_single_end_long_read(self, samplesheet_row: List) -> bool: def _is_single_end_long_read(self, samplesheet_row: dict) -> bool:
sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row) sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
return sample and fasta and not fastq_1 and not fastq_2 return sample and fasta and not fastq_1 and not fastq_2
def _is_error_row(self, samplesheet_row: List) -> bool: def _is_error_row(self, samplesheet_row: dict) -> bool:
sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row) sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
return fasta and (fastq_1 or fastq_2) return fasta and (fastq_1 or fastq_2)
@staticmethod @classmethod
def print_error(error: str, context: str = "Line", context_str: str = ""): def save_reformatted_samplesheet(cls, new_file_rows: List[List], output_file_path: str) -> NoReturn:
error_str = "ERROR: Please check samplesheet -> {}".format(error)
if context != "" and context_str != "":
error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
error, context.strip(), context_str.strip()
)
print(error_str)
sys.exit(1)
@staticmethod
def save_reformatted_samplesheet(new_file_rows: List[List], output_file_path: str) -> NoReturn:
""" """
Write new samplesheet. Write new samplesheet.
""" """

View file

@ -16,7 +16,7 @@
"type": "string", "type": "string",
"format": "file-path", "format": "file-path",
"mimetype": "text/csv", "mimetype": "text/csv",
"pattern": "^\\S+\\.(csv|yaml)$", "pattern": "^\\S+\\.(csv|yaml|yml)$",
"schema": "assets/schema_input.json", "schema": "assets/schema_input.json",
"description": "Path to comma-separated file containing information about the samples and libraries/runs.", "description": "Path to comma-separated file containing information about the samples and libraries/runs.",
"help_text": "You will need to create a design file with information about the samples and libraries/runs you want to running in your pipeline run. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).", "help_text": "You will need to create a design file with information about the samples and libraries/runs you want to running in your pipeline run. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).",

View file

@ -9,11 +9,11 @@ include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main'
workflow INPUT_CHECK { workflow INPUT_CHECK {
take: take:
samplesheet_or_pep_config // file: /path/to/samplesheet.csv or /path/to/pep/config.yaml samplesheet_or_pep_config // file: /path/to/samplesheet.csv or /path/to/pep/config.yaml
ch_pep_input_base_dir pep_input_base_dir
main: main:
EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), ch_pep_input_base_dir ) EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir )
converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", ch_pep_input_base_dir ) converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted ) parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted )
.csv .csv
.splitCsv ( header:true, sep:',' ) .splitCsv ( header:true, sep:',' )

View file

@ -17,23 +17,11 @@ def checkPathParamList = [ params.input, params.databases, params.hostremoval_re
for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
// Check mandatory parameters // Check mandatory parameters
if ( params.input.endsWith(".yaml") ) { if ( params.input ) {
ch_input = file(params.input, checkIfExists: true)
if ( params.input.startsWith("http://") || params.input.startsWith("https://") ) { pep_input_base_dir = file(params.input).extension.matches("yaml|yml") ? file(file(params.input).getParent(), checkIfExists: true) : []
ch_input = file(params.input)
ch_pep_input_base_dir = []
}
else {
ch_input = file(params.input)
ch_pep_input_base_dir = new File(params.input).getParent()
}
} else if ( params.input.endsWith(".csv") ) {
ch_input = file(params.input)
ch_pep_input_base_dir = []
} else { } else {
exit 1, 'Input samplesheet or PEP config not specified!' exit 1, "Input samplesheet, or PEP config and base directory not specified"
} }
if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
@ -116,7 +104,7 @@ workflow TAXPROFILER {
SUBWORKFLOW: Read in samplesheet, validate and stage input files SUBWORKFLOW: Read in samplesheet, validate and stage input files
*/ */
INPUT_CHECK ( INPUT_CHECK (
ch_input, ch_pep_input_base_dir ch_input, pep_input_base_dir
) )
ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)