
Update workflows/taxprofiler.nf

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>
Rafal Stepien 2022-09-19 09:51:25 -04:00 committed by Rafal Stepien
parent 0ccbf50938
commit bfd260e9c8
4 changed files with 56 additions and 76 deletions


@@ -1,20 +1,29 @@
 #!/usr/bin/env python

 import argparse
 import csv
 import logging
 import sys
-from typing import List, NoReturn
+from enum import Enum
+from typing import List, NoReturn, Optional


-def parse_args(args=None) -> argparse.Namespace:
+class ColumnNames(str, Enum):
+    SAMPLE = "sample"
+    FASTQ_1 = "fastq_1"
+    FASTQ_2 = "fastq_2"
+    FASTA = "fasta"
+    SINGLE_END = "single_end"
+
+
+def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
     """
     Reformatting is based on detecting whether the reads are paired or single end.
     Script appends appropriate column to samplesheet.csv file.
     """
-    Description = "Reformat nf-core/taxprofiler samplesheet file."
-    Epilog = "Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>"
-    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser = argparse.ArgumentParser(
+        description="Reformat nf-core/taxprofiler samplesheet file.",
+        epilog="Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>",
+    )
     parser.add_argument("FILE_IN", help="Input samplesheet file.")
     parser.add_argument("FILE_OUT", help="Output file.")
     return parser.parse_args(args)
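
A note on the ColumnNames class added above: mixing str into the Enum makes each member a string equal to its value, so later code can use members such as ColumnNames.SINGLE_END directly as keys in the plain-string dicts that csv.DictReader yields. A minimal standalone sketch of that behaviour (illustrative, not part of the commit):

    from enum import Enum

    class ColumnNames(str, Enum):
        SAMPLE = "sample"
        SINGLE_END = "single_end"

    # A csv.DictReader row is keyed by plain strings...
    row = {"sample": "sample1", "fastq_1": "r1.fastq.gz"}

    # ...but a str-mixin Enum member hashes and compares like its value,
    # so it works both for lookups and as a new key.
    assert row.get(ColumnNames.SAMPLE) == "sample1"
    row[ColumnNames.SINGLE_END] = "1"
    assert row["single_end"] == "1"
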
@@ -29,86 +38,69 @@ class ReadsModifier:
         self.fasta_index = None

     def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> NoReturn:
-        NEW_COLUMN_NAME = "single_end"
         new_file_rows = []

-        with open(input_file_path, "r") as input_file:
-            csv_reader = csv.reader(input_file, delimiter=",")
-            self.headers = next(csv_reader)
-            self.headers.append(NEW_COLUMN_NAME)
-            self._infer_column_indexes()
+        with open(input_file_path, "r", newline="") as input_file:
+            csv_reader = csv.DictReader(input_file, delimiter=",")
+            self.headers = csv_reader.fieldnames
+            self.headers.append("single_end")

             for samplesheet_row in csv_reader:

                 if self._is_paired_end_short_read(samplesheet_row):
-                    new_file_rows.append([*samplesheet_row, "0"])
+                    samplesheet_row[ColumnNames.SINGLE_END] = "0"
+                    new_file_rows.append(samplesheet_row.values())

                 elif self._is_single_end_short_long_read(samplesheet_row):
-                    new_file_rows.append([*samplesheet_row, "1"])
+                    samplesheet_row[ColumnNames.SINGLE_END] = "1"
+                    new_file_rows.append(samplesheet_row.values())

                 elif self._is_single_end_long_read(samplesheet_row):
-                    new_file_rows.append([*samplesheet_row, "1"])
+                    samplesheet_row[ColumnNames.SINGLE_END] = "1"
+                    new_file_rows.append(samplesheet_row.values())

                 elif self._is_error_row(samplesheet_row):
-                    self.print_error(
+                    logging.error(
                         "FastQ and FastA files cannot be specified together in the same library!",
                         "Line",
-                        ",".join(samplesheet_row),
+                        ",".join(samplesheet_row.values()),
                     )
                 else:
-                    self.print_error("Invalid combination of columns provided!", "Line", ",".join(samplesheet_row))
+                    logging.error(
+                        "Invalid combination of columns provided!", "Line", ",".join(samplesheet_row.values())
+                    )

-        self.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)
+        ReadsModifier.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)

-    def _get_row_values(self, samplesheet_row):
+    def _get_row_values(self, samplesheet_row: dict):
         """
-        This method extracts data from the columns for given row of samplesheet table, based on
-        previously infered column indexes.
+        This method extracts data from the columns for given row of samplesheet table.
         """
-        sample = samplesheet_row[self.sample_index]
-        fastq_1 = samplesheet_row[self.fastq_1_index] if self.fastq_1_index else None
-        fastq_2 = samplesheet_row[self.fastq_2_index] if self.fastq_2_index else None
-        fasta = samplesheet_row[self.fasta_index] if self.fasta_index else None
-        return sample, fastq_1, fastq_2, fasta
+        return (
+            samplesheet_row.get(ColumnNames.SAMPLE),
+            samplesheet_row.get(ColumnNames.FASTQ_1),
+            samplesheet_row.get(ColumnNames.FASTQ_2),
+            samplesheet_row.get(ColumnNames.FASTA),
+        )

-    def _infer_column_indexes(self):
-        """
-        This method infers indexes of necessary columns from samplesheet table
-        """
-        self.sample_index = self.headers.index("sample")
-        self.fastq_1_index = self.headers.index("fastq_1") if "fastq_1" in self.headers else None
-        self.fastq_2_index = self.headers.index("fastq_2") if "fastq_2" in self.headers else None
-        self.fasta_index = self.headers.index("fasta") if "fasta" in self.headers else None
-
-    def _is_paired_end_short_read(self, samplesheet_row: List) -> bool:
+    def _is_paired_end_short_read(self, samplesheet_row: dict) -> bool:
         sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
         return sample and fastq_1 and fastq_2

-    def _is_single_end_short_long_read(self, samplesheet_row: List) -> bool:
+    def _is_single_end_short_long_read(self, samplesheet_row: dict) -> bool:
         sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
         return sample and fastq_1 and not fastq_2

-    def _is_single_end_long_read(self, samplesheet_row: List) -> bool:
+    def _is_single_end_long_read(self, samplesheet_row: dict) -> bool:
         sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
         return sample and fasta and not fastq_1 and not fastq_2

-    def _is_error_row(self, samplesheet_row: List) -> bool:
+    def _is_error_row(self, samplesheet_row: dict) -> bool:
         sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
         return fasta and (fastq_1 or fastq_2)

-    @staticmethod
-    def print_error(error: str, context: str = "Line", context_str: str = ""):
-        error_str = "ERROR: Please check samplesheet -> {}".format(error)
-        if context != "" and context_str != "":
-            error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
-                error, context.strip(), context_str.strip()
-            )
-        print(error_str)
-        sys.exit(1)
-
-    @staticmethod
-    def save_reformatted_samplesheet(new_file_rows: List[List], output_file_path: str) -> NoReturn:
+    @classmethod
+    def save_reformatted_samplesheet(cls, new_file_rows: List[List], output_file_path: str) -> NoReturn:
         """
         Write new samplesheet.
         """


@@ -10,13 +10,13 @@
         "type": "object",
         "fa_icon": "fas fa-terminal",
         "description": "Define where the pipeline should find input data and save output data.",
-        "required": [ "input", "outdir", "databases"],
+        "required": ["input", "outdir", "databases"],
         "properties": {
             "input": {
                 "type": "string",
                 "format": "file-path",
                 "mimetype": "text/csv",
-                "pattern": "^\\S+\\.(csv|yaml)$",
+                "pattern": "^\\S+\\.(csv|yaml|yml)$",
                 "schema": "assets/schema_input.json",
                 "description": "Path to comma-separated file containing information about the samples and libraries/runs.",
                 "help_text": "You will need to create a design file with information about the samples and libraries/runs you want to running in your pipeline run. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).",


@@ -9,11 +9,11 @@ include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main'
 workflow INPUT_CHECK {
     take:
     samplesheet_or_pep_config // file: /path/to/samplesheet.csv or /path/to/pep/config.yaml
-    ch_pep_input_base_dir
+    pep_input_base_dir

     main:
-    EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), ch_pep_input_base_dir )
-    converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", ch_pep_input_base_dir )
+    EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir )
+    converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
     parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted )
         .csv
         .splitCsv ( header:true, sep:',' )


@@ -17,23 +17,11 @@ def checkPathParamList = [ params.input, params.databases, params.hostremoval_re
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }

 // Check mandatory parameters
-if ( params.input.endsWith(".yaml") ) {
-    if ( params.input.startsWith("http://") || params.input.startsWith("https://") ) {
-        ch_input = file(params.input)
-        ch_pep_input_base_dir = []
-    }
-    else {
-        ch_input = file(params.input)
-        ch_pep_input_base_dir = new File(params.input).getParent()
-    }
-} else if ( params.input.endsWith(".csv") ) {
-    ch_input = file(params.input)
-    ch_pep_input_base_dir = []
+if ( params.input ) {
+    ch_input = file(params.input, checkIfExists: true)
+    pep_input_base_dir = file(params.input).extension.matches("yaml|yml") ? file(file(params.input).getParent(), checkIfExists: true) : []
 } else {
-    exit 1, 'Input samplesheet or PEP config not specified!'
+    exit 1, "Input samplesheet, or PEP config and base directory not specified"
 }

 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
@@ -116,7 +104,7 @@ workflow TAXPROFILER {
         SUBWORKFLOW: Read in samplesheet, validate and stage input files
     */
     INPUT_CHECK (
-        ch_input, ch_pep_input_base_dir
+        ch_input, pep_input_base_dir
     )

     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
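
Net effect of the rewritten input handling: any params.input is staged with an existence check, and a PEP base directory is derived only when the extension is yaml or yml; CSV samplesheets get the empty-list placeholder that INPUT_CHECK passes through to eido. The same decision expressed in Python for readability (hypothetical helper, not part of the pipeline):

    from pathlib import Path
    from typing import List, Union

    def pep_input_base_dir(input_path: str) -> Union[Path, List]:
        # A PEP config (.yaml/.yml) implies a base directory (its parent);
        # a plain .csv samplesheet gets the [] sentinel the workflow uses.
        path = Path(input_path)
        if path.suffix in (".yaml", ".yml"):
            return path.parent
        return []

    assert pep_input_base_dir("pep/config.yaml") == Path("pep")
    assert pep_input_base_dir("samplesheet.csv") == []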