mirror of
https://github.com/MillironX/taxprofiler.git
synced 2024-11-25 17:09:54 +00:00
Update workflows/taxprofiler.nf
Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>
This commit is contained in:
parent
0ccbf50938
commit
bfd260e9c8
4 changed files with 56 additions and 76 deletions
|
@ -1,20 +1,29 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import csv
|
import csv
|
||||||
|
import logging
|
||||||
import sys
|
import sys
|
||||||
from typing import List, NoReturn
|
from enum import Enum
|
||||||
|
from typing import List, NoReturn, Optional
|
||||||
|
|
||||||
|
|
||||||
def parse_args(args=None) -> argparse.Namespace:
|
class ColumnNames(str, Enum):
|
||||||
|
SAMPLE = "sample"
|
||||||
|
FASTQ_1 = "fastq_1"
|
||||||
|
FASTQ_2 = "fastq_2"
|
||||||
|
FASTA = "fasta"
|
||||||
|
SINGLE_END = "single_end"
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
|
||||||
"""
|
"""
|
||||||
Reformatting is based on detecting whether the reads are paired or single end.
|
Reformatting is based on detecting whether the reads are paired or single end.
|
||||||
Script appends appropriate column to samplesheet.csv file.
|
Script appends appropriate column to samplesheet.csv file.
|
||||||
"""
|
"""
|
||||||
Description = "Reformat nf-core/taxprofiler samplesheet file."
|
parser = argparse.ArgumentParser(
|
||||||
Epilog = "Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>"
|
description="Reformat nf-core/taxprofiler samplesheet file.",
|
||||||
|
epilog="Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>",
|
||||||
parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
|
)
|
||||||
parser.add_argument("FILE_IN", help="Input samplesheet file.")
|
parser.add_argument("FILE_IN", help="Input samplesheet file.")
|
||||||
parser.add_argument("FILE_OUT", help="Output file.")
|
parser.add_argument("FILE_OUT", help="Output file.")
|
||||||
return parser.parse_args(args)
|
return parser.parse_args(args)
|
||||||
|
@ -29,86 +38,69 @@ class ReadsModifier:
|
||||||
self.fasta_index = None
|
self.fasta_index = None
|
||||||
|
|
||||||
def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> NoReturn:
|
def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> NoReturn:
|
||||||
NEW_COLUMN_NAME = "single_end"
|
|
||||||
new_file_rows = []
|
new_file_rows = []
|
||||||
|
|
||||||
with open(input_file_path, "r") as input_file:
|
with open(input_file_path, "r", newline="") as input_file:
|
||||||
csv_reader = csv.reader(input_file, delimiter=",")
|
csv_reader = csv.DictReader(input_file, delimiter=",")
|
||||||
self.headers = next(csv_reader)
|
self.headers = csv_reader.fieldnames
|
||||||
self.headers.append(NEW_COLUMN_NAME)
|
self.headers.append("single_end")
|
||||||
|
|
||||||
self._infer_column_indexes()
|
|
||||||
|
|
||||||
for samplesheet_row in csv_reader:
|
for samplesheet_row in csv_reader:
|
||||||
|
|
||||||
if self._is_paired_end_short_read(samplesheet_row):
|
if self._is_paired_end_short_read(samplesheet_row):
|
||||||
new_file_rows.append([*samplesheet_row, "0"])
|
samplesheet_row[ColumnNames.SINGLE_END] = "0"
|
||||||
|
new_file_rows.append(samplesheet_row.values())
|
||||||
|
|
||||||
elif self._is_single_end_short_long_read(samplesheet_row):
|
elif self._is_single_end_short_long_read(samplesheet_row):
|
||||||
new_file_rows.append([*samplesheet_row, "1"])
|
samplesheet_row[ColumnNames.SINGLE_END] = "1"
|
||||||
|
new_file_rows.append(samplesheet_row.values())
|
||||||
|
|
||||||
elif self._is_single_end_long_read(samplesheet_row):
|
elif self._is_single_end_long_read(samplesheet_row):
|
||||||
new_file_rows.append([*samplesheet_row, "1"])
|
samplesheet_row[ColumnNames.SINGLE_END] = "1"
|
||||||
|
new_file_rows.append(samplesheet_row.values())
|
||||||
|
|
||||||
elif self._is_error_row(samplesheet_row):
|
elif self._is_error_row(samplesheet_row):
|
||||||
self.print_error(
|
logging.error(
|
||||||
"FastQ and FastA files cannot be specified together in the same library!",
|
"FastQ and FastA files cannot be specified together in the same library!",
|
||||||
"Line",
|
"Line",
|
||||||
",".join(samplesheet_row),
|
",".join(samplesheet_row.values()),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
self.print_error("Invalid combination of columns provided!", "Line", ",".join(samplesheet_row))
|
logging.error(
|
||||||
|
"Invalid combination of columns provided!", "Line", ",".join(samplesheet_row.values())
|
||||||
|
)
|
||||||
|
|
||||||
self.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)
|
ReadsModifier.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)
|
||||||
|
|
||||||
def _get_row_values(self, samplesheet_row):
|
def _get_row_values(self, samplesheet_row: dict):
|
||||||
"""
|
"""
|
||||||
This method extracts data from the columns for given row of samplesheet table, based on
|
This method extracts data from the columns for given row of samplesheet table.
|
||||||
previously infered column indexes.
|
|
||||||
"""
|
"""
|
||||||
sample = samplesheet_row[self.sample_index]
|
return (
|
||||||
fastq_1 = samplesheet_row[self.fastq_1_index] if self.fastq_1_index else None
|
samplesheet_row.get(ColumnNames.SAMPLE),
|
||||||
fastq_2 = samplesheet_row[self.fastq_2_index] if self.fastq_2_index else None
|
samplesheet_row.get(ColumnNames.FASTQ_1),
|
||||||
fasta = samplesheet_row[self.fasta_index] if self.fasta_index else None
|
samplesheet_row.get(ColumnNames.FASTQ_2),
|
||||||
return sample, fastq_1, fastq_2, fasta
|
samplesheet_row.get(ColumnNames.FASTA),
|
||||||
|
)
|
||||||
|
|
||||||
def _infer_column_indexes(self):
|
def _is_paired_end_short_read(self, samplesheet_row: dict) -> bool:
|
||||||
"""
|
|
||||||
This method infers indexes of necessary columns from samplesheet table
|
|
||||||
"""
|
|
||||||
self.sample_index = self.headers.index("sample")
|
|
||||||
self.fastq_1_index = self.headers.index("fastq_1") if "fastq_1" in self.headers else None
|
|
||||||
self.fastq_2_index = self.headers.index("fastq_2") if "fastq_2" in self.headers else None
|
|
||||||
self.fasta_index = self.headers.index("fasta") if "fasta" in self.headers else None
|
|
||||||
|
|
||||||
def _is_paired_end_short_read(self, samplesheet_row: List) -> bool:
|
|
||||||
sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
|
sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
|
||||||
return sample and fastq_1 and fastq_2
|
return sample and fastq_1 and fastq_2
|
||||||
|
|
||||||
def _is_single_end_short_long_read(self, samplesheet_row: List) -> bool:
|
def _is_single_end_short_long_read(self, samplesheet_row: dict) -> bool:
|
||||||
sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
|
sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
|
||||||
return sample and fastq_1 and not fastq_2
|
return sample and fastq_1 and not fastq_2
|
||||||
|
|
||||||
def _is_single_end_long_read(self, samplesheet_row: List) -> bool:
|
def _is_single_end_long_read(self, samplesheet_row: dict) -> bool:
|
||||||
sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
|
sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
|
||||||
return sample and fasta and not fastq_1 and not fastq_2
|
return sample and fasta and not fastq_1 and not fastq_2
|
||||||
|
|
||||||
def _is_error_row(self, samplesheet_row: List) -> bool:
|
def _is_error_row(self, samplesheet_row: dict) -> bool:
|
||||||
sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
|
sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
|
||||||
return fasta and (fastq_1 or fastq_2)
|
return fasta and (fastq_1 or fastq_2)
|
||||||
|
|
||||||
@staticmethod
|
@classmethod
|
||||||
def print_error(error: str, context: str = "Line", context_str: str = ""):
|
def save_reformatted_samplesheet(cls, new_file_rows: List[List], output_file_path: str) -> NoReturn:
|
||||||
error_str = "ERROR: Please check samplesheet -> {}".format(error)
|
|
||||||
if context != "" and context_str != "":
|
|
||||||
error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
|
|
||||||
error, context.strip(), context_str.strip()
|
|
||||||
)
|
|
||||||
print(error_str)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def save_reformatted_samplesheet(new_file_rows: List[List], output_file_path: str) -> NoReturn:
|
|
||||||
"""
|
"""
|
||||||
Write new samplesheet.
|
Write new samplesheet.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -10,13 +10,13 @@
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"fa_icon": "fas fa-terminal",
|
"fa_icon": "fas fa-terminal",
|
||||||
"description": "Define where the pipeline should find input data and save output data.",
|
"description": "Define where the pipeline should find input data and save output data.",
|
||||||
"required": [ "input", "outdir", "databases"],
|
"required": ["input", "outdir", "databases"],
|
||||||
"properties": {
|
"properties": {
|
||||||
"input": {
|
"input": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"format": "file-path",
|
"format": "file-path",
|
||||||
"mimetype": "text/csv",
|
"mimetype": "text/csv",
|
||||||
"pattern": "^\\S+\\.(csv|yaml)$",
|
"pattern": "^\\S+\\.(csv|yaml|yml)$",
|
||||||
"schema": "assets/schema_input.json",
|
"schema": "assets/schema_input.json",
|
||||||
"description": "Path to comma-separated file containing information about the samples and libraries/runs.",
|
"description": "Path to comma-separated file containing information about the samples and libraries/runs.",
|
||||||
"help_text": "You will need to create a design file with information about the samples and libraries/runs you want to running in your pipeline run. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).",
|
"help_text": "You will need to create a design file with information about the samples and libraries/runs you want to running in your pipeline run. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).",
|
||||||
|
|
|
@ -9,11 +9,11 @@ include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main'
|
||||||
workflow INPUT_CHECK {
|
workflow INPUT_CHECK {
|
||||||
take:
|
take:
|
||||||
samplesheet_or_pep_config // file: /path/to/samplesheet.csv or /path/to/pep/config.yaml
|
samplesheet_or_pep_config // file: /path/to/samplesheet.csv or /path/to/pep/config.yaml
|
||||||
ch_pep_input_base_dir
|
pep_input_base_dir
|
||||||
|
|
||||||
main:
|
main:
|
||||||
EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), ch_pep_input_base_dir )
|
EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir )
|
||||||
converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", ch_pep_input_base_dir )
|
converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
|
||||||
parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted )
|
parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted )
|
||||||
.csv
|
.csv
|
||||||
.splitCsv ( header:true, sep:',' )
|
.splitCsv ( header:true, sep:',' )
|
||||||
|
|
|
@ -17,23 +17,11 @@ def checkPathParamList = [ params.input, params.databases, params.hostremoval_re
|
||||||
for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
|
for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
|
||||||
|
|
||||||
// Check mandatory parameters
|
// Check mandatory parameters
|
||||||
if ( params.input.endsWith(".yaml") ) {
|
if ( params.input ) {
|
||||||
|
ch_input = file(params.input, checkIfExists: true)
|
||||||
if ( params.input.startsWith("http://") || params.input.startsWith("https://") ) {
|
pep_input_base_dir = file(params.input).extension.matches("yaml|yml") ? file(file(params.input).getParent(), checkIfExists: true) : []
|
||||||
ch_input = file(params.input)
|
|
||||||
ch_pep_input_base_dir = []
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
ch_input = file(params.input)
|
|
||||||
ch_pep_input_base_dir = new File(params.input).getParent()
|
|
||||||
}
|
|
||||||
|
|
||||||
} else if ( params.input.endsWith(".csv") ) {
|
|
||||||
ch_input = file(params.input)
|
|
||||||
ch_pep_input_base_dir = []
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
exit 1, 'Input samplesheet or PEP config not specified!'
|
exit 1, "Input samplesheet, or PEP config and base directory not specified"
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
|
if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
|
||||||
|
@ -116,7 +104,7 @@ workflow TAXPROFILER {
|
||||||
SUBWORKFLOW: Read in samplesheet, validate and stage input files
|
SUBWORKFLOW: Read in samplesheet, validate and stage input files
|
||||||
*/
|
*/
|
||||||
INPUT_CHECK (
|
INPUT_CHECK (
|
||||||
ch_input, ch_pep_input_base_dir
|
ch_input, pep_input_base_dir
|
||||||
)
|
)
|
||||||
ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
|
ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue