mirror of
https://github.com/MillironX/taxprofiler.git
synced 2024-11-25 01:09:55 +00:00
Remove detect_reads.py and replace remaining checks with nextflow code instead
This commit is contained in:
parent
1584d6fc51
commit
43a8aa4405
5 changed files with 27 additions and 169 deletions
|
@ -53,3 +53,5 @@ properties:
|
||||||
- fasta
|
- fasta
|
||||||
required:
|
required:
|
||||||
- samples
|
- samples
|
||||||
|
- run_accession
|
||||||
|
- instrument_platform
|
||||||
|
|
|
@ -1,125 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import csv
|
|
||||||
import sys
|
|
||||||
from typing import List, NoReturn
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args(args=None) -> argparse.Namespace:
    """
    Parse the command-line arguments of the samplesheet reformatting script.

    The script detects whether the reads of each samplesheet row are paired or
    single end and appends the appropriate column to the samplesheet CSV file;
    this function only collects the two file paths it needs for that.

    Args:
        args: Sequence of argument strings to parse. When ``None``, argparse
            falls back to the process command line.

    Returns:
        Namespace carrying ``FILE_IN`` (input samplesheet path) and
        ``FILE_OUT`` (output file path).
    """
    description = "Reformat nf-core/taxprofiler samplesheet file."
    epilog = "Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>"

    parser = argparse.ArgumentParser(description=description, epilog=epilog)
    parser.add_argument("FILE_IN", help="Input samplesheet file.")
    parser.add_argument("FILE_OUT", help="Output file.")
    return parser.parse_args(args)
|
|
||||||
|
|
||||||
|
|
||||||
class ReadsModifier:
    """
    Append a ``single_end`` column to a samplesheet CSV file.

    Each data row is classified from its ``sample``, ``fastq_1``, ``fastq_2``
    and ``fasta`` cells:

    * ``fasta`` together with any FastQ file -> error, the script exits;
    * ``fastq_1`` and ``fastq_2``            -> ``single_end`` = "0";
    * ``fastq_1`` only, or ``fasta`` only    -> ``single_end`` = "1";
    * anything else                          -> error, the script exits.
    """

    def __init__(self):
        # Header row of the samplesheet; set once the input file is read.
        self.headers = None
        # Positions of the relevant columns inside ``headers``. The three
        # optional columns stay ``None`` when the samplesheet lacks them.
        self.sample_index = None
        self.fastq_1_index = None
        self.fastq_2_index = None
        self.fasta_index = None

    def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> None:
        """
        Read the samplesheet, classify every row and write the extended copy.

        Args:
            input_file_path: Path of the comma-separated samplesheet to read.
            output_file_path: Path the reformatted samplesheet is written to.

        Raises:
            SystemExit: With code 1 (via ``print_error``) on the first row that
                mixes FastA and FastQ files or has no valid file combination.
            ValueError: If the header row has no ``sample`` column.
        """
        NEW_COLUMN_NAME = "single_end"
        new_file_rows = []

        # newline="" lets the csv module do its own newline handling, as the
        # csv documentation requires for file objects.
        with open(input_file_path, "r", newline="") as input_file:
            csv_reader = csv.reader(input_file, delimiter=",")
            self.headers = next(csv_reader)
            self.headers.append(NEW_COLUMN_NAME)

            self._infer_column_indexes()

            for samplesheet_row in csv_reader:
                # The FastA/FastQ conflict must be tested first: the read-type
                # predicates below do not look at the fasta column, so they
                # would otherwise accept a conflicting row silently.
                if self._is_error_row(samplesheet_row):
                    self.print_error(
                        "FastQ and FastA files cannot be specified together in the same library!",
                        "Line",
                        ",".join(samplesheet_row),
                    )
                elif self._is_paired_end_short_read(samplesheet_row):
                    new_file_rows.append([*samplesheet_row, "0"])
                elif self._is_single_end_short_long_read(samplesheet_row) or self._is_single_end_long_read(
                    samplesheet_row
                ):
                    new_file_rows.append([*samplesheet_row, "1"])
                else:
                    self.print_error("Invalid combination of columns provided!", "Line", ",".join(samplesheet_row))

        self.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)

    def _get_row_values(self, samplesheet_row):
        """
        Extract the sample, fastq_1, fastq_2 and fasta cells of one row, based
        on the previously inferred column indexes.

        A cell is reported as ``None`` when its column is absent from the
        header or when the row is too short to contain it.
        """

        def cell(index):
            # Compare against None explicitly: index 0 is a valid column.
            if index is None or index >= len(samplesheet_row):
                return None
            return samplesheet_row[index]

        return (
            cell(self.sample_index),
            cell(self.fastq_1_index),
            cell(self.fastq_2_index),
            cell(self.fasta_index),
        )

    def _infer_column_indexes(self):
        """
        Infer the indexes of the necessary columns from the header row.

        Raises:
            ValueError: If the mandatory ``sample`` column is missing.
        """

        def optional_position(column):
            return self.headers.index(column) if column in self.headers else None

        self.sample_index = self.headers.index("sample")
        self.fastq_1_index = optional_position("fastq_1")
        self.fastq_2_index = optional_position("fastq_2")
        self.fasta_index = optional_position("fasta")

    def _is_paired_end_short_read(self, samplesheet_row: List) -> bool:
        """Return True when the row has a sample plus both FastQ files."""
        sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
        return bool(sample and fastq_1 and fastq_2)

    def _is_single_end_short_long_read(self, samplesheet_row: List) -> bool:
        """Return True when the row has a sample and fastq_1 but no fastq_2."""
        sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
        return bool(sample and fastq_1 and not fastq_2)

    def _is_single_end_long_read(self, samplesheet_row: List) -> bool:
        """Return True when the row has a sample and a FastA file but no FastQ file."""
        sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
        return bool(sample and fasta and not fastq_1 and not fastq_2)

    def _is_error_row(self, samplesheet_row: List) -> bool:
        """Return True when the row names a FastA file together with any FastQ file."""
        _, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
        return bool(fasta and (fastq_1 or fastq_2))

    @staticmethod
    def print_error(error: str, context: str = "Line", context_str: str = "") -> NoReturn:
        """
        Print a samplesheet error message and terminate with exit code 1.

        Args:
            error: Description of the problem.
            context: Label printed before the offending content.
            context_str: The offending content; the context line is only
                printed when both ``context`` and ``context_str`` are non-empty.
        """
        error_str = f"ERROR: Please check samplesheet -> {error}"
        if context != "" and context_str != "":
            error_str = f"ERROR: Please check samplesheet -> {error}\n{context.strip()}: '{context_str.strip()}'"
        print(error_str)
        sys.exit(1)

    @staticmethod
    def save_reformatted_samplesheet(new_file_rows: List[List], output_file_path: str) -> None:
        """
        Write the new samplesheet.

        Args:
            new_file_rows: Header row followed by the data rows to write.
            output_file_path: Destination path; an existing file is overwritten.
        """
        # newline="" stops the text layer from translating the line endings
        # that csv.writer emits itself.
        with open(output_file_path, "w", newline="") as output_file:
            csv.writer(output_file).writerows(new_file_rows)
|
|
||||||
|
|
||||||
|
|
||||||
def main(args=None):
    """
    Run the samplesheet reformatting from the command line.

    Args:
        args: Argument strings handed to ``parse_args``; ``None`` means the
            process command line is used.
    """
    parsed = parse_args(args)
    modifier = ReadsModifier()
    modifier.detect_reads_and_reformat(parsed.FILE_IN, parsed.FILE_OUT)


if __name__ == "__main__":
    sys.exit(main())
|
|
|
@ -12,14 +12,6 @@
|
||||||
|
|
||||||
process {
|
process {
|
||||||
|
|
||||||
withName: SAMPLESHEET_CHECK {
|
|
||||||
publishDir = [
|
|
||||||
path: { "${params.outdir}/pipeline_info" },
|
|
||||||
mode: params.publish_dir_mode,
|
|
||||||
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
withName: DATABASE_CHECK {
|
withName: DATABASE_CHECK {
|
||||||
publishDir = [
|
publishDir = [
|
||||||
path: { "${params.outdir}/pipeline_info" },
|
path: { "${params.outdir}/pipeline_info" },
|
||||||
|
|
|
@ -1,27 +0,0 @@
|
||||||
// SAMPLESHEET_CHECK: runs bin/detect_reads.py from the project directory on the
// staged samplesheet and writes the result to samplesheet_validated.csv.
//   input : path samplesheet
//   output: csv      - every '*.csv' file found in the task directory
//           versions - versions.yml, holding the Python version reported by
//                      `python --version` under the process name
// The task runs in the python:3.8.3 Galaxy depot image when the container engine
// is singularity and task.ext.singularity_pull_docker_container is not set, and
// in the quay.io biocontainers python:3.8.3 image otherwise.
process SAMPLESHEET_CHECK {
|
|
||||||
tag "$samplesheet"
|
|
||||||
|
|
||||||
conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
|
|
||||||
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
|
|
||||||
'https://depot.galaxyproject.org/singularity/python:3.8.3' :
|
|
||||||
'quay.io/biocontainers/python:3.8.3' }"
|
|
||||||
|
|
||||||
input:
|
|
||||||
path samplesheet
|
|
||||||
|
|
||||||
output:
|
|
||||||
path '*.csv' , emit: csv
|
|
||||||
path "versions.yml", emit: versions
|
|
||||||
|
|
||||||
script: // detect_reads.py script is bundled with the pipeline, in nf-core/taxprofiler/bin/
|
|
||||||
"""
|
|
||||||
python3 $projectDir/bin/detect_reads.py \\
|
|
||||||
$samplesheet \\
|
|
||||||
samplesheet_validated.csv
|
|
||||||
|
|
||||||
cat <<-END_VERSIONS > versions.yml
|
|
||||||
"${task.process}":
|
|
||||||
python: \$(python --version | sed 's/Python //g')
|
|
||||||
|
|
||||||
END_VERSIONS
|
|
||||||
"""
|
|
||||||
}
|
|
|
@ -2,7 +2,6 @@
|
||||||
// Check input samplesheet and get read channels
|
// Check input samplesheet and get read channels
|
||||||
//
|
//
|
||||||
|
|
||||||
include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check'
|
|
||||||
include { EIDO_VALIDATE } from '../../modules/nf-core/modules/eido/validate/main'
|
include { EIDO_VALIDATE } from '../../modules/nf-core/modules/eido/validate/main'
|
||||||
include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main'
|
include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main'
|
||||||
|
|
||||||
|
@ -12,26 +11,43 @@ workflow INPUT_CHECK {
|
||||||
pep_input_base_dir
|
pep_input_base_dir
|
||||||
|
|
||||||
main:
|
main:
|
||||||
|
ch_versions = Channel.empty()
|
||||||
|
|
||||||
EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir )
|
EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir )
|
||||||
converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
|
ch_versions = ch_versions.mix(EIDO_VALIDATE.out.versions)
|
||||||
parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted )
|
|
||||||
.csv
|
EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
|
||||||
|
ch_versions = ch_versions.mix(EIDO_CONVERT.out.versions)
|
||||||
|
|
||||||
|
ch_parsed_samplesheet = EIDO_CONVERT.out.samplesheet_converted
|
||||||
.splitCsv ( header:true, sep:',' )
|
.splitCsv ( header:true, sep:',' )
|
||||||
|
.map{
|
||||||
|
|
||||||
|
// Checks not supported by EIDO(?)
|
||||||
|
if ( ( it['fastq_1'] != "" || it['fastq_2'] != "" ) && it['fasta'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: FastQ and FastA files cannot be specified together in the same library. Check input samplesheet! Check sample: ${it['sample']}" }
|
||||||
|
if ( it['fastq_1'] == "" && it['fastq_2'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: Input samplesheet has a missing fastq_1 when fastq_2 is specified. Check sample: ${it['sample']}" }
|
||||||
|
|
||||||
|
single_end = it['fastq_2'] == "" ? true : false
|
||||||
|
it['single_end'] = single_end
|
||||||
|
|
||||||
|
[ it ]
|
||||||
|
}
|
||||||
|
.flatten()
|
||||||
.branch {
|
.branch {
|
||||||
fasta: it['fasta'] != ''
|
fasta: it['fasta'] != ''
|
||||||
nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE'
|
nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE'
|
||||||
fastq: true
|
fastq: true
|
||||||
}
|
}
|
||||||
|
|
||||||
parsed_samplesheet.fastq
|
ch_parsed_samplesheet.fastq
|
||||||
.map { create_fastq_channel(it) }
|
.map { create_fastq_channel(it) }
|
||||||
.set { fastq }
|
.set { fastq }
|
||||||
|
|
||||||
parsed_samplesheet.nanopore
|
ch_parsed_samplesheet.nanopore
|
||||||
.map { create_fastq_channel(it) }
|
.map { create_fastq_channel(it) }
|
||||||
.set { nanopore }
|
.set { nanopore }
|
||||||
|
|
||||||
parsed_samplesheet.fasta
|
ch_parsed_samplesheet.fasta
|
||||||
.map { create_fasta_channel(it) }
|
.map { create_fasta_channel(it) }
|
||||||
.set { fasta }
|
.set { fasta }
|
||||||
|
|
||||||
|
@ -39,7 +55,7 @@ workflow INPUT_CHECK {
|
||||||
fastq = fastq ?: [] // channel: [ val(meta), [ reads ] ]
|
fastq = fastq ?: [] // channel: [ val(meta), [ reads ] ]
|
||||||
nanopore = nanopore ?: [] // channel: [ val(meta), [ reads ] ]
|
nanopore = nanopore ?: [] // channel: [ val(meta), [ reads ] ]
|
||||||
fasta = fasta ?: [] // channel: [ val(meta), fasta ]
|
fasta = fasta ?: [] // channel: [ val(meta), fasta ]
|
||||||
versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
|
versions = ch_versions // channel: [ versions.yml ]
|
||||||
}
|
}
|
||||||
|
|
||||||
// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
|
// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
|
||||||
|
@ -69,7 +85,7 @@ def create_fastq_channel(LinkedHashMap row) {
|
||||||
if (!file(row.fastq_2).exists()) {
|
if (!file(row.fastq_2).exists()) {
|
||||||
exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
|
exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
|
||||||
}
|
}
|
||||||
fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
|
fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue