mirror of
https://github.com/MillironX/taxprofiler.git
synced 2024-11-21 16:06:05 +00:00
add more columns to samplesheet
This commit is contained in:
parent
12e7b428f2
commit
f867c057a4
1 changed files with 118 additions and 20 deletions
|
@ -3,6 +3,7 @@
|
|||
# TODO nf-core: Update the script to check the samplesheet
|
||||
# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
|
||||
|
||||
from distutils import extension
|
||||
import os
|
||||
import sys
|
||||
import errno
|
||||
|
@ -10,7 +11,9 @@ import argparse
|
|||
|
||||
|
||||
def parse_args(args=None):
|
||||
Description = "Reformat nf-core/taxprofiler samplesheet file and check its contents."
|
||||
Description = (
|
||||
"Reformat nf-core/taxprofiler samplesheet file and check its contents."
|
||||
)
|
||||
Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"
|
||||
|
||||
parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
|
||||
|
@ -43,25 +46,62 @@ def check_samplesheet(file_in, file_out):
|
|||
"""
|
||||
This function checks that the samplesheet follows the following structure:
|
||||
|
||||
sample,fastq_1,fastq_2
|
||||
SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
|
||||
SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
|
||||
SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
|
||||
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
|
||||
2611,ERR5766174,ILLUMINA,NA,NA,ERX5474930_ERR5766174_1.fa.gz
|
||||
2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,NA
|
||||
2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,NA,NA
|
||||
2613,ERR5766181,ILLUMINA,ERX5474930_ERR5766174_1.fastq.gz,ERX5474930_ERR5766174_2.fastq.gz,NA
|
||||
|
||||
For an example see:
|
||||
https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
|
||||
"""
|
||||
|
||||
FQ_EXTENSIONS = (".fq", ".fq.gz", ".fastq", ".fastq.gz")
|
||||
FA_EXTENSIONS = (
|
||||
".fa",
|
||||
".fa.gz",
|
||||
".fasta",
|
||||
".fasta.gz",
|
||||
".fna",
|
||||
".fna.gz",
|
||||
".fas",
|
||||
".fas.gz",
|
||||
)
|
||||
INSTRUMENT_PLATFORMS = [
|
||||
"ABI_SOLID",
|
||||
"BGISEQ",
|
||||
"CAPILLARY",
|
||||
"COMPLETE_GENOMICS",
|
||||
"DNBSEQ",
|
||||
"HELICOS",
|
||||
"ILLUMINA",
|
||||
"ION_TORRENT",
|
||||
"LS454",
|
||||
"OXFORD_NANOPORE",
|
||||
"PACBIO_SMRT",
|
||||
]
|
||||
|
||||
sample_mapping_dict = {}
|
||||
with open(file_in, "r") as fin:
|
||||
|
||||
## Check header
|
||||
MIN_COLS = 2
|
||||
MIN_COLS = 4
|
||||
# TODO nf-core: Update the column names for the input samplesheet
|
||||
HEADER = ["sample", "fastq_1", "fastq_2"]
|
||||
HEADER = [
|
||||
"sample",
|
||||
"run_accession",
|
||||
"instrument_platform",
|
||||
"fastq_1",
|
||||
"fastq_2",
|
||||
"fasta",
|
||||
]
|
||||
header = [x.strip('"') for x in fin.readline().strip().split(",")]
|
||||
if header[: len(HEADER)] != HEADER:
|
||||
print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
|
||||
print(
|
||||
"ERROR: Please check samplesheet header -> {} != {}".format(
|
||||
",".join(header), ",".join(HEADER)
|
||||
)
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
## Check sample entries
|
||||
|
@ -78,13 +118,22 @@ def check_samplesheet(file_in, file_out):
|
|||
num_cols = len([x for x in lspl if x])
|
||||
if num_cols < MIN_COLS:
|
||||
print_error(
|
||||
"Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
|
||||
"Invalid number of populated columns (minimum = {})!".format(
|
||||
MIN_COLS
|
||||
),
|
||||
"Line",
|
||||
line,
|
||||
)
|
||||
|
||||
## Check sample name entries
|
||||
sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
|
||||
(
|
||||
sample,
|
||||
run_accession,
|
||||
instrument_platform,
|
||||
fastq_1,
|
||||
fastq_2,
|
||||
fasta,
|
||||
) = lspl[: len(HEADER)]
|
||||
sample = sample.replace(" ", "_")
|
||||
if not sample:
|
||||
print_error("Sample entry has not been specified!", "Line", line)
|
||||
|
@ -94,23 +143,55 @@ def check_samplesheet(file_in, file_out):
|
|||
if fastq:
|
||||
if fastq.find(" ") != -1:
|
||||
print_error("FastQ file contains spaces!", "Line", line)
|
||||
if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
|
||||
if not fastq.endswith(FQ_EXTENSIONS):
|
||||
print_error(
|
||||
"FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
|
||||
f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !",
|
||||
"Line",
|
||||
line,
|
||||
)
|
||||
if fasta:
|
||||
if fasta.find(" ") != -1:
|
||||
print_error("FastA file contains spaces!", "Line", line)
|
||||
if not fasta.endswith(FA_EXTENSIONS):
|
||||
print_error(
|
||||
f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!",
|
||||
"Line",
|
||||
line,
|
||||
)
|
||||
sample_info = []
|
||||
|
||||
# Check run_accession
|
||||
if not run_accession:
|
||||
print_error("Run accession has not been specified!", "Line", line)
|
||||
else:
|
||||
sample_info.append(run_accession)
|
||||
|
||||
# Check instrument_platform
|
||||
if not instrument_platform:
|
||||
print_error("Instrument platform has not been specified!", "Line", line)
|
||||
else:
|
||||
if instrument_platform not in INSTRUMENT_PLATFORMS:
|
||||
print_error(
|
||||
f"Instrument platform {instrument_platform} is not supported!",
|
||||
f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}",
|
||||
"Line",
|
||||
line,
|
||||
)
|
||||
sample_info.append(instrument_platform)
|
||||
|
||||
## Auto-detect paired-end/single-end
|
||||
sample_info = [] ## [single_end, fastq_1, fastq_2]
|
||||
if sample and fastq_1 and fastq_2: ## Paired-end short reads
|
||||
sample_info = ["0", fastq_1, fastq_2]
|
||||
sample_info.extend(["0", fastq_1, fastq_2, fasta])
|
||||
elif sample and fastq_1 and not fastq_2: ## Single-end short reads
|
||||
sample_info = ["1", fastq_1, fastq_2]
|
||||
sample_info.extend(["1", fastq_1, fastq_2, fasta])
|
||||
elif (
|
||||
sample and fasta and not fastq_1 and not fastq_2
|
||||
): ## Single-end long reads
|
||||
sample_info.extend(["1", fastq_1, fastq_2, fasta])
|
||||
else:
|
||||
print_error("Invalid combination of columns provided!", "Line", line)
|
||||
|
||||
## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] }
|
||||
## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, single_end, fastq_1, fastq_2, fasta ] }
|
||||
if sample not in sample_mapping_dict:
|
||||
sample_mapping_dict[sample] = [sample_info]
|
||||
else:
|
||||
|
@ -120,19 +201,36 @@ def check_samplesheet(file_in, file_out):
|
|||
sample_mapping_dict[sample].append(sample_info)
|
||||
|
||||
## Write validated samplesheet with appropriate columns
|
||||
HEADER_OUT = [
|
||||
"sample",
|
||||
"run_accession",
|
||||
"instrument_platform",
|
||||
"single_end",
|
||||
"fastq_1",
|
||||
"fastq_2",
|
||||
"fasta",
|
||||
]
|
||||
if len(sample_mapping_dict) > 0:
|
||||
out_dir = os.path.dirname(file_out)
|
||||
make_dir(out_dir)
|
||||
with open(file_out, "w") as fout:
|
||||
fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
|
||||
fout.write(",".join(HEADER_OUT) + "\n")
|
||||
for sample in sorted(sample_mapping_dict.keys()):
|
||||
|
||||
## Check that multiple runs of the same sample are of the same datatype
|
||||
if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
|
||||
print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample))
|
||||
if not all(
|
||||
x[0] == sample_mapping_dict[sample][0][0]
|
||||
for x in sample_mapping_dict[sample]
|
||||
):
|
||||
print_error(
|
||||
"Multiple runs of a sample must be of the same datatype!",
|
||||
"Sample: {}".format(sample),
|
||||
)
|
||||
|
||||
for idx, val in enumerate(sample_mapping_dict[sample]):
|
||||
fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n")
|
||||
fout.write(
|
||||
",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n"
|
||||
)
|
||||
else:
|
||||
print_error("No entries to process!", "Samplesheet: {}".format(file_in))
|
||||
|
||||
|
|
Loading…
Reference in a new issue