mirror of
https://github.com/MillironX/taxprofiler.git
synced 2024-11-22 11:09:55 +00:00
add more columns to samplesheet
This commit is contained in:
parent
12e7b428f2
commit
f867c057a4
1 changed files with 118 additions and 20 deletions
|
@ -3,6 +3,7 @@
|
||||||
# TODO nf-core: Update the script to check the samplesheet
|
# TODO nf-core: Update the script to check the samplesheet
|
||||||
# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
|
# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
|
||||||
|
|
||||||
|
from distutils import extension
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import errno
|
import errno
|
||||||
|
@ -10,7 +11,9 @@ import argparse
|
||||||
|
|
||||||
|
|
||||||
def parse_args(args=None):
|
def parse_args(args=None):
|
||||||
Description = "Reformat nf-core/taxprofiler samplesheet file and check its contents."
|
Description = (
|
||||||
|
"Reformat nf-core/taxprofiler samplesheet file and check its contents."
|
||||||
|
)
|
||||||
Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"
|
Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
|
parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
|
||||||
|
@ -43,25 +46,62 @@ def check_samplesheet(file_in, file_out):
|
||||||
"""
|
"""
|
||||||
This function checks that the samplesheet follows the following structure:
|
This function checks that the samplesheet follows the following structure:
|
||||||
|
|
||||||
sample,fastq_1,fastq_2
|
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
|
||||||
SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
|
2611,ERR5766174,ILLUMINA,NA,NA,ERX5474930_ERR5766174_1.fa.gz
|
||||||
SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
|
2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,NA
|
||||||
SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
|
2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,NA,NA
|
||||||
|
2613,ERR5766181,ILLUMINA,ERX5474930_ERR5766174_1.fa.gz,ERX5474930_ERR5766174_2.fa.gz,NA
|
||||||
|
|
||||||
For an example see:
|
For an example see:
|
||||||
https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
|
https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
FQ_EXTENSIONS = (".fq", ".fq.gz", ".fastq", ".fastq.gz")
|
||||||
|
FA_EXTENSIONS = (
|
||||||
|
".fa",
|
||||||
|
".fa.gz",
|
||||||
|
".fasta",
|
||||||
|
".fasta.gz",
|
||||||
|
".fna",
|
||||||
|
".fna.gz",
|
||||||
|
".fas",
|
||||||
|
".fas.gz",
|
||||||
|
)
|
||||||
|
INSTRUMENT_PLATFORMS = [
|
||||||
|
"ABI_SOLID",
|
||||||
|
"BGISEQ",
|
||||||
|
"CAPILLARY",
|
||||||
|
"COMPLETE_GENOMICS",
|
||||||
|
"DNBSEQ",
|
||||||
|
"HELICOS",
|
||||||
|
"ILLUMINA",
|
||||||
|
"ION_TORRENT",
|
||||||
|
"LS454",
|
||||||
|
"OXFORD_NANOPORE",
|
||||||
|
"PACBIO_SMRT",
|
||||||
|
]
|
||||||
|
|
||||||
sample_mapping_dict = {}
|
sample_mapping_dict = {}
|
||||||
with open(file_in, "r") as fin:
|
with open(file_in, "r") as fin:
|
||||||
|
|
||||||
## Check header
|
## Check header
|
||||||
MIN_COLS = 2
|
MIN_COLS = 4
|
||||||
# TODO nf-core: Update the column names for the input samplesheet
|
# TODO nf-core: Update the column names for the input samplesheet
|
||||||
HEADER = ["sample", "fastq_1", "fastq_2"]
|
HEADER = [
|
||||||
|
"sample",
|
||||||
|
"run_accession",
|
||||||
|
"instrument_platform",
|
||||||
|
"fastq_1",
|
||||||
|
"fastq_2",
|
||||||
|
"fasta",
|
||||||
|
]
|
||||||
header = [x.strip('"') for x in fin.readline().strip().split(",")]
|
header = [x.strip('"') for x in fin.readline().strip().split(",")]
|
||||||
if header[: len(HEADER)] != HEADER:
|
if header[: len(HEADER)] != HEADER:
|
||||||
print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
|
print(
|
||||||
|
"ERROR: Please check samplesheet header -> {} != {}".format(
|
||||||
|
",".join(header), ",".join(HEADER)
|
||||||
|
)
|
||||||
|
)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
## Check sample entries
|
## Check sample entries
|
||||||
|
@ -78,13 +118,22 @@ def check_samplesheet(file_in, file_out):
|
||||||
num_cols = len([x for x in lspl if x])
|
num_cols = len([x for x in lspl if x])
|
||||||
if num_cols < MIN_COLS:
|
if num_cols < MIN_COLS:
|
||||||
print_error(
|
print_error(
|
||||||
"Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
|
"Invalid number of populated columns (minimum = {})!".format(
|
||||||
|
MIN_COLS
|
||||||
|
),
|
||||||
"Line",
|
"Line",
|
||||||
line,
|
line,
|
||||||
)
|
)
|
||||||
|
|
||||||
## Check sample name entries
|
## Check sample name entries
|
||||||
sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
|
(
|
||||||
|
sample,
|
||||||
|
run_accession,
|
||||||
|
instrument_platform,
|
||||||
|
fastq_1,
|
||||||
|
fastq_2,
|
||||||
|
fasta,
|
||||||
|
) = lspl[: len(HEADER)]
|
||||||
sample = sample.replace(" ", "_")
|
sample = sample.replace(" ", "_")
|
||||||
if not sample:
|
if not sample:
|
||||||
print_error("Sample entry has not been specified!", "Line", line)
|
print_error("Sample entry has not been specified!", "Line", line)
|
||||||
|
@ -94,23 +143,55 @@ def check_samplesheet(file_in, file_out):
|
||||||
if fastq:
|
if fastq:
|
||||||
if fastq.find(" ") != -1:
|
if fastq.find(" ") != -1:
|
||||||
print_error("FastQ file contains spaces!", "Line", line)
|
print_error("FastQ file contains spaces!", "Line", line)
|
||||||
if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
|
if not fastq.endswith(FQ_EXTENSIONS):
|
||||||
print_error(
|
print_error(
|
||||||
"FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
|
f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !",
|
||||||
"Line",
|
"Line",
|
||||||
line,
|
line,
|
||||||
)
|
)
|
||||||
|
if fasta:
|
||||||
|
if fasta.find(" ") != -1:
|
||||||
|
print_error("FastA file contains spaces!", "Line", line)
|
||||||
|
if not fasta.endswith(FA_EXTENSIONS):
|
||||||
|
print_error(
|
||||||
|
f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!",
|
||||||
|
"Line",
|
||||||
|
line,
|
||||||
|
)
|
||||||
|
sample_info = []
|
||||||
|
|
||||||
|
# Check run_accession
|
||||||
|
if not run_accession:
|
||||||
|
print_error("Run accession has not been specified!", "Line", line)
|
||||||
|
else:
|
||||||
|
sample_info.append(run_accession)
|
||||||
|
|
||||||
|
# Check instrument_platform
|
||||||
|
if not instrument_platform:
|
||||||
|
print_error("Instrument platform has not been specified!", "Line", line)
|
||||||
|
else:
|
||||||
|
if instrument_platform not in INSTRUMENT_PLATFORMS:
|
||||||
|
print_error(
|
||||||
|
f"Instrument platform {instrument_platform} is not supported!",
|
||||||
|
f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}",
|
||||||
|
"Line",
|
||||||
|
line,
|
||||||
|
)
|
||||||
|
sample_info.append(instrument_platform)
|
||||||
|
|
||||||
## Auto-detect paired-end/single-end
|
## Auto-detect paired-end/single-end
|
||||||
sample_info = [] ## [single_end, fastq_1, fastq_2]
|
|
||||||
if sample and fastq_1 and fastq_2: ## Paired-end short reads
|
if sample and fastq_1 and fastq_2: ## Paired-end short reads
|
||||||
sample_info = ["0", fastq_1, fastq_2]
|
sample_info.extend(["0", fastq_1, fastq_2, fasta])
|
||||||
elif sample and fastq_1 and not fastq_2: ## Single-end short reads
|
elif sample and fastq_1 and not fastq_2: ## Single-end short reads
|
||||||
sample_info = ["1", fastq_1, fastq_2]
|
sample_info.extend(["1", fastq_1, fastq_2, fasta])
|
||||||
|
elif (
|
||||||
|
sample and fasta and not fastq_1 and not fastq_2
|
||||||
|
): ## Single-end long reads
|
||||||
|
sample_info.extend(["1", fastq_1, fastq_2, fasta])
|
||||||
else:
|
else:
|
||||||
print_error("Invalid combination of columns provided!", "Line", line)
|
print_error("Invalid combination of columns provided!", "Line", line)
|
||||||
|
|
||||||
## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] }
|
## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, single_end, fastq_1, fastq_2, fasta ] }
|
||||||
if sample not in sample_mapping_dict:
|
if sample not in sample_mapping_dict:
|
||||||
sample_mapping_dict[sample] = [sample_info]
|
sample_mapping_dict[sample] = [sample_info]
|
||||||
else:
|
else:
|
||||||
|
@ -120,19 +201,36 @@ def check_samplesheet(file_in, file_out):
|
||||||
sample_mapping_dict[sample].append(sample_info)
|
sample_mapping_dict[sample].append(sample_info)
|
||||||
|
|
||||||
## Write validated samplesheet with appropriate columns
|
## Write validated samplesheet with appropriate columns
|
||||||
|
HEADER_OUT = [
|
||||||
|
"sample",
|
||||||
|
"run_accession",
|
||||||
|
"instrument_platform",
|
||||||
|
"single_end",
|
||||||
|
"fastq_1",
|
||||||
|
"fastq_2",
|
||||||
|
"fasta",
|
||||||
|
]
|
||||||
if len(sample_mapping_dict) > 0:
|
if len(sample_mapping_dict) > 0:
|
||||||
out_dir = os.path.dirname(file_out)
|
out_dir = os.path.dirname(file_out)
|
||||||
make_dir(out_dir)
|
make_dir(out_dir)
|
||||||
with open(file_out, "w") as fout:
|
with open(file_out, "w") as fout:
|
||||||
fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
|
fout.write(",".join(HEADER_OUT) + "\n")
|
||||||
for sample in sorted(sample_mapping_dict.keys()):
|
for sample in sorted(sample_mapping_dict.keys()):
|
||||||
|
|
||||||
## Check that multiple runs of the same sample are of the same datatype
|
## Check that multiple runs of the same sample are of the same datatype
|
||||||
if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
|
if not all(
|
||||||
print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample))
|
x[0] == sample_mapping_dict[sample][0][0]
|
||||||
|
for x in sample_mapping_dict[sample]
|
||||||
|
):
|
||||||
|
print_error(
|
||||||
|
"Multiple runs of a sample must be of the same datatype!",
|
||||||
|
"Sample: {}".format(sample),
|
||||||
|
)
|
||||||
|
|
||||||
for idx, val in enumerate(sample_mapping_dict[sample]):
|
for idx, val in enumerate(sample_mapping_dict[sample]):
|
||||||
fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n")
|
fout.write(
|
||||||
|
",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
print_error("No entries to process!", "Samplesheet: {}".format(file_in))
|
print_error("No entries to process!", "Samplesheet: {}".format(file_in))
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue