add more columns to samplesheet

2024-11-22 11:09:55 +00:00 · 2022-02-18 11:53:13 +01:00 · 2022-02-18 11:53:13 +01:00 · f867c057a4
commit f867c057a4
parent 12e7b428f2
1 changed files with 118 additions and 20 deletions
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@ -3,6 +3,7 @@
 # TODO nf-core: Update the script to check the samplesheet
 # This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
 from distutils import extension
 import os
 import sys
 import errno
@ -10,7 +11,9 @@ import argparse
 def parse_args(args=None):
-    Description = "Reformat nf-core/taxprofiler samplesheet file and check its contents."
+    Description = (
        "Reformat nf-core/taxprofiler samplesheet file and check its contents."
    )
    Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"
    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
@ -43,25 +46,62 @@ def check_samplesheet(file_in, file_out):
    """
    This function checks that the samplesheet has the following structure:
-    sample,fastq_1,fastq_2
+    sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
-    SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
+    2611,ERR5766174,ILLUMINA,NA,NA,ERX5474930_ERR5766174_1.fa.gz
-    SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
+    2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,NA
-    SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
+    2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,NA,NA
    2613,ERR5766181,ILLUMINA,ERX5474930_ERR5766174_1.fa.gz,ERX5474930_ERR5766174_2.fa.gz,NA
    For an example see:
    https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
    """
    FQ_EXTENSIONS = (".fq", ".fq.gz", ".fastq", ".fastq.gz")
    FA_EXTENSIONS = (
        ".fa",
        ".fa.gz",
        ".fasta",
        ".fasta.gz",
        ".fna",
        ".fna.gz",
        ".fas",
        ".fas.gz",
    )
    INSTRUMENT_PLATFORMS = [
        "ABI_SOLID",
        "BGISEQ",
        "CAPILLARY",
        "COMPLETE_GENOMICS",
        "DNBSEQ",
        "HELICOS",
        "ILLUMINA",
        "ION_TORRENT",
        "LS454",
        "OXFORD_NANOPORE",
        "PACBIO_SMRT",
    ]
    sample_mapping_dict = {}
    with open(file_in, "r") as fin:
        ## Check header
-        MIN_COLS = 2
+        MIN_COLS = 4
        # TODO nf-core: Update the column names for the input samplesheet
-        HEADER = ["sample", "fastq_1", "fastq_2"]
+        HEADER = [
            "sample",
            "run_accession",
            "instrument_platform",
            "fastq_1",
            "fastq_2",
            "fasta",
        ]
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(HEADER)] != HEADER:
-            print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
+            print(
                "ERROR: Please check samplesheet header -> {} != {}".format(
                    ",".join(header), ",".join(HEADER)
                )
            )
            sys.exit(1)
        ## Check sample entries
@ -78,13 +118,22 @@ def check_samplesheet(file_in, file_out):
            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error(
-                    "Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
+                    "Invalid number of populated columns (minimum = {})!".format(
                        MIN_COLS
                    ),
                    "Line",
                    line,
                )
            ## Check sample name entries
-            sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
+            (
                sample,
                run_accession,
                instrument_platform,
                fastq_1,
                fastq_2,
                fasta,
            ) = lspl[: len(HEADER)]
            sample = sample.replace(" ", "_")
            if not sample:
                print_error("Sample entry has not been specified!", "Line", line)
@ -94,23 +143,55 @@ def check_samplesheet(file_in, file_out):
                if fastq:
                    if fastq.find(" ") != -1:
                        print_error("FastQ file contains spaces!", "Line", line)
-                    if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
+                    if not fastq.endswith(FQ_EXTENSIONS):
                        print_error(
-                            "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
+                            f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !",
                            "Line",
                            line,
                        )
            if fasta:
                if fasta.find(" ") != -1:
                    print_error("FastA file contains spaces!", "Line", line)
                if not fasta.endswith(FA_EXTENSIONS):
                    print_error(
                        f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!",
                        "Line",
                        line,
                    )
            sample_info = []
            # Check run_accession
            if not run_accession:
                print_error("Run accession has not been specified!", "Line", line)
            else:
                sample_info.append(run_accession)
            # Check instrument_platform
            if not instrument_platform:
                print_error("Instrument platform has not been specified!", "Line", line)
            else:
                if instrument_platform not in INSTRUMENT_PLATFORMS:
                    print_error(
                        f"Instrument platform {instrument_platform} is not supported!",
                        f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}",
                        "Line",
                        line,
                    )
                sample_info.append(instrument_platform)
            ## Auto-detect paired-end/single-end
            sample_info = []  ## [single_end, fastq_1, fastq_2]
            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
-                sample_info = ["0", fastq_1, fastq_2]
+                sample_info.extend(["0", fastq_1, fastq_2, fasta])
            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
-                sample_info = ["1", fastq_1, fastq_2]
+                sample_info.extend(["1", fastq_1, fastq_2, fasta])
            elif (
                sample and fasta and not fastq_1 and not fastq_2
            ):  ## Single-end long reads
                sample_info.extend(["1", fastq_1, fastq_2, fasta])
            else:
                print_error("Invalid combination of columns provided!", "Line", line)
-            ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] }
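+            ## Create sample mapping dictionary = { sample: [ [ single_end, fastq_1, fastq_2, fasta ], ... ] }
            ## NOTE(review): run_accession and instrument_platform are appended to sample_info earlier, but the
            ## later `sample_info = []` reset discards them, so written rows do not match HEADER_OUT -- confirm and fix.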
            if sample not in sample_mapping_dict:
                sample_mapping_dict[sample] = [sample_info]
            else:
@ -120,19 +201,36 @@ def check_samplesheet(file_in, file_out):
                    sample_mapping_dict[sample].append(sample_info)
    ## Write validated samplesheet with appropriate columns
    HEADER_OUT = [
        "sample",
        "run_accession",
        "instrument_platform",
        "single_end",
        "fastq_1",
        "fastq_2",
        "fasta",
    ]
    if len(sample_mapping_dict) > 0:
        out_dir = os.path.dirname(file_out)
        make_dir(out_dir)
        with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
+            fout.write(",".join(HEADER_OUT) + "\n")
            for sample in sorted(sample_mapping_dict.keys()):
                ## Check that multiple runs of the same sample are of the same datatype
-                if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
+                if not all(
-                    print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample))
+                    x[0] == sample_mapping_dict[sample][0][0]
                    for x in sample_mapping_dict[sample]
                ):
                    print_error(
                        "Multiple runs of a sample must be of the same datatype!",
                        "Sample: {}".format(sample),
                    )
                for idx, val in enumerate(sample_mapping_dict[sample]):
-                    fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n")
+                    fout.write(
                        ",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n"
                    )
    else:
        print_error("No entries to process!", "Samplesheet: {}".format(file_in))