add more columns to samplesheet

2024-12-22 10:08:17 +00:00 · 2022-02-18 11:53:13 +01:00 · 2022-02-18 11:53:13 +01:00 · f867c057a4
commit f867c057a4
parent 12e7b428f2
1 changed files with 118 additions and 20 deletions
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@ -3,6 +3,7 @@
 # TODO nf-core: Update the script to check the samplesheet
 # This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv

+from distutils import extension
 import os
 import sys
 import errno
@ -10,7 +11,9 @@ import argparse


 def parse_args(args=None):
-    Description = "Reformat nf-core/taxprofiler samplesheet file and check its contents."
+    Description = (
+        "Reformat nf-core/taxprofiler samplesheet file and check its contents."
+    )
    Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"

    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
@ -43,25 +46,62 @@ def check_samplesheet(file_in, file_out):
    """
    This function checks that the samplesheet follows the following structure:

-    sample,fastq_1,fastq_2
-    SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
-    SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
-    SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
+    sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
+    2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz
+    2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,
+    2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,,
+    2613,ERR5766181,ILLUMINA,ERX5474930_ERR5766174_1.fastq.gz,ERX5474930_ERR5766174_2.fastq.gz,

    For an example see:
    https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
    """

+    FQ_EXTENSIONS = (".fq", ".fq.gz", ".fastq", ".fastq.gz")
+    FA_EXTENSIONS = (
+        ".fa",
+        ".fa.gz",
+        ".fasta",
+        ".fasta.gz",
+        ".fna",
+        ".fna.gz",
+        ".fas",
+        ".fas.gz",
+    )
+    INSTRUMENT_PLATFORMS = [
+        "ABI_SOLID",
+        "BGISEQ",
+        "CAPILLARY",
+        "COMPLETE_GENOMICS",
+        "DNBSEQ",
+        "HELICOS",
+        "ILLUMINA",
+        "ION_TORRENT",
+        "LS454",
+        "OXFORD_NANOPORE",
+        "PACBIO_SMRT",
+    ]
+
    sample_mapping_dict = {}
    with open(file_in, "r") as fin:

        ## Check header
-        MIN_COLS = 2
+        MIN_COLS = 4
        # TODO nf-core: Update the column names for the input samplesheet
-        HEADER = ["sample", "fastq_1", "fastq_2"]
+        HEADER = [
+            "sample",
+            "run_accession",
+            "instrument_platform",
+            "fastq_1",
+            "fastq_2",
+            "fasta",
+        ]
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(HEADER)] != HEADER:
-            print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
+            print(
+                "ERROR: Please check samplesheet header -> {} != {}".format(
+                    ",".join(header), ",".join(HEADER)
+                )
+            )
            sys.exit(1)

        ## Check sample entries
@ -78,13 +118,22 @@ def check_samplesheet(file_in, file_out):
            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error(
-                    "Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
+                    "Invalid number of populated columns (minimum = {})!".format(
+                        MIN_COLS
+                    ),
                    "Line",
                    line,
                )

            ## Check sample name entries
-            sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
+            (
+                sample,
+                run_accession,
+                instrument_platform,
+                fastq_1,
+                fastq_2,
+                fasta,
+            ) = lspl[: len(HEADER)]
            sample = sample.replace(" ", "_")
            if not sample:
                print_error("Sample entry has not been specified!", "Line", line)
@ -94,23 +143,55 @@ def check_samplesheet(file_in, file_out):
                if fastq:
                    if fastq.find(" ") != -1:
                        print_error("FastQ file contains spaces!", "Line", line)
-                    if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
+                    if not fastq.endswith(FQ_EXTENSIONS):
                        print_error(
-                            "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
+                            f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !",
                            "Line",
                            line,
                        )
+            if fasta:
+                if fasta.find(" ") != -1:
+                    print_error("FastA file contains spaces!", "Line", line)
+                if not fasta.endswith(FA_EXTENSIONS):
+                    print_error(
+                        f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!",
+                        "Line",
+                        line,
+                    )
+            sample_info = []
+
+            # Check run_accession
+            if not run_accession:
+                print_error("Run accession has not been specified!", "Line", line)
+            else:
+                sample_info.append(run_accession)
+
+            # Check instrument_platform
+            if not instrument_platform:
+                print_error("Instrument platform has not been specified!", "Line", line)
+            else:
+                if instrument_platform not in INSTRUMENT_PLATFORMS:
+                    print_error(
+                        f"Instrument platform {instrument_platform} is not supported!",
+                        f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}",
+                        "Line",
+                        line,
+                    )
+                sample_info.append(instrument_platform)

            ## Auto-detect paired-end/single-end
-            sample_info = []  ## [single_end, fastq_1, fastq_2]
            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
-                sample_info = ["0", fastq_1, fastq_2]
+                sample_info.extend(["0", fastq_1, fastq_2, fasta])
            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
-                sample_info = ["1", fastq_1, fastq_2]
+                sample_info.extend(["1", fastq_1, fastq_2, fasta])
+            elif (
+                sample and fasta and not fastq_1 and not fastq_2
+            ):  ## Single-end long reads
+                sample_info.extend(["1", fastq_1, fastq_2, fasta])
            else:
                print_error("Invalid combination of columns provided!", "Line", line)

-            ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] }
+            ## Create sample mapping dictionary = { sample: [ [ run_accession, instrument_platform, single_end, fastq_1, fastq_2, fasta ], ... ] }
            if sample not in sample_mapping_dict:
                sample_mapping_dict[sample] = [sample_info]
            else:
@ -120,19 +201,36 @@ def check_samplesheet(file_in, file_out):
                    sample_mapping_dict[sample].append(sample_info)

    ## Write validated samplesheet with appropriate columns
+    HEADER_OUT = [
+        "sample",
+        "run_accession",
+        "instrument_platform",
+        "single_end",
+        "fastq_1",
+        "fastq_2",
+        "fasta",
+    ]
    if len(sample_mapping_dict) > 0:
        out_dir = os.path.dirname(file_out)
        make_dir(out_dir)
        with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
+            fout.write(",".join(HEADER_OUT) + "\n")
            for sample in sorted(sample_mapping_dict.keys()):

                ## Check that multiple runs of the same sample are of the same datatype
-                if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
-                    print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample))
+                if not all(
+                    x[0] == sample_mapping_dict[sample][0][0]
+                    for x in sample_mapping_dict[sample]
+                ):
+                    print_error(
+                        "Multiple runs of a sample must be of the same datatype!",
+                        "Sample: {}".format(sample),
+                    )

                for idx, val in enumerate(sample_mapping_dict[sample]):
-                    fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n")
+                    fout.write(
+                        ",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n"
+                    )
    else:
        print_error("No entries to process!", "Samplesheet: {}".format(file_in))