Merge pull request #90 from nf-core/samplesheet-check-improvement

Make check_samplesheet.py compatible with nf-core/fetchngs
2024-12-22 10:28:16 +00:00 · 2022-08-23 15:57:59 +01:00 · 2022-08-23 15:57:59 +01:00 · 1821496a49
commit 1821496a49
parent 22e03e73e3 4315141c5f
1 changed files with 20 additions and 5 deletions
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@ -37,6 +37,7 @@ def print_error(error, context="Line", context_str=""):
    print(error_str)
    sys.exit(1)

+
 def check_samplesheet(file_in, file_out):
    """
    This function checks that the samplesheet follows the following structure:
@ -87,17 +88,28 @@ def check_samplesheet(file_in, file_out):
            "fasta",
        ]
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
-        if header[: len(HEADER)] != HEADER:
+
+        ## Check for missing mandatory columns
+        missing_columns = list(set(HEADER) - set(header))
+        if len(missing_columns) > 0:
            print(
-                "ERROR: Please check samplesheet header -> {} != {}".format(
-                    ",".join(header), ",".join(HEADER)
+                "ERROR: Missing required column header -> {}. Note some columns can otherwise be empty. See pipeline documentation (https://nf-co.re/taxprofiler/usage).".format(
+                    ",".join(missing_columns)
                )
            )
            sys.exit(1)

+        ## Find locations of mandatory columns
+        header_locs = {}
+        for i in HEADER:
+            header_locs[i] = header.index(i)
+
        ## Check sample entries
        for line in fin:
-            lspl = [x.strip().strip('"') for x in line.strip().split(",")]
+
+            ## Pull out only relevant columns for downstream checking
+            line_parsed = [x.strip().strip('"') for x in line.strip().split(",")]
+            lspl = [line_parsed[i] for i in header_locs.values()]

            # Check valid number of columns per row
            if len(lspl) < len(HEADER):
@ -117,6 +129,7 @@ def check_samplesheet(file_in, file_out):
                )

            ## Check sample name entries
+
            (
                sample,
                run_accession,
@ -173,7 +186,9 @@ def check_samplesheet(file_in, file_out):
            ## Auto-detect paired-end/single-end
            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
                sample_info.extend(["0", fastq_1, fastq_2, fasta])
-            elif sample and fastq_1 and not fastq_2:  ## Single-end short/long fastq reads
+            elif (
+                sample and fastq_1 and not fastq_2
+            ):  ## Single-end short/long fastq reads
                sample_info.extend(["1", fastq_1, fastq_2, fasta])
            elif (
                sample and fasta and not fastq_1 and not fastq_2