#!/usr/bin/env python
"""Helper script to find duplicate MD5 sums in test YAML files."""

# Provenance: extracted from patch 4e5406c221e91ede93d9d8036110da62c86cca66
# ("Add helper script to find duplicate test YAML md5s (#1167)", Phil Ewels,
# 2021-12-14), which adds this file as .github/check_duplicate_md5s.py.

from collections import Counter, defaultdict
import glob
import os

from rich import print
from rich.table import Table
import click
import yaml

# Anchor the default pattern on the absolute location of this script.
# A bare dirname(__file__) can be "" (relative __file__ on older Pythons),
# which would turn the pattern into "/../tests/..." and glob from the root.
_DEFAULT_SEARCH_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "..", "tests", "**", "test.yml"
)

# Grandparent directory names for which the module name is just the directory
# holding test.yml (i.e. there is no tool/subtool nesting to show).
_TOP_LEVEL_DIRS = frozenset({"modules", "config", "subworkflows"})


def _module_name(test_yml):
    """Return the module name for a test YAML path.

    This is the name of the directory containing the file, prefixed with its
    parent directory name ("tool/subtool") unless that parent is one of
    ``_TOP_LEVEL_DIRS``.
    """
    # Only os.path helpers are used here, to stay OS-agnostic
    module_dir = os.path.dirname(test_yml)
    name = os.path.basename(module_dir)
    parent = os.path.basename(os.path.dirname(module_dir))
    if parent not in _TOP_LEVEL_DIRS:
        return f"{parent}/{name}"
    return name


@click.command()
@click.option(
    "--min_dups",
    default=5,
    show_default=True,
    help="Minimum number of duplicates to report",
)
@click.option(
    "--search_dir",
    default=_DEFAULT_SEARCH_DIR,
    show_default=True,
    help="Glob directory pattern used to find test YAML files",
)
def find_duplicate_md5s(min_dups, search_dir):
    """
    Find duplicate file MD5 sums in test YAML files.
    """
    md5_filenames = defaultdict(list)  # md5 -> basenames of files with that md5
    md5_output_fn_counts = Counter()  # md5 -> number of occurrences
    module_counts = defaultdict(set)  # md5 -> names of modules it occurs in

    # Loop through all files matched by the glob pattern (tests/**/test.yml)
    for test_yml in glob.glob(search_dir, recursive=True):
        # Open file and parse YAML
        with open(test_yml, "r", encoding="utf-8") as fh:
            # safe_load returns None for an empty document
            test_config = yaml.safe_load(fh) or []

        # The module name depends only on the YAML path, so compute it once
        modname = _module_name(test_yml)

        # Loop through tests and record every md5 that is listed
        for test in test_config:
            for test_file in test.get("files") or []:
                if "md5sum" not in test_file:
                    continue
                # str() guards against YAML parsing an all-digit md5 as an int
                md5 = str(test_file["md5sum"])
                md5_output_fn_counts[md5] += 1
                # Log the module that this md5 was in
                module_counts[md5].add(modname)
                # An entry without a path is still counted, just not named
                path = test_file.get("path")
                if path:
                    md5_filenames[md5].append(os.path.basename(path))

    # Set up rich table
    table = Table(title="Duplicate MD5s", row_styles=["dim", ""])
    table.add_column("MD5", style="cyan", no_wrap=True)
    table.add_column("Count", style="magenta", justify="right")
    table.add_column("Num modules", style="blue", justify="right")
    table.add_column("Filenames", style="green")

    # Add rows - sort md5_output_fn_counts by value (ascending)
    for md5 in sorted(md5_output_fn_counts, key=md5_output_fn_counts.get):
        if md5_output_fn_counts[md5] >= min_dups:
            table.add_row(
                md5,
                str(md5_output_fn_counts[md5]),
                str(len(module_counts[md5])),
                # Sorted so the output is the same from run to run
                ", ".join(sorted(set(md5_filenames[md5]))),
            )

    print(table)


if __name__ == "__main__":
    find_duplicate_md5s()