nf-core_modules/.github/check_duplicate_md5s.py

#!/usr/bin/env python
from rich import print
from rich.table import Table
import click
import glob
import os
import yaml
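
# Example invocations (paths here are illustrative and assume the standard
# nf-core/modules layout with test configs under tests/):
#   python .github/check_duplicate_md5s.py
#   python .github/check_duplicate_md5s.py --min_dups 10
#   python .github/check_duplicate_md5s.py --search_dir "tests/modules/**/test.yml"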

@click.command()
@click.option(
    "--min_dups",
    default=5,
    show_default=True,
    help="Minimum number of duplicates to report",
)
@click.option(
    "--search_dir",
    default=f"{os.path.dirname(__file__)}/../tests/**/test.yml",
    show_default=True,
    help="Glob directory pattern used to find test YAML files",
)
def find_duplicate_md5s(min_dups, search_dir):
    """
    Find duplicate file MD5 sums in test YAML files.
    """
    md5_filenames = {}
    md5_output_fn_counts = {}
    module_counts = {}
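
    # Illustrative sketch of the test.yml structure this script assumes
    # (field values below are made up for the example):
    #
    #   - name: tool subtool
    #     files:
    #       - path: output/tool/result.txt
    #         md5sum: d41d8cd98f00b204e9800998ecf8427e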
    # Loop through all files in tests/ called test.yml
    for test_yml in glob.glob(search_dir, recursive=True):
        # Open file and parse YAML
        with open(test_yml, "r") as fh:
            test_config = yaml.safe_load(fh)
        # Loop through tests and check for duplicate md5s
        for test in test_config:
            for test_file in test.get("files", []):
                if "md5sum" in test_file:
                    md5 = test_file["md5sum"]
                    md5_filenames[md5] = md5_filenames.get(md5, []) + [os.path.basename(test_file.get("path"))]
                    md5_output_fn_counts[md5] = md5_output_fn_counts.get(md5, 0) + 1
                    # Log the module that this md5 was in
                    modname = os.path.basename(os.path.dirname(test_yml))
                    # If tool/subtool, show the whole thing
                    # Ugly code, but trying to stay OS-agnostic
                    if os.path.basename(os.path.dirname(os.path.dirname(test_yml))) not in [
                        "modules",
                        "config",
                        "subworkflows",
                    ]:
                        modname = "{}/{}".format(
                            os.path.basename(os.path.dirname(os.path.dirname(test_yml))),
                            os.path.basename(os.path.dirname(test_yml)),
                        )
                    module_counts[md5] = module_counts.get(md5, []) + [modname]

    # Set up rich table
    table = Table(title="Duplicate MD5s", row_styles=["dim", ""])
    table.add_column("MD5", style="cyan", no_wrap=True)
    table.add_column("Count", style="magenta", justify="right")
    table.add_column("Num modules", style="blue", justify="right")
    table.add_column("Filenames", style="green")

    # Add rows - sort md5_output_fn_counts by value
    for md5 in sorted(md5_output_fn_counts, key=md5_output_fn_counts.get):
        if md5_output_fn_counts[md5] >= min_dups:
            table.add_row(
                md5,
                str(md5_output_fn_counts[md5]),
                str(len(set(module_counts[md5]))),
                ", ".join(set(md5_filenames[md5])),
            )

    print(table)


if __name__ == "__main__":
    find_duplicate_md5s()