mirror of
https://github.com/MillironX/nf-core_modules.git
synced 2024-11-14 05:43:08 +00:00
Add helper script to find duplicate test YAML md5s (#1167)
* Add helper script to find duplicate test YAML md5s * Count how many modules the duplicates affect
This commit is contained in:
parent
826a5603db
commit
4e5406c221
1 changed files with 82 additions and 0 deletions
82
.github/check_duplicate_md5s.py
vendored
Normal file
82
.github/check_duplicate_md5s.py
vendored
Normal file
|
@ -0,0 +1,82 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
from rich import print
|
||||
from rich.table import Table
|
||||
import click
|
||||
import glob
|
||||
import os
|
||||
import yaml
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option(
|
||||
"--min_dups",
|
||||
default=5,
|
||||
show_default=True,
|
||||
help="Minimum number of duplicates to report",
|
||||
)
|
||||
@click.option(
|
||||
"--search_dir",
|
||||
default=f"{os.path.dirname(__file__)}/../tests/**/test.yml",
|
||||
show_default=True,
|
||||
help="Glob directory pattern used to find test YAML files",
|
||||
)
|
||||
def find_duplicate_md5s(min_dups, search_dir):
|
||||
"""
|
||||
Find duplicate file MD5 sums in test YAML files.
|
||||
"""
|
||||
md5_filenames = {}
|
||||
md5_output_fn_counts = {}
|
||||
module_counts = {}
|
||||
|
||||
# Loop through all files in tests/ called test.yml
|
||||
for test_yml in glob.glob(search_dir, recursive=True):
|
||||
# Open file and parse YAML
|
||||
with open(test_yml, "r") as fh:
|
||||
test_config = yaml.safe_load(fh)
|
||||
# Loop through tests and check for duplicate md5s
|
||||
for test in test_config:
|
||||
for test_file in test.get("files", []):
|
||||
if "md5sum" in test_file:
|
||||
md5 = test_file["md5sum"]
|
||||
md5_filenames[md5] = md5_filenames.get(md5, []) + [
|
||||
os.path.basename(test_file.get("path"))
|
||||
]
|
||||
md5_output_fn_counts[md5] = md5_output_fn_counts.get(md5, 0) + 1
|
||||
# Log the module that this md5 was in
|
||||
modname = os.path.basename(os.path.dirname(test_yml))
|
||||
# If tool/subtool show the whole thing
|
||||
# Ugly code but trying to stat os-agnostic
|
||||
if os.path.basename(
|
||||
os.path.dirname(os.path.dirname(test_yml))
|
||||
) not in ["modules", "config", "subworkflows"]:
|
||||
modname = "{}/{}".format(
|
||||
os.path.basename(
|
||||
os.path.dirname(os.path.dirname(test_yml))
|
||||
),
|
||||
os.path.basename(os.path.dirname(test_yml)),
|
||||
)
|
||||
module_counts[md5] = module_counts.get(md5, []) + [modname]
|
||||
|
||||
# Set up rich table
|
||||
table = Table(title="Duplicate MD5s", row_styles=["dim", ""])
|
||||
table.add_column("MD5", style="cyan", no_wrap=True)
|
||||
table.add_column("Count", style="magenta", justify="right")
|
||||
table.add_column("Num modules", style="blue", justify="right")
|
||||
table.add_column("Filenames", style="green")
|
||||
|
||||
# Add rows - sort md5_output_fn_counts by value
|
||||
for md5 in sorted(md5_output_fn_counts, key=md5_output_fn_counts.get):
|
||||
if md5_output_fn_counts[md5] >= min_dups:
|
||||
table.add_row(
|
||||
md5,
|
||||
str(md5_output_fn_counts[md5]),
|
||||
str(len(set(module_counts[md5]))),
|
||||
", ".join(set(md5_filenames[md5])),
|
||||
)
|
||||
|
||||
print(table)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
find_duplicate_md5s()
|
Loading…
Reference in a new issue