cleanup: validation and integrations for importing data

This commit is contained in:
uprightbass360
2025-11-22 16:49:01 -05:00
parent 5c9f1d7389
commit 71c1be1b46
17 changed files with 6797 additions and 369 deletions


@@ -0,0 +1,182 @@
#!/usr/bin/env python3
"""Generate a categorized list of GitHub modules missing from the manifest.
The script reuses the discovery logic from ``update_module_manifest.py`` to
fetch repositories by topic, filters out entries already tracked in
``config/module-manifest.json`` and writes the remainder (including type,
category, and inferred dependency hints) to a JSON file.
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
from typing import Dict, Iterable, List, Sequence, Tuple

from update_module_manifest import (  # type: ignore
CATEGORY_BY_TYPE,
DEFAULT_TOPICS,
GitHubClient,
collect_repositories,
load_manifest,
normalize_repo_url,
repo_name_to_key,
)


# heuristics used to surface potential dependency hints
DEPENDENCY_KEYWORDS: Tuple[Tuple[str, str], ...] = (
("playerbot", "MODULE_PLAYERBOTS"),
("ah-bot", "MODULE_PLAYERBOTS"),
("eluna", "MODULE_ELUNA"),
)
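# Example: a repo described as "Playerbot-friendly auction house tweaks" would
# pick up MODULE_PLAYERBOTS via implied_dependencies() below.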

# keywords that help categorize entries that should probably stay hidden by default
SUPPRESSION_KEYWORDS: Tuple[Tuple[str, str], ...] = (
("virtual machine", "vm"),
(" vm ", "vm"),
(" docker", "docker"),
("container", "docker"),
("vagrant", "vagrant"),
("ansible", "automation"),
("terraform", "automation"),
("client", "client-distribution"),
("launcher", "client-distribution"),
)
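# Example: a repo described as "Vagrant box and Docker images for local testing"
# would pick up the "docker" and "vagrant" flags via suppression_flags() below
# (plus "tooling" when its category is tooling).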


def parse_args(argv: Sequence[str]) -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--manifest",
default="config/module-manifest.json",
help="Path to module manifest JSON (default: %(default)s)",
)
parser.add_argument(
"--output",
default="missing-modules.json",
help="Path to write the missing-module report JSON (default: %(default)s)",
)
parser.add_argument(
"--topic",
action="append",
default=[],
dest="topics",
help="GitHub topic (or '+' expression) to scan (defaults to built-in list).",
)
parser.add_argument(
"--max-pages",
type=int,
default=10,
help="Maximum pages (x100 results) to fetch per topic (default: %(default)s)",
)
parser.add_argument(
"--token",
help="GitHub API token (defaults to $GITHUB_TOKEN or $GITHUB_API_TOKEN)",
)
parser.add_argument(
"--log",
action="store_true",
help="Print verbose progress information",
)
return parser.parse_args(argv)


def implied_dependencies(module_type: str, text: str) -> List[str]:
    """Return dependency keys implied by the module type or descriptive text."""
deps: List[str] = []
if module_type == "lua":
deps.append("MODULE_ELUNA")
normalized = text.lower()
for keyword, dep in DEPENDENCY_KEYWORDS:
if keyword in normalized and dep not in deps:
deps.append(dep)
return deps


def suppression_flags(category: str, text: str) -> List[str]:
    """Return flags suggesting the entry should stay hidden by default."""
flags: List[str] = []
if category == "tooling":
flags.append("tooling")
normalized = text.lower()
for keyword, flag in SUPPRESSION_KEYWORDS:
if keyword in normalized and flag not in flags:
flags.append(flag)
return flags


def make_missing_entries(
    manifest_modules: List[dict],
    repos: Iterable,
) -> List[dict]:
    """Build report entries for discovered repositories not already in the manifest."""
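    # Index manifest entries by key and by normalized repo URL so a repository is
    # recognized as already tracked even when its derived key differs.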
by_key: Dict[str, dict] = {module.get("key"): module for module in manifest_modules if module.get("key")}
by_repo: Dict[str, dict] = {
normalize_repo_url(str(module.get("repo", ""))): module
for module in manifest_modules
if module.get("repo")
}
missing: List[dict] = []
for record in repos:
repo = record.data
repo_url = normalize_repo_url(repo.get("clone_url") or repo.get("html_url") or "")
existing = by_repo.get(repo_url)
key = repo_name_to_key(repo.get("name", ""))
if not existing:
existing = by_key.get(key)
if existing:
continue
module_type = record.module_type
category = CATEGORY_BY_TYPE.get(module_type, "uncategorized")
description = repo.get("description") or ""
combined_text = " ".join(
filter(
None,
[
repo.get("full_name"),
description,
" ".join(repo.get("topics") or []),
],
)
)
entry = {
"key": key,
"repo_name": repo.get("full_name"),
"topic": record.topic_expr,
"repo_url": repo.get("html_url") or repo.get("clone_url"),
"description": description,
"topics": repo.get("topics") or [],
"type": module_type,
"category": category,
"implied_dependencies": implied_dependencies(module_type, combined_text),
"flags": suppression_flags(category, combined_text),
}
missing.append(entry)
missing.sort(key=lambda item: item["key"])
return missing


def main(argv: Sequence[str]) -> int:
args = parse_args(argv)
topics = args.topics or DEFAULT_TOPICS
token = args.token or os.environ.get("GITHUB_TOKEN") or os.environ.get("GITHUB_API_TOKEN")
if not token:
print(
"Warning: no GitHub token provided, falling back to anonymous rate limit",
file=sys.stderr,
)
client = GitHubClient(token, verbose=args.log)
manifest = load_manifest(args.manifest)
repos = collect_repositories(client, topics, args.max_pages)
missing = make_missing_entries(manifest.get("modules", []), repos)
output_path = Path(args.output)
output_path.write_text(json.dumps(missing, indent=2))
print(f"Wrote {len(missing)} entries to {output_path}")
return 0


if __name__ == "__main__":
raise SystemExit(main(sys.argv[1:]))
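
A minimal sketch of how the generated report might be consumed, assuming the default ``missing-modules.json`` output path and the entry fields written by ``make_missing_entries`` above (the filtering criterion is only an illustration):

import json
from pathlib import Path

# Load the report produced by the script (default --output path).
report = json.loads(Path("missing-modules.json").read_text())

# Keep entries without suppression flags, i.e. modules that look importable as-is.
candidates = [entry for entry in report if not entry["flags"]]
for entry in candidates:
    print(f'{entry["key"]}: {entry["repo_url"]} deps={entry["implied_dependencies"]}')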