cleanup: validation and integrations for importing data

Author: uprightbass360
Date: 2025-11-22 16:49:01 -05:00
parent 5c9f1d7389
commit 71c1be1b46
17 changed files with 6797 additions and 369 deletions


@@ -31,54 +31,127 @@ def parse_bool(value: str) -> bool:
 def load_env_file(env_path: Path) -> Dict[str, str]:
     """
     Load environment variables from .env file.

     Args:
         env_path: Path to .env file

     Returns:
         Dictionary of environment variable key-value pairs

     Note:
         Returns empty dict if file doesn't exist (not an error).
         Handles quotes, comments, and export statements.
     """
     if not env_path.exists():
         return {}
     env: Dict[str, str] = {}
-    for raw_line in env_path.read_text(encoding="utf-8").splitlines():
+    try:
+        content = env_path.read_text(encoding="utf-8")
+    except Exception as e:
+        print(f"Warning: Failed to read environment file {env_path}: {e}", file=sys.stderr)
+        return {}
+    for line_num, raw_line in enumerate(content.splitlines(), start=1):
         line = raw_line.strip()
         # Skip empty lines and comments
         if not line or line.startswith("#"):
             continue
         # Remove 'export' prefix if present
         if line.startswith("export "):
             line = line[len("export ") :].strip()
         # Skip lines without '='
         if "=" not in line:
             continue
-        key, value = line.split("=", 1)
-        key = key.strip()
-        value = value.strip()
-        if value.startswith('"') and value.endswith('"'):
-            value = value[1:-1]
-        elif value.startswith("'") and value.endswith("'"):
-            value = value[1:-1]
-        env[key] = value
+        try:
+            key, value = line.split("=", 1)
+            key = key.strip()
+            value = value.strip()
+            # Strip quotes
+            if value.startswith('"') and value.endswith('"'):
+                value = value[1:-1]
+            elif value.startswith("'") and value.endswith("'"):
+                value = value[1:-1]
+            env[key] = value
+        except Exception as e:
+            print(
+                f"Warning: Failed to parse line {line_num} in {env_path}: {raw_line}\n"
+                f"  Error: {e}",
+                file=sys.stderr
+            )
+            continue
     return env

 def load_manifest(manifest_path: Path) -> List[Dict[str, object]]:
     """
     Load and validate module manifest from JSON file.

     Args:
         manifest_path: Path to module-manifest.json file

     Returns:
         List of validated module dictionaries

     Raises:
         FileNotFoundError: If manifest file doesn't exist
         ValueError: If the manifest is not valid JSON or its structure is invalid
     """
     if not manifest_path.exists():
         raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
-    with manifest_path.open("r", encoding="utf-8") as fh:
-        manifest = json.load(fh)
+    try:
+        with manifest_path.open("r", encoding="utf-8") as fh:
+            manifest = json.load(fh)
+    except json.JSONDecodeError as e:
+        raise ValueError(
+            f"Invalid JSON in manifest file {manifest_path}:\n"
+            f"  Line {e.lineno}, Column {e.colno}: {e.msg}"
+        ) from e
+    except Exception as e:
+        raise ValueError(f"Failed to read manifest file {manifest_path}: {e}") from e
     modules = manifest.get("modules")
     if not isinstance(modules, list):
         raise ValueError("Manifest must define a top-level 'modules' array")
     validated: List[Dict[str, object]] = []
     seen_keys: set[str] = set()
-    for entry in modules:
+    for idx, entry in enumerate(modules):
         if not isinstance(entry, dict):
-            raise ValueError("Each manifest entry must be an object")
+            raise ValueError(f"Manifest entry at index {idx} must be an object")
         key = entry.get("key")
         name = entry.get("name")
         repo = entry.get("repo")
         if not key or not isinstance(key, str):
-            raise ValueError("Manifest entry missing 'key'")
+            raise ValueError(f"Manifest entry at index {idx} missing 'key'")
         if key in seen_keys:
-            raise ValueError(f"Duplicate manifest key detected: {key}")
+            raise ValueError(f"Duplicate manifest key detected: '{key}' (at index {idx})")
         seen_keys.add(key)
         if not name or not isinstance(name, str):
-            raise ValueError(f"Manifest entry {key} missing 'name'")
+            raise ValueError(f"Manifest entry '{key}' missing 'name' field")
         if not repo or not isinstance(repo, str):
-            raise ValueError(f"Manifest entry {key} missing 'repo'")
+            raise ValueError(f"Manifest entry '{key}' missing 'repo' field")
         validated.append(entry)
     return validated
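
Taken together, the hardened helpers fail soft for the .env file and fail loud for the manifest. Below is a minimal usage sketch, not part of the commit: the paths, the REALM_NAME key, and its default are placeholders, and the sketch assumes it runs in the same module as the two functions above (the file name is not shown in this diff).

from pathlib import Path

# A missing or unreadable .env degrades to an empty dict plus a stderr warning,
# so callers can lean on .get() defaults; the key and default here are invented.
env = load_env_file(Path(".env"))
realm_name = env.get("REALM_NAME", "AzerothCore")

# Manifest problems surface as exceptions with actionable messages.
try:
    modules = load_manifest(Path("config/module-manifest.json"))
except FileNotFoundError:
    modules = []  # no manifest yet
except ValueError as exc:
    # invalid JSON (reported with line/column), a non-list 'modules' array,
    # a duplicate key, or an entry missing 'key'/'name'/'repo'
    raise SystemExit(f"Manifest validation failed: {exc}")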


@@ -0,0 +1,182 @@
#!/usr/bin/env python3
"""Generate a categorized list of GitHub modules missing from the manifest.

The script reuses the discovery logic from ``update_module_manifest.py`` to
fetch repositories by topic, filters out entries already tracked in
``config/module-manifest.json``, and writes the remainder (including type,
category, and inferred dependency hints) to a JSON file.
"""
from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path
from typing import Dict, Iterable, List, Sequence, Tuple

from update_module_manifest import (  # type: ignore
    CATEGORY_BY_TYPE,
    DEFAULT_TOPICS,
    GitHubClient,
    collect_repositories,
    load_manifest,
    normalize_repo_url,
    repo_name_to_key,
)
# Heuristics used to surface potential dependency hints.
DEPENDENCY_KEYWORDS: Tuple[Tuple[str, str], ...] = (
    ("playerbot", "MODULE_PLAYERBOTS"),
    ("ah-bot", "MODULE_PLAYERBOTS"),
    ("eluna", "MODULE_ELUNA"),
)

# Keywords that help categorize entries that should probably stay hidden by default.
SUPPRESSION_KEYWORDS: Tuple[Tuple[str, str], ...] = (
    ("virtual machine", "vm"),
    (" vm ", "vm"),
    (" docker", "docker"),
    ("container", "docker"),
    ("vagrant", "vagrant"),
    ("ansible", "automation"),
    ("terraform", "automation"),
    ("client", "client-distribution"),
    ("launcher", "client-distribution"),
)
def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--manifest",
        default="config/module-manifest.json",
        help="Path to module manifest JSON (default: %(default)s)",
    )
    parser.add_argument(
        "--output",
        default="missing-modules.json",
        help="Path to write the missing-module report JSON (default: %(default)s)",
    )
    parser.add_argument(
        "--topic",
        action="append",
        default=[],
        dest="topics",
        help="GitHub topic (or '+' expression) to scan (defaults to built-in list).",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=10,
        help="Maximum pages (x100 results) to fetch per topic (default: %(default)s)",
    )
    parser.add_argument(
        "--token",
        help="GitHub API token (defaults to $GITHUB_TOKEN or $GITHUB_API_TOKEN)",
    )
    parser.add_argument(
        "--log",
        action="store_true",
        help="Print verbose progress information",
    )
    return parser.parse_args(argv)
def implied_dependencies(module_type: str, text: str) -> List[str]:
    deps: List[str] = []
    if module_type == "lua":
        deps.append("MODULE_ELUNA")
    normalized = text.lower()
    for keyword, dep in DEPENDENCY_KEYWORDS:
        if keyword in normalized and dep not in deps:
            deps.append(dep)
    return deps


def suppression_flags(category: str, text: str) -> List[str]:
    flags: List[str] = []
    if category == "tooling":
        flags.append("tooling")
    normalized = text.lower()
    for keyword, flag in SUPPRESSION_KEYWORDS:
        if keyword in normalized and flag not in flags:
            flags.append(flag)
    return flags
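
# Illustrative behaviour of the two heuristics above (the inputs are made up):
#   implied_dependencies("lua", "Playerbot-aware Eluna script")
#       -> ["MODULE_ELUNA", "MODULE_PLAYERBOTS"]   # lua type implies Eluna; text mentions playerbot
#   suppression_flags("tooling", "Docker container images for local testing")
#       -> ["tooling", "docker"]                   # category flag first, then keyword hits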
def make_missing_entries(
    manifest_modules: List[dict],
    repos: Iterable,
) -> List[dict]:
    by_key: Dict[str, dict] = {module.get("key"): module for module in manifest_modules if module.get("key")}
    by_repo: Dict[str, dict] = {
        normalize_repo_url(str(module.get("repo", ""))): module
        for module in manifest_modules
        if module.get("repo")
    }
    missing: List[dict] = []
    for record in repos:
        repo = record.data
        repo_url = normalize_repo_url(repo.get("clone_url") or repo.get("html_url") or "")
        existing = by_repo.get(repo_url)
        key = repo_name_to_key(repo.get("name", ""))
        if not existing:
            existing = by_key.get(key)
        if existing:
            continue
        module_type = record.module_type
        category = CATEGORY_BY_TYPE.get(module_type, "uncategorized")
        description = repo.get("description") or ""
        combined_text = " ".join(
            filter(
                None,
                [
                    repo.get("full_name"),
                    description,
                    " ".join(repo.get("topics") or []),
                ],
            )
        )
        entry = {
            "key": key,
            "repo_name": repo.get("full_name"),
            "topic": record.topic_expr,
            "repo_url": repo.get("html_url") or repo.get("clone_url"),
            "description": description,
            "topics": repo.get("topics") or [],
            "type": module_type,
            "category": category,
            "implied_dependencies": implied_dependencies(module_type, combined_text),
            "flags": suppression_flags(category, combined_text),
        }
        missing.append(entry)
    missing.sort(key=lambda item: item["key"])
    return missing
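
# A repository is treated as already tracked when either its normalized repo URL
# matches a manifest entry's "repo" or its derived key matches an existing manifest
# key; only the remaining repositories are reported, each with its inferred type,
# category, dependency hints, and suppression flags, sorted by key.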
def main(argv: Sequence[str]) -> int:
    args = parse_args(argv)
    topics = args.topics or DEFAULT_TOPICS
    token = args.token or os.environ.get("GITHUB_TOKEN") or os.environ.get("GITHUB_API_TOKEN")
    if not token:
        print(
            "Warning: no GitHub token provided, falling back to anonymous rate limit",
            file=sys.stderr,
        )
    client = GitHubClient(token, verbose=args.log)
    manifest = load_manifest(args.manifest)
    repos = collect_repositories(client, topics, args.max_pages)
    missing = make_missing_entries(manifest.get("modules", []), repos)
    output_path = Path(args.output)
    output_path.write_text(json.dumps(missing, indent=2))
    print(f"Wrote {len(missing)} entries to {output_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
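
For reference, a minimal driving sketch, not part of the commit: it assumes it runs alongside the script above, every flag value is illustrative rather than a project default, and "azerothcore-module" is only an assumed topic since DEFAULT_TOPICS lives in update_module_manifest.py and is not shown in this diff.

# Illustrative only: drive main() with an explicit argv instead of the shell.
exit_code = main([
    "--manifest", "config/module-manifest.json",
    "--output", "missing-modules.json",
    "--topic", "azerothcore-module",  # assumed topic; DEFAULT_TOPICS applies when omitted
    "--max-pages", "5",
    "--log",
])
# On success the script prints "Wrote N entries to missing-modules.json"; each
# entry carries key, repo_name, topic, repo_url, description, topics, type,
# category, implied_dependencies, and flags.

Whether the anonymous rate-limit warning fires depends on --token or one of GITHUB_TOKEN / GITHUB_API_TOKEN being set in the environment.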