# build_cache.py
"""Build cached graph/timeline/LOC artifacts for the Transformers repo and upload them to a HF dataset repo."""
import os
import io
import json
import tarfile
import subprocess
import tempfile
from pathlib import Path
from datetime import datetime, timezone

from huggingface_hub import HfApi

from modular_graph_and_candidates import (
    build_graph_json,
    generate_html,
    build_timeline_json,
    generate_timeline_html,
)

REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
CACHE_REPO = "Molbap/hf_cached_embeds_log"
MIN_THRESH = 0.1
MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1", "true", "True", "YES", "yes"}
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
MODULAR_CUTOFF_ISO = "2024-05-31"


def _run(cwd: Path, *args: str) -> str:
    """Run a git command in `cwd` and return its stdout, raising on non-zero exit."""
    p = subprocess.run(["git", *args], cwd=cwd, text=True, capture_output=True, timeout=1200)
    if p.returncode != 0:
        raise RuntimeError(p.stderr.strip()[:400])
    return p.stdout


def _count_lines(text: str) -> int:
    # Count newline-terminated lines, plus one for a trailing line without "\n".
    return text.count("\n") + (1 if text and not text.endswith("\n") else 0)


def _compute_loc_growth(repo: Path) -> dict:
    # The repo is cloned shallow in main(); fetch full history so `git log` sees every commit.
    try:
        _run(repo, "fetch", "--unshallow", "--tags", "--prune")
    except Exception:
        _run(repo, "fetch", "--depth=100000", "--tags", "--prune")

    pathspec = "src/transformers/models"
    lines = _run(repo, "log", "--reverse", "--format=%H|%cI", "HEAD", "--", pathspec).splitlines()
    commits = [(ln.split("|", 1)[0], ln.split("|", 1)[1]) for ln in lines if "|" in ln]

    # Subsample long histories to roughly 300 data points.
    total = len(commits)
    if total > 500:
        step = max(1, total // 300)
        commits = commits[::step]

    out = []
    for sha, date_iso in commits:
        proc = subprocess.run(
            ["git", "archive", sha, "--", pathspec],
            cwd=repo, capture_output=True, timeout=180
        )
        if proc.returncode != 0 or not proc.stdout:
            # Fallback: zero for this point; continue
            out.append({
                "sha": sha, "date": date_iso,
                "loc_modeling_all": 0, "loc_modular": 0,
                "loc_modeling_included": 0, "effective_loc": 0,
                "n_models_with_modular": 0
            })
            continue

        buf = io.BytesIO(proc.stdout)
        modeling_by_model = {}
        modular_by_model = {}
        with tarfile.open(fileobj=buf, mode="r:*") as tar:
            for m in tar.getmembers():
                if not m.isfile():
                    continue
                name = m.name
                if not name.endswith(".py"):
                    continue
                if "/models/" not in name:
                    continue
                parts = name.split("/")
                try:
                    idx = parts.index("models")
                    model = parts[idx + 1] if idx + 1 < len(parts) else ""
                except ValueError:
                    model = ""
                if not model:
                    continue
                if "/modeling_" in name or "/modular_" in name:
                    f = tar.extractfile(m)
                    if not f:
                        continue
                    try:
                        txt = f.read().decode("utf-8", errors="ignore")
                    finally:
                        f.close()
                    n = _count_lines(txt)
                    if "/modular_" in name:
                        modular_by_model[model] = modular_by_model.get(model, 0) + n
                    elif "/modeling_" in name:
                        modeling_by_model[model] = modeling_by_model.get(model, 0) + n

        # For models that have a modular_*.py file, the effective total counts the modular
        # LOC and excludes their modeling LOC.
        modeling_all = sum(modeling_by_model.values())
        modular_loc = sum(modular_by_model.values())
        models_with_modular = set(modular_by_model.keys())
        modeling_excluded = sum(modeling_by_model.get(m, 0) for m in models_with_modular)
        modeling_included = modeling_all - modeling_excluded
        effective = modeling_included + modular_loc
        out.append({
            "sha": sha,
            "date": date_iso,
            "loc_modeling_all": modeling_all,
            "loc_modular": modular_loc,
            "loc_modeling_included": modeling_included,
            "effective_loc": effective,
            "n_models_with_modular": len(models_with_modular),
        })
    return {"series": out, "cutoff": MODULAR_CUTOFF_ISO}


def _loc_html(loc: dict) -> str:
    data = json.dumps(loc["series"], separators=(",", ":"))
    cutoff = loc["cutoff"]
    return f""" LOC growth
"""


def main():
    tmp = Path(tempfile.mkdtemp())
    subprocess.check_call(["git", "clone", "--depth", "1", REPO_URL, str(tmp / "repo")])
    sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=tmp / "repo", text=True).strip()
    repo_path = tmp / "repo"

    loc_growth = _compute_loc_growth(repo_path)
    loc_json_str = json.dumps(loc_growth, separators=(",", ":"))
    loc_html_str = _loc_html(loc_growth)

    graph = build_graph_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
    timeline = build_timeline_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
    graph_html = generate_html(graph)
    timeline_html = generate_timeline_html(timeline)

    api = HfApi()
    api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)

    # Artifacts are keyed by commit SHA plus the similarity settings they were built with.
    key = f"{sha}/{SIM_METHOD}-m{int(MULTIMODAL)}"
    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {"sim_method": SIM_METHOD, "min_threshold": MIN_THRESH, "multimodal": MULTIMODAL},
        "paths": {
            "graph_json": f"graph/{key}.json",
            "graph_html": f"graph/{key}.html",
            "timeline_json": f"timeline/{key}.json",
            "timeline_html": f"timeline/{key}.html",
            "loc_json": f"loc/{key}.json",
            "loc_html": f"loc/{key}.html",
        },
    }

    def put(path_in_repo: str, text: str):
        api.upload_file(
            path_or_fileobj=io.BytesIO(text.encode("utf-8")),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            repo_type="dataset",
            commit_message=f"cache {path_in_repo}",
        )

    put(f"graph/{key}.json", json.dumps(graph, separators=(",", ":")))
    put(f"graph/{key}.html", graph_html)
    put(f"timeline/{key}.json", json.dumps(timeline, separators=(",", ":")))
    put(f"timeline/{key}.html", timeline_html)
    put(f"loc/{key}.json", loc_json_str)
    put(f"loc/{key}.html", loc_html_str)
    put("latest.json", json.dumps(latest, separators=(",", ":")))


if __name__ == "__main__":
    main()