# build_cache.py
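# Builds cached artifacts for a given transformers commit (model-similarity
# graph, timeline, and a lines-of-code growth chart) and uploads them as
# JSON + HTML to a Hugging Face dataset repo.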
import os
import io
import json
import tarfile
import subprocess
import tempfile
from pathlib import Path
from datetime import datetime, timezone
from huggingface_hub import HfApi
from modular_graph_and_candidates import (
build_graph_json,
generate_html,
build_timeline_json,
generate_timeline_html,
)
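
# REPO_URL, MULTIMODAL and SIM_METHOD can be overridden through environment
# variables; the remaining settings below are fixed. Illustrative invocation
# (values shown are examples, not requirements):
#   SIM_METHOD=jaccard MULTIMODAL=0 python build_cache.py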
REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
CACHE_REPO = "Molbap/hf_cached_embeds_log"
MIN_THRESH = 0.1
MULTIMODAL = os.getenv("MULTIMODAL", "0").lower() in {"1", "true", "yes"}
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
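# Rendered as the "modular" cutoff annotation on the LOC chart.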
MODULAR_CUTOFF_ISO = "2024-05-31"
def _run(cwd: Path, *args: str) -> str:
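    # Thin wrapper around `git` run inside `cwd`; raises with a truncated
    # stderr message on any non-zero exit so failures surface early.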
p = subprocess.run(["git", *args], cwd=cwd, text=True, capture_output=True, timeout=1200)
if p.returncode != 0:
raise RuntimeError(p.stderr.strip()[:400])
return p.stdout
def _count_lines(text: str) -> int:
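    # Counts lines whether or not the text ends with a trailing newline,
    # e.g. _count_lines("a\nb") == 2, _count_lines("a\nb\n") == 2, _count_lines("") == 0.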
return text.count("\n") + (1 if text and not text.endswith("\n") else 0)
def _compute_loc_growth(repo: Path) -> dict:
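    # Walks the commit history touching src/transformers/models and, for a
    # sample of commits, measures lines of code in modeling_*.py and
    # modular_*.py files. main() clones with --depth=1, so the history is
    # deepened here first (unshallow, with a bounded-depth fallback).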
try:
_run(repo, "fetch", "--unshallow", "--tags", "--prune")
except Exception:
_run(repo, "fetch", "--depth=100000", "--tags", "--prune")
pathspec = "src/transformers/models"
lines = _run(repo, "log", "--reverse", "--format=%H|%cI", "HEAD", "--", pathspec).splitlines()
commits = [(ln.split("|", 1)[0], ln.split("|", 1)[1]) for ln in lines if "|" in ln]
total = len(commits)
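    # Down-sample long histories (keep every `total // 300`-th commit) so the
    # per-commit `git archive` pass stays tractable.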
if total > 500:
step = max(1, total // 300)
commits = commits[::step]
out = []
for sha, date_iso in commits:
proc = subprocess.run(
["git", "archive", sha, "--", pathspec],
cwd=repo, capture_output=True, timeout=180
)
if proc.returncode != 0 or not proc.stdout:
# Fallback: zero for this point; continue
out.append({
"sha": sha, "date": date_iso,
"loc_modeling_all": 0, "loc_modular": 0,
"loc_modeling_included": 0, "effective_loc": 0,
"n_models_with_modular": 0
})
continue
buf = io.BytesIO(proc.stdout)
modeling_by_model = {}
modular_by_model = {}
with tarfile.open(fileobj=buf, mode="r:*") as tar:
for m in tar.getmembers():
if not m.isfile():
continue
name = m.name
if not name.endswith(".py"):
continue
if "/models/" not in name:
continue
parts = name.split("/")
try:
idx = parts.index("models")
model = parts[idx + 1] if idx + 1 < len(parts) else ""
except ValueError:
model = ""
if not model:
continue
if "/modeling_" in name or "/modular_" in name:
f = tar.extractfile(m)
if not f:
continue
try:
txt = f.read().decode("utf-8", errors="ignore")
finally:
f.close()
n = _count_lines(txt)
if "/modular_" in name:
modular_by_model[model] = modular_by_model.get(model, 0) + n
elif "/modeling_" in name:
modeling_by_model[model] = modeling_by_model.get(model, 0) + n
modeling_all = sum(modeling_by_model.values())
modular_loc = sum(modular_by_model.values())
models_with_modular = set(modular_by_model.keys())
modeling_excluded = sum(modeling_by_model.get(m, 0) for m in models_with_modular)
modeling_included = modeling_all - modeling_excluded
effective = modeling_included + modular_loc
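        # Example (illustrative numbers): a model with 3000 LOC of modeling_*.py
        # and a 400 LOC modular_*.py contributes only the 400 modular lines to
        # effective_loc; a model without a modular file contributes its full
        # modeling LOC.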
out.append({
"sha": sha,
"date": date_iso,
"loc_modeling_all": modeling_all,
"loc_modular": modular_loc,
"loc_modeling_included": modeling_included,
"effective_loc": effective,
"n_models_with_modular": len(models_with_modular),
})
return {"series": out, "cutoff": MODULAR_CUTOFF_ISO}
def _loc_html(loc: dict) -> str:
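    # Builds a self-contained HTML page that renders the LOC series as an
    # ApexCharts line chart, with a vertical annotation at the modular cutoff
    # date. Doubled braces in the f-string escape literal JS braces.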
data = json.dumps(loc["series"], separators=(",", ":"))
cutoff = loc["cutoff"]
return f"""<!doctype html><meta charset=utf-8>
<title>LOC growth</title>
<div id=chart style="height:60vh;width:90vw;margin:2rem auto;"></div>
<script src="https://cdn.jsdelivr.net/npm/apexcharts"></script>
<script>
const raw={data};
const xs=raw.map(d=>new Date(d.date).getTime());
const eff=raw.map(d=>d.effective_loc);
const mod=raw.map(d=>d.loc_modular);
const mdl_all=raw.map(d=>d.loc_modeling_all);
const mdl_inc=raw.map(d=>d.loc_modeling_included);
const cutoffTs=new Date("{cutoff}T00:00:00Z").getTime();
const opts={{
chart:{{type:"line",height:"100%"}},
series:[
{{name:"Effective LOC",data:xs.map((t,i)=>[t,eff[i]])}},
{{name:"Modular LOC",data:xs.map((t,i)=>[t,mod[i]])}},
{{name:"Modeling LOC (all)",data:xs.map((t,i)=>[t,mdl_all[i]])}},
{{name:"Modeling LOC (included)",data:xs.map((t,i)=>[t,mdl_inc[i]])}}
],
xaxis:{{type:"datetime"}},
yaxis:{{labels:{{formatter:v=>Math.round(v)}}}},
stroke:{{width:2}},
tooltip:{{shared:true,x:{{format:"yyyy-MM-dd"}}}},
annotations:{{xaxis:[{{x:cutoffTs,borderColor:"#e11d48",label:{{text:"2024-05-31 modular",style:{{color:"#fff",background:"#e11d48"}}}}}}]}}
}};
new ApexCharts(document.getElementById("chart"),opts).render();
</script>"""
def main():
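    # Pipeline: shallow-clone the repo, compute the LOC-growth series, build
    # the similarity graph and timeline, then upload JSON + HTML artifacts to
    # the cache dataset repo keyed by commit SHA and settings.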
tmp = Path(tempfile.mkdtemp())
subprocess.check_call(["git", "clone", "--depth", "1", REPO_URL, str(tmp / "repo")])
sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=tmp / "repo", text=True).strip()
repo_path = tmp / "repo"
loc_growth = _compute_loc_growth(repo_path)
loc_json_str = json.dumps(loc_growth, separators=(",", ":"))
loc_html_str = _loc_html(loc_growth)
graph = build_graph_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
timeline = build_timeline_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
graph_html = generate_html(graph)
timeline_html = generate_timeline_html(timeline)
api = HfApi()
api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)
key = f"{sha}/{SIM_METHOD}-m{int(MULTIMODAL)}"
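    # One cache entry per (commit, similarity method, multimodal flag),
    # e.g. "<40-char sha>/jaccard-m0".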
latest = {
"sha": sha,
"updated_utc": datetime.now(timezone.utc).isoformat(),
"defaults": {"sim_method": SIM_METHOD, "min_threshold": MIN_THRESH, "multimodal": MULTIMODAL},
"paths": {
"graph_json": f"graph/{key}.json",
"graph_html": f"graph/{key}.html",
"timeline_json": f"timeline/{key}.json",
"timeline_html": f"timeline/{key}.html",
"loc_json": f"loc/{key}.json",
"loc_html": f"loc/{key}.html",
},
}
def put(path_in_repo: str, text: str):
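        # Upload an in-memory string as a UTF-8 file in the cache dataset;
        # each call creates its own commit in the repo.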
api.upload_file(
path_or_fileobj=io.BytesIO(text.encode("utf-8")),
path_in_repo=path_in_repo,
repo_id=CACHE_REPO,
repo_type="dataset",
commit_message=f"cache {path_in_repo}",
)
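    # latest.json is uploaded last, after every artifact it references.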
put(f"graph/{key}.json", json.dumps(graph, separators=(",", ":")))
put(f"graph/{key}.html", graph_html)
put(f"timeline/{key}.json", json.dumps(timeline, separators=(",", ":")))
put(f"timeline/{key}.html", timeline_html)
put(f"loc/{key}.json", loc_json_str)
put(f"loc/{key}.html", loc_html_str)
put("latest.json", json.dumps(latest, separators=(",", ":")))
if __name__ == "__main__":
main()