File size: 7,735 Bytes

# build_cache.py
import os
import io
import json
import tarfile
import subprocess
import tempfile
from pathlib import Path
from datetime import datetime, timezone

from huggingface_hub import HfApi

from modular_graph_and_candidates import (
    build_graph_json,
    generate_html,
    build_timeline_json,
    generate_timeline_html,
)

# Git repository to clone and analyse; override with the REPO_URL environment variable.
REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
# Hub dataset repository that main() creates if needed and uploads every artifact to.
CACHE_REPO = "Molbap/hf_cached_embeds_log"
# Value passed as ``threshold`` to build_graph_json / build_timeline_json.
MIN_THRESH = 0.1
# Boolean switch read from the MULTIMODAL environment variable. The comparison is
# case-insensitive and ignores surrounding whitespace, so "TRUE", "Yes" or " 1 "
# enable it just like "1", "true" and "yes"; anything else (or unset) disables it.
MULTIMODAL = os.getenv("MULTIMODAL", "0").strip().lower() in {"1", "true", "yes"}
# Value passed as ``sim_method`` to the graph/timeline builders; also part of the cache key.
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
# ISO date returned as "cutoff" with the LOC series and drawn as a marker on the LOC chart.
MODULAR_CUTOFF_ISO = "2024-05-31"

def _run(cwd: Path, *args: str) -> str:
    p = subprocess.run(["git", *args], cwd=cwd, text=True, capture_output=True, timeout=1200)
    if p.returncode != 0:
        raise RuntimeError(p.stderr.strip()[:400])
    return p.stdout

def _count_lines(text: str) -> int:
    return text.count("\n") + (1 if text and not text.endswith("\n") else 0)

def _sample_commits(commits: list, max_unsampled: int = 500, target: int = 300) -> list:
    """Thin a chronologically ordered commit list by keeping every step-th entry.

    Lists with at most *max_unsampled* entries are returned unchanged. Longer
    lists keep every ``len(commits) // target``-th entry, and the final entry
    is always retained so the resulting series ends at the newest commit.
    """
    total = len(commits)
    if total <= max_unsampled:
        return commits
    step = max(1, total // target)
    sampled = commits[::step]
    if (total - 1) % step:
        # The stride stepped over the last commit; add it back explicitly.
        sampled.append(commits[-1])
    return sampled


def _tally_archive(archive_bytes: bytes) -> tuple:
    """Count lines of modeling_*/modular_* Python files in a tar archive, per model.

    The "model" of a file is the path component that directly follows the first
    ``models`` component of its archive path.

    Returns:
        ``(modeling_by_model, modular_by_model)``: two dicts mapping a model
        name to the summed line count of its matching files.
    """
    modeling_by_model = {}
    modular_by_model = {}
    with tarfile.open(fileobj=io.BytesIO(archive_bytes), mode="r:*") as tar:
        for member in tar.getmembers():
            name = member.name
            if not member.isfile() or not name.endswith(".py") or "/models/" not in name:
                continue
            parts = name.split("/")
            # "/models/" in name guarantees a "models" component followed by another one.
            model = parts[parts.index("models") + 1]
            if not model:
                continue
            is_modular = "/modular_" in name
            if not is_modular and "/modeling_" not in name:
                continue
            extracted = tar.extractfile(member)
            if extracted is None:
                continue
            with extracted:
                text = extracted.read().decode("utf-8", errors="ignore")
            # A path matching both patterns is counted as modular only.
            bucket = modular_by_model if is_modular else modeling_by_model
            bucket[model] = bucket.get(model, 0) + _count_lines(text)
    return modeling_by_model, modular_by_model


def _compute_loc_growth(repo: Path) -> dict:
    """Measure how modeling/modular line counts evolved over the repository history.

    The history is fetched, the commits touching ``src/transformers/models``
    are listed oldest first and thinned with ``_sample_commits``, and for each
    remaining commit the tree under that path is exported with ``git archive``
    and tallied in memory.

    Args:
        repo: Path of a local git clone.

    Returns:
        ``{"series": [...], "cutoff": MODULAR_CUTOFF_ISO}``. Each series entry
        holds ``sha``, ``date``, ``loc_modeling_all``, ``loc_modular``,
        ``loc_modeling_included`` (modeling lines of models that have no
        modular file), ``effective_loc`` (``loc_modeling_included`` plus
        ``loc_modular``) and ``n_models_with_modular``. A commit whose archive
        could not be produced (non-zero exit, empty output or timeout) is kept
        with all counters set to 0.

    Raises:
        RuntimeError: a ``git fetch``/``git log`` call made through ``_run`` failed.
    """
    try:
        _run(repo, "fetch", "--unshallow", "--tags", "--prune")
    except (RuntimeError, subprocess.TimeoutExpired):
        # Deepening via --unshallow did not work (presumably e.g. the clone is
        # not shallow - TODO confirm); fall back to a very deep fetch.
        _run(repo, "fetch", "--depth=100000", "--tags", "--prune")

    pathspec = "src/transformers/models"
    log_lines = _run(
        repo, "log", "--reverse", "--format=%H|%cI", "HEAD", "--", pathspec
    ).splitlines()
    commits = [tuple(line.split("|", 1)) for line in log_lines if "|" in line]
    commits = _sample_commits(commits)

    series = []
    for sha, date_iso in commits:
        # Start from the all-zero fallback point and fill it in on success.
        point = {
            "sha": sha,
            "date": date_iso,
            "loc_modeling_all": 0,
            "loc_modular": 0,
            "loc_modeling_included": 0,
            "effective_loc": 0,
            "n_models_with_modular": 0,
        }
        try:
            proc = subprocess.run(
                ["git", "archive", sha, "--", pathspec],
                cwd=repo, capture_output=True, timeout=180,
            )
        except subprocess.TimeoutExpired:
            # Treat a slow archive like a failed one instead of aborting the run.
            proc = None

        if proc is not None and proc.returncode == 0 and proc.stdout:
            modeling_by_model, modular_by_model = _tally_archive(proc.stdout)
            modeling_all = sum(modeling_by_model.values())
            modular_loc = sum(modular_by_model.values())
            # Modeling files of models that ship a modular file are not counted
            # towards the effective total; the modular file stands in for them.
            modeling_excluded = sum(
                modeling_by_model.get(model, 0) for model in modular_by_model
            )
            modeling_included = modeling_all - modeling_excluded
            point["loc_modeling_all"] = modeling_all
            point["loc_modular"] = modular_loc
            point["loc_modeling_included"] = modeling_included
            point["effective_loc"] = modeling_included + modular_loc
            point["n_models_with_modular"] = len(modular_by_model)

        series.append(point)

    return {"series": series, "cutoff": MODULAR_CUTOFF_ISO}

def _loc_html(loc: dict) -> str:
    data = json.dumps(loc["series"], separators=(",", ":"))
    cutoff = loc["cutoff"]
    return f"""<!doctype html><meta charset=utf-8>
<title>LOC growth</title>
<div id=chart style="height:60vh;width:90vw;margin:2rem auto;"></div>
<script src="https://cdn.jsdelivr.net/npm/apexcharts"></script>
<script>
const raw={data};
const xs=raw.map(d=>new Date(d.date).getTime());
const eff=raw.map(d=>d.effective_loc);
const mod=raw.map(d=>d.loc_modular);
const mdl_all=raw.map(d=>d.loc_modeling_all);
const mdl_inc=raw.map(d=>d.loc_modeling_included);
const cutoffTs=new Date("{cutoff}T00:00:00Z").getTime();
const opts={{
  chart:{{type:"line",height:"100%"}},
  series:[
    {{name:"Effective LOC",data:xs.map((t,i)=>[t,eff[i]])}},
    {{name:"Modular LOC",data:xs.map((t,i)=>[t,mod[i]])}},
    {{name:"Modeling LOC (all)",data:xs.map((t,i)=>[t,mdl_all[i]])}},
    {{name:"Modeling LOC (included)",data:xs.map((t,i)=>[t,mdl_inc[i]])}}
  ],
  xaxis:{{type:"datetime"}},
  yaxis:{{labels:{{formatter:v=>Math.round(v)}}}},
  stroke:{{width:2}},
  tooltip:{{shared:true,x:{{format:"yyyy-MM-dd"}}}},
  annotations:{{xaxis:[{{x:cutoffTs,borderColor:"#e11d48",label:{{text:"2024-05-31 modular",style:{{color:"#fff",background:"#e11d48"}}}}}}]}}
}};
new ApexCharts(document.getElementById("chart"),opts).render();
</script>"""

def main():
    """Build every cached artifact for the current HEAD of REPO_URL and upload it.

    Steps:
      1. Shallow-clone REPO_URL into a temporary directory and read its HEAD sha.
      2. Compute the LOC-growth series plus the graph and timeline data, and
         render each to JSON and HTML strings.
      3. Remove the temporary clone (this also happens when a step fails).
      4. Upload the six artifacts to the CACHE_REPO dataset under
         ``<kind>/<sha>/<SIM_METHOD>-m<0|1>.<ext>``, then upload ``latest.json``
         pointing at them. ``latest.json`` goes last so it is only written
         after all six artifact uploads succeeded.

    Raises:
        subprocess.CalledProcessError: the clone or ``git rev-parse`` failed.
        Any exception raised by the build helpers or by the Hub client is
        propagated unchanged.
    """
    # TemporaryDirectory deletes the clone on exit; the clone is deepened to
    # full history during the LOC computation, so leaving it behind is costly.
    with tempfile.TemporaryDirectory() as tmp:
        repo_path = Path(tmp) / "repo"
        subprocess.check_call(["git", "clone", "--depth", "1", REPO_URL, str(repo_path)])
        sha = subprocess.check_output(
            ["git", "rev-parse", "HEAD"], cwd=repo_path, text=True
        ).strip()

        loc_growth = _compute_loc_growth(repo_path)
        graph = build_graph_json(
            repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD
        )
        timeline = build_timeline_json(
            repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD
        )

        # Serialise everything while the clone still exists so nothing below
        # depends on files inside the temporary directory.
        contents = {
            "graph_json": json.dumps(graph, separators=(",", ":")),
            "graph_html": generate_html(graph),
            "timeline_json": json.dumps(timeline, separators=(",", ":")),
            "timeline_html": generate_timeline_html(timeline),
            "loc_json": json.dumps(loc_growth, separators=(",", ":")),
            "loc_html": _loc_html(loc_growth),
        }

    api = HfApi()
    api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)

    key = f"{sha}/{SIM_METHOD}-m{int(MULTIMODAL)}"
    # Single source of truth for the upload locations: used both in latest.json
    # and for the uploads themselves.
    paths = {
        "graph_json": f"graph/{key}.json",
        "graph_html": f"graph/{key}.html",
        "timeline_json": f"timeline/{key}.json",
        "timeline_html": f"timeline/{key}.html",
        "loc_json": f"loc/{key}.json",
        "loc_html": f"loc/{key}.html",
    }
    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {"sim_method": SIM_METHOD, "min_threshold": MIN_THRESH, "multimodal": MULTIMODAL},
        "paths": paths,
    }

    def put(path_in_repo: str, text: str):
        """Upload *text* (UTF-8 encoded) to *path_in_repo* in the cache dataset."""
        api.upload_file(
            path_or_fileobj=io.BytesIO(text.encode("utf-8")),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            repo_type="dataset",
            commit_message=f"cache {path_in_repo}",
        )

    for name, path_in_repo in paths.items():
        put(path_in_repo, contents[name])
    put("latest.json", json.dumps(latest, separators=(",", ":")))

# Run the cache build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()