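"""Build and cache visualization artifacts for the transformers model zoo.

Clones the repo, computes (1) a model-similarity graph and timeline via
modular_graph_and_candidates and (2) a lines-of-code growth series over
src/transformers/models, then uploads the JSON/HTML artifacts to the
CACHE_REPO dataset on the Hugging Face Hub.
"""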
import io
import json
import os
import subprocess
import tarfile
import tempfile
from datetime import datetime, timezone
from pathlib import Path

from huggingface_hub import HfApi

from modular_graph_and_candidates import (
    build_graph_json,
    build_timeline_json,
    generate_html,
    generate_timeline_html,
)

REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
CACHE_REPO = "Molbap/hf_cached_embeds_log"  # Hub dataset repo used as the cache
MIN_THRESH = 0.1  # minimum similarity score kept in the graph
MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1", "true", "True", "YES", "yes"}
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
MODULAR_CUTOFF_ISO = "2024-05-31"  # drawn as a vertical annotation on the LOC chart
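# The defaults above can be overridden from the environment, e.g.
# (the script filename here is illustrative, not taken from the repo):
#   SIM_METHOD=jaccard MULTIMODAL=1 python cache_build.py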


def _run(cwd: Path, *args: str) -> str:
    """Run a git command in `cwd` and return stdout, raising on nonzero exit."""
    p = subprocess.run(["git", *args], cwd=cwd, text=True, capture_output=True, timeout=1200)
    if p.returncode != 0:
        raise RuntimeError(p.stderr.strip()[:400])
    return p.stdout


def _count_lines(text: str) -> int:
    """Count lines, including a final line that lacks a trailing newline."""
    return text.count("\n") + (1 if text and not text.endswith("\n") else 0)
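# e.g. _count_lines("a\nb") == 2 and _count_lines("a\nb\n") == 2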


def _compute_loc_growth(repo: Path) -> dict:
    """Walk the history of src/transformers/models and measure LOC per commit.

    For each sampled commit, modeling_*.py files of models that ship a
    modular_*.py are replaced by the modular file in the "effective" total,
    so the series approximates the code that is maintained by hand.
    """
    # The clone in main() is shallow; deepen it so the full history is visible.
    try:
        _run(repo, "fetch", "--unshallow", "--tags", "--prune")
    except Exception:
        _run(repo, "fetch", "--depth=100000", "--tags", "--prune")

    pathspec = "src/transformers/models"
    lines = _run(repo, "log", "--reverse", "--format=%H|%cI", "HEAD", "--", pathspec).splitlines()
    commits = [tuple(ln.split("|", 1)) for ln in lines if "|" in ln]
    # Cap the amount of `git archive` work: sample roughly 300 commits.
    total = len(commits)
    if total > 500:
        step = max(1, total // 300)
        commits = commits[::step]

    out = []
    for sha, date_iso in commits:
        # `git archive` dumps the tree at `sha` restricted to the pathspec.
        proc = subprocess.run(
            ["git", "archive", sha, "--", pathspec],
            cwd=repo, capture_output=True, timeout=180
        )
        if proc.returncode != 0 or not proc.stdout:
            # Path absent at this commit (or archive failed): record a zero row.
            out.append({
                "sha": sha, "date": date_iso,
                "loc_modeling_all": 0, "loc_modular": 0,
                "loc_modeling_included": 0, "effective_loc": 0,
                "n_models_with_modular": 0,
            })
            continue

        buf = io.BytesIO(proc.stdout)
        modeling_by_model = {}
        modular_by_model = {}

        with tarfile.open(fileobj=buf, mode="r:*") as tar:
            for m in tar.getmembers():
                if not m.isfile():
                    continue
                name = m.name
                if not name.endswith(".py"):
                    continue
                if "/models/" not in name:
                    continue
                # The model name is the path component right after "models/".
                parts = name.split("/")
                try:
                    idx = parts.index("models")
                    model = parts[idx + 1] if idx + 1 < len(parts) else ""
                except ValueError:
                    model = ""
                if not model:
                    continue
                if "/modeling_" in name or "/modular_" in name:
                    f = tar.extractfile(m)
                    if not f:
                        continue
                    try:
                        txt = f.read().decode("utf-8", errors="ignore")
                    finally:
                        f.close()
                    n = _count_lines(txt)
                    if "/modular_" in name:
                        modular_by_model[model] = modular_by_model.get(model, 0) + n
                    elif "/modeling_" in name:
                        modeling_by_model[model] = modeling_by_model.get(model, 0) + n

        # Exclude modeling files for models that have a modular file; the
        # effective total counts the modular source plus remaining modeling code.
        modeling_all = sum(modeling_by_model.values())
        modular_loc = sum(modular_by_model.values())
        models_with_modular = set(modular_by_model)
        modeling_excluded = sum(modeling_by_model.get(m, 0) for m in models_with_modular)
        modeling_included = modeling_all - modeling_excluded
        effective = modeling_included + modular_loc

        out.append({
            "sha": sha,
            "date": date_iso,
            "loc_modeling_all": modeling_all,
            "loc_modular": modular_loc,
            "loc_modeling_included": modeling_included,
            "effective_loc": effective,
            "n_models_with_modular": len(models_with_modular),
        })

    return {"series": out, "cutoff": MODULAR_CUTOFF_ISO}
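# Each series row has the shape (values illustrative):
#   {"sha": "<commit>", "date": "2024-06-01T12:00:00+00:00",
#    "loc_modeling_all": 900000, "loc_modular": 40000,
#    "loc_modeling_included": 700000, "effective_loc": 740000,
#    "n_models_with_modular": 30}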


def _loc_html(loc: dict) -> str:
    """Render the LOC series as a standalone ApexCharts page.

    Literal braces in the JavaScript below are doubled so the f-string
    leaves them intact.
    """
    data = json.dumps(loc["series"], separators=(",", ":"))
    cutoff = loc["cutoff"]
    return f"""<!doctype html><meta charset=utf-8>
<title>LOC growth</title>
<div id=chart style="height:60vh;width:90vw;margin:2rem auto;"></div>
<script src="https://cdn.jsdelivr.net/npm/apexcharts"></script>
<script>
const raw={data};
const xs=raw.map(d=>new Date(d.date).getTime());
const eff=raw.map(d=>d.effective_loc);
const mod=raw.map(d=>d.loc_modular);
const mdl_all=raw.map(d=>d.loc_modeling_all);
const mdl_inc=raw.map(d=>d.loc_modeling_included);
const cutoffTs=new Date("{cutoff}T00:00:00Z").getTime();
const opts={{
  chart:{{type:"line",height:"100%"}},
  series:[
    {{name:"Effective LOC",data:xs.map((t,i)=>[t,eff[i]])}},
    {{name:"Modular LOC",data:xs.map((t,i)=>[t,mod[i]])}},
    {{name:"Modeling LOC (all)",data:xs.map((t,i)=>[t,mdl_all[i]])}},
    {{name:"Modeling LOC (included)",data:xs.map((t,i)=>[t,mdl_inc[i]])}}
  ],
  xaxis:{{type:"datetime"}},
  yaxis:{{labels:{{formatter:v=>Math.round(v)}}}},
  stroke:{{width:2}},
  tooltip:{{shared:true,x:{{format:"yyyy-MM-dd"}}}},
  annotations:{{xaxis:[{{x:cutoffTs,borderColor:"#e11d48",label:{{text:"{cutoff} modular",style:{{color:"#fff",background:"#e11d48"}}}}}}]}}
}};
new ApexCharts(document.getElementById("chart"),opts).render();
</script>"""
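# A minimal sketch of how a consumer could resolve the cached artifacts from
# the pointer file written below (hf_hub_download is the standard
# huggingface_hub API; the variable names are illustrative):
#
#   from huggingface_hub import hf_hub_download
#   with open(hf_hub_download(CACHE_REPO, "latest.json", repo_type="dataset")) as fh:
#       latest = json.load(fh)
#   graph_path = hf_hub_download(CACHE_REPO, latest["paths"]["graph_json"], repo_type="dataset")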


def main():
    tmp = Path(tempfile.mkdtemp())
    repo_path = tmp / "repo"
    subprocess.check_call(["git", "clone", "--depth", "1", REPO_URL, str(repo_path)])
    sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=repo_path, text=True).strip()

    loc_growth = _compute_loc_growth(repo_path)
    loc_json_str = json.dumps(loc_growth, separators=(",", ":"))
    loc_html_str = _loc_html(loc_growth)

    graph = build_graph_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
    timeline = build_timeline_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
    graph_html = generate_html(graph)
    timeline_html = generate_timeline_html(timeline)

    api = HfApi()
    api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)

    # Artifacts are keyed by commit and settings so distinct runs never collide.
    key = f"{sha}/{SIM_METHOD}-m{int(MULTIMODAL)}"
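    # e.g. key == "abc1234.../jaccard-m0", giving paths like
    # "graph/abc1234.../jaccard-m0.json" (the sha shown is illustrative).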
    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {"sim_method": SIM_METHOD, "min_threshold": MIN_THRESH, "multimodal": MULTIMODAL},
        "paths": {
            "graph_json": f"graph/{key}.json",
            "graph_html": f"graph/{key}.html",
            "timeline_json": f"timeline/{key}.json",
            "timeline_html": f"timeline/{key}.html",
            "loc_json": f"loc/{key}.json",
            "loc_html": f"loc/{key}.html",
        },
    }

    def put(path_in_repo: str, text: str):
        """Upload a text blob to the cache repo without touching local disk."""
        api.upload_file(
            path_or_fileobj=io.BytesIO(text.encode("utf-8")),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            repo_type="dataset",
            commit_message=f"cache {path_in_repo}",
        )

    put(f"graph/{key}.json", json.dumps(graph, separators=(",", ":")))
    put(f"graph/{key}.html", graph_html)
    put(f"timeline/{key}.json", json.dumps(timeline, separators=(",", ":")))
    put(f"timeline/{key}.html", timeline_html)
    put(f"loc/{key}.json", loc_json_str)
    put(f"loc/{key}.html", loc_html_str)
    # latest.json is written last so readers never see a pointer to missing files.
    put("latest.json", json.dumps(latest, separators=(",", ":")))


if __name__ == "__main__":
    main()