build a clean cache
Browse files- app.py +18 -60
- build_cache.py +64 -0
- schedule_job.json +23 -0
app.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1 |
-
# app.py β Gradio Space wrapper for modular_graph_and_candidates
|
2 |
-
|
3 |
from __future__ import annotations
|
4 |
|
5 |
import json
|
@@ -9,36 +7,31 @@ import tempfile
|
|
9 |
from datetime import datetime, timedelta
|
10 |
from functools import lru_cache
|
11 |
from pathlib import Path
|
|
|
12 |
|
13 |
import gradio as gr
|
14 |
|
15 |
-
# ββ refactored helpers ββ
|
16 |
from modular_graph_and_candidates import build_graph_json, generate_html, build_timeline_json, generate_timeline_html
|
17 |
|
18 |
HF_MAIN_REPO = "https://github.com/huggingface/transformers"
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
@lru_cache(maxsize=4)
|
23 |
-
def clone_or_cache(repo_url: str) -> Path:
|
24 |
-
"""Shallowβclone *repo_url* and reuse it for 24β―h."""
|
25 |
-
tmp_root = Path(tempfile.gettempdir())
|
26 |
-
cache_dir = tmp_root / f"repo_{abs(hash(repo_url))}"
|
27 |
-
stamp = cache_dir / ".cloned_at"
|
28 |
-
|
29 |
-
if cache_dir.exists() and stamp.exists():
|
30 |
-
try:
|
31 |
-
if datetime.utcnow() - datetime.fromisoformat(stamp.read_text().strip()) < timedelta(days=1):
|
32 |
-
return cache_dir
|
33 |
-
except Exception:
|
34 |
-
pass # fall through β reclone
|
35 |
-
shutil.rmtree(cache_dir, ignore_errors=True)
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
-
# βββββββββββββββββββββββββββββ main callback βββββββββββββββββββββββββββββββββββββ
|
42 |
|
43 |
def _escape_srcdoc(text: str) -> str:
|
44 |
"""Escape for inclusion inside an <iframe srcdoc="β¦"> attribute."""
|
@@ -52,48 +45,13 @@ def _escape_srcdoc(text: str) -> str:
|
|
52 |
|
53 |
|
54 |
def run_graph(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
    """Clone *repo_url*, build its similarity graph, and render it for the UI.

    Returns a tuple of (iframe HTML embedding the rendered graph, path to a
    temporary JSON file holding the raw graph data for download).
    """
    repo_path = clone_or_cache(repo_url)

    graph = build_graph_json(
        transformers_dir=repo_path,
        threshold=threshold,
        multimodal=multimodal,
        sim_method=sim_method,
    )

    raw_html = generate_html(graph)
    iframe_html = (
        f'<iframe style="width:100%;height:85vh;border:none;" '
        f'srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
    )

    # tempfile.mktemp is deprecated and race-prone: another process can claim
    # the returned name before we write.  NamedTemporaryFile(delete=False)
    # creates the file atomically and leaves it on disk for the caller.
    with tempfile.NamedTemporaryFile(
        "w", suffix=".json", delete=False, encoding="utf-8"
    ) as fh:
        json.dump(graph, fh)
    return iframe_html, fh.name
|
75 |
|
76 |
def run_timeline(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
    """Clone *repo_url*, build its modularization timeline, and render it.

    Returns a tuple of (iframe HTML embedding the rendered timeline, path to a
    temporary JSON file holding the raw timeline data for download).
    """
    repo_path = clone_or_cache(repo_url)

    timeline = build_timeline_json(
        transformers_dir=repo_path,
        threshold=threshold,
        multimodal=multimodal,
        sim_method=sim_method,
    )

    raw_html = generate_timeline_html(timeline)
    iframe_html = (
        f'<iframe style="width:100%;height:85vh;border:none;" '
        f'srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
    )

    # tempfile.mktemp is deprecated and race-prone: another process can claim
    # the returned name before we write.  NamedTemporaryFile(delete=False)
    # creates the file atomically and leaves it on disk for the caller.
    with tempfile.NamedTemporaryFile(
        "w", suffix="_timeline.json", delete=False, encoding="utf-8"
    ) as fh:
        json.dump(timeline, fh)
    return iframe_html, fh.name
|
97 |
|
98 |
# βββββββββββββββββββββββββββββ UI ββββββββββββββββββββββββββββββββββββββββββββββββ
|
99 |
|
|
|
|
|
|
|
1 |
from __future__ import annotations
|
2 |
|
3 |
import json
|
|
|
7 |
from datetime import datetime, timedelta
|
8 |
from functools import lru_cache
|
9 |
from pathlib import Path
|
10 |
+
from huggingface_hub import hf_hub_download
|
11 |
|
12 |
import gradio as gr
|
13 |
|
|
|
14 |
from modular_graph_and_candidates import build_graph_json, generate_html, build_timeline_json, generate_timeline_html
|
15 |
|
16 |
HF_MAIN_REPO = "https://github.com/huggingface/transformers"
|
17 |
|
18 |
+
CACHE_REPO = "Molbap/hf_cached_embeds_log"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
+
def _fetch_from_cache_repo(kind: str, sim_method: str, threshold: float, multimodal: bool):
    """Serve pre-rendered results (*kind* is "graph" or "timeline") from the HF dataset cache.

    Reads ``latest.json`` to find the commit SHA of the most recent cache
    build, downloads the matching HTML/JSON pair, and returns a tuple of
    (iframe HTML for display, path to a temp JSON copy for download).
    hf_hub_download raises if the requested parameter combination was never
    cached by build_cache.py.
    """
    repo_id = CACHE_REPO
    latest_fp = hf_hub_download(repo_id=repo_id, filename="latest.json", repo_type="dataset")
    info = json.loads(Path(latest_fp).read_text(encoding="utf-8"))
    sha = info.get("sha")
    # Key layout must stay in sync with build_cache.py: "<sha>/<method>-<thr>-m<0|1>".
    key = f"{sha}/{sim_method}-{threshold:.2f}-m{int(multimodal)}"
    html_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.html", repo_type="dataset")
    json_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.json", repo_type="dataset")
    raw_html = Path(html_fp).read_text(encoding="utf-8")
    json_text = Path(json_fp).read_text(encoding="utf-8")
    iframe_html = f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
    # Bug fix: tempfile.mkstemp() returns (fd, path); taking [1] and discarding
    # the fd leaked one open file descriptor per request.  NamedTemporaryFile
    # closes the descriptor for us and (delete=False) keeps the file around.
    suffix = "_timeline.json" if kind == "timeline" else ".json"
    with tempfile.NamedTemporaryFile("w", suffix=suffix, delete=False, encoding="utf-8") as fh:
        fh.write(json_text)
    return iframe_html, fh.name
|
34 |
|
|
|
35 |
|
36 |
def _escape_srcdoc(text: str) -> str:
|
37 |
"""Escape for inclusion inside an <iframe srcdoc="β¦"> attribute."""
|
|
|
45 |
|
46 |
|
47 |
def run_graph(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
    # repo_url is intentionally unused: results are precomputed offline by
    # build_cache.py and served from the HF dataset cache; the parameter is
    # kept so the Gradio callback signature stays unchanged.
    return _fetch_from_cache_repo("graph", sim_method, threshold, multimodal)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
def run_timeline(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
    # repo_url is intentionally unused: results are precomputed offline by
    # build_cache.py and served from the HF dataset cache; the parameter is
    # kept so the Gradio callback signature stays unchanged.
    return _fetch_from_cache_repo("timeline", sim_method, threshold, multimodal)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
|
|
|
|
|
|
55 |
|
56 |
# βββββββββββββββββββββββββββββ UI ββββββββββββββββββββββββββββββββββββββββββββββββ
|
57 |
|
build_cache.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# build_cache.py
|
2 |
+
import os
|
3 |
+
import io
|
4 |
+
import json
|
5 |
+
import subprocess
|
6 |
+
import tempfile
|
7 |
+
from pathlib import Path
|
8 |
+
from datetime import datetime, timezone
|
9 |
+
from huggingface_hub import HfApi
|
10 |
+
from modular_graph_and_candidates import (
|
11 |
+
build_graph_json, generate_html,
|
12 |
+
build_timeline_json, generate_timeline_html
|
13 |
+
)
|
14 |
+
|
15 |
+
# Source repository to analyse, and the HF dataset repo that stores the cache.
REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
CACHE_REPO = "Molbap/hf_cached_embeds_log"

# Tunables, overridable via the job environment (see schedule_job.json).
THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))
# Case-insensitive truthiness: the previous fixed set of spellings silently
# parsed common values like "TRUE" or "Yes" as False.
MULTIMODAL = os.getenv("MULTIMODAL", "0").strip().lower() in {"1", "true", "yes"}
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
|
20 |
+
|
21 |
+
def main():
    """Clone the target repo, build graph + timeline artefacts, and push them
    (plus a ``latest.json`` pointer) to the HF dataset cache repo.

    Authentication relies on HfApi picking up HF_TOKEN from the environment.
    """
    # Bug fix: tempfile.mkdtemp() left the full shallow clone behind on every
    # run; TemporaryDirectory guarantees cleanup even if a build step raises.
    with tempfile.TemporaryDirectory() as tmp_name:
        repo_path = Path(tmp_name) / "repo"
        subprocess.check_call(["git", "clone", "--depth", "1", REPO_URL, str(repo_path)])
        sha = subprocess.check_output(
            ["git", "rev-parse", "HEAD"], cwd=repo_path, text=True
        ).strip()

        graph = build_graph_json(repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
        timeline = build_timeline_json(repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
        graph_html = generate_html(graph)
        timeline_html = generate_timeline_html(timeline)

    api = HfApi()
    api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)

    # Key layout must stay in sync with the app's _fetch_from_cache_repo.
    key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"
    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {"sim_method": SIM_METHOD, "threshold": THRESH, "multimodal": MULTIMODAL},
        "paths": {
            "graph_json": f"graph/{key}.json",
            "graph_html": f"graph/{key}.html",
            "timeline_json": f"timeline/{key}.json",
            "timeline_html": f"timeline/{key}.html",
        },
    }

    def put(path_in_repo: str, text: str):
        """Upload *text* as a UTF-8 file at *path_in_repo* in the cache repo."""
        api.upload_file(
            path_or_fileobj=io.BytesIO(text.encode("utf-8")),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            repo_type="dataset",
            commit_message=f"cache {path_in_repo}",
        )

    # Upload the data files first and latest.json last, so a concurrent reader
    # never follows a pointer to files that are not yet present.
    put(f"graph/{key}.json", json.dumps(graph, separators=(",", ":")))
    put(f"graph/{key}.html", graph_html)
    put(f"timeline/{key}.json", json.dumps(timeline, separators=(",", ":")))
    put(f"timeline/{key}.html", timeline_html)
    put("latest.json", json.dumps(latest, separators=(",", ":")))

if __name__ == "__main__":
    main()
|
schedule_job.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"jobSpec": {
|
3 |
+
"dockerImage": "python:3.12",
|
4 |
+
"command": [
|
5 |
+
"bash",
|
6 |
+
"-lc",
|
7 |
+
"apt-get update -y && apt-get install -y git ca-certificates && git clone --depth 1 https://github.com/Molbap/transformers-modular-refactor.git /w && pip install -U huggingface_hub numpy tqdm sentence-transformers torch spaces && python /w/build_cache.py"
|
8 |
+
],
|
9 |
+
"flavor": "cpu-basic",
|
10 |
+
"timeout": 7200,
|
11 |
+
"env": {
|
12 |
+
"REPO_URL": "https://github.com/huggingface/transformers",
|
13 |
+
"SIM_THRESHOLD": "0.50",
|
14 |
+
"MULTIMODAL": "0",
|
15 |
+
"SIM_METHOD": "jaccard"
|
16 |
+
},
|
17 |
+
"secrets": ["HF_TOKEN"],
|
18 |
+
"arch": "amd64"
|
19 |
+
},
|
20 |
+
"schedule": "8 9 * * *",
|
21 |
+
"suspend": false,
|
22 |
+
"concurrency": false
|
23 |
+
}
|