Molbap HF Staff committed on
Commit
6641fa8
Β·
1 Parent(s): 49600c8

build a clean cache

Browse files
Files changed (3) hide show
  1. app.py +18 -60
  2. build_cache.py +64 -0
  3. schedule_job.json +23 -0
app.py CHANGED
@@ -1,5 +1,3 @@
1
- # app.py – Gradio Space wrapper for modular_graph_and_candidates
2
-
3
  from __future__ import annotations
4
 
5
  import json
@@ -9,36 +7,31 @@ import tempfile
9
  from datetime import datetime, timedelta
10
  from functools import lru_cache
11
  from pathlib import Path
 
12
 
13
  import gradio as gr
14
 
15
- # β€”β€” refactored helpers β€”β€”
16
  from modular_graph_and_candidates import build_graph_json, generate_html, build_timeline_json, generate_timeline_html
17
 
18
  HF_MAIN_REPO = "https://github.com/huggingface/transformers"
19
 
20
- # ───────────────────────────── cache repo once per 24β€―h ───────────────────────────
21
-
22
- @lru_cache(maxsize=4)
23
- def clone_or_cache(repo_url: str) -> Path:
24
- """Shallow‑clone *repo_url* and reuse it for 24β€―h."""
25
- tmp_root = Path(tempfile.gettempdir())
26
- cache_dir = tmp_root / f"repo_{abs(hash(repo_url))}"
27
- stamp = cache_dir / ".cloned_at"
28
-
29
- if cache_dir.exists() and stamp.exists():
30
- try:
31
- if datetime.utcnow() - datetime.fromisoformat(stamp.read_text().strip()) < timedelta(days=1):
32
- return cache_dir
33
- except Exception:
34
- pass # fall through β†’ reclone
35
- shutil.rmtree(cache_dir, ignore_errors=True)
36
 
37
- subprocess.check_call(["git", "clone", "--depth", "1", repo_url, str(cache_dir)])
38
- stamp.write_text(datetime.utcnow().isoformat())
39
- return cache_dir
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- # ───────────────────────────── main callback ─────────────────────────────────────
42
 
43
  def _escape_srcdoc(text: str) -> str:
44
  """Escape for inclusion inside an <iframe srcdoc="…"> attribute."""
@@ -52,48 +45,13 @@ def _escape_srcdoc(text: str) -> str:
52
 
53
 
54
  def run_graph(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
55
- """Generate the dependency graph visualization."""
56
- repo_path = clone_or_cache(repo_url)
57
-
58
- graph = build_graph_json(
59
- transformers_dir=repo_path,
60
- threshold=threshold,
61
- multimodal=multimodal,
62
- sim_method=sim_method,
63
- )
64
-
65
- raw_html = generate_html(graph)
66
 
67
- iframe_html = (
68
- f'<iframe style="width:100%;height:85vh;border:none;" '
69
- f'srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
70
- )
71
-
72
- tmp_json = Path(tempfile.mktemp(suffix=".json"))
73
- tmp_json.write_text(json.dumps(graph), encoding="utf-8")
74
- return iframe_html, str(tmp_json)
75
 
76
  def run_timeline(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
77
- """Generate the chronological timeline visualization."""
78
- repo_path = clone_or_cache(repo_url)
79
-
80
- timeline = build_timeline_json(
81
- transformers_dir=repo_path,
82
- threshold=threshold,
83
- multimodal=multimodal,
84
- sim_method=sim_method,
85
- )
86
 
87
- raw_html = generate_timeline_html(timeline)
88
-
89
- iframe_html = (
90
- f'<iframe style="width:100%;height:85vh;border:none;" '
91
- f'srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
92
- )
93
 
94
- tmp_json = Path(tempfile.mktemp(suffix="_timeline.json"))
95
- tmp_json.write_text(json.dumps(timeline), encoding="utf-8")
96
- return iframe_html, str(tmp_json)
97
 
98
  # ───────────────────────────── UI ────────────────────────────────────────────────
99
 
 
 
 
1
  from __future__ import annotations
2
 
3
  import json
 
7
  from datetime import datetime, timedelta
8
  from functools import lru_cache
9
  from pathlib import Path
10
+ from huggingface_hub import hf_hub_download
11
 
12
  import gradio as gr
13
 
 
14
  from modular_graph_and_candidates import build_graph_json, generate_html, build_timeline_json, generate_timeline_html
15
 
16
  HF_MAIN_REPO = "https://github.com/huggingface/transformers"
17
 
18
+ CACHE_REPO = "Molbap/hf_cached_embeds_log"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
def _fetch_from_cache_repo(kind: str, sim_method: str, threshold: float, multimodal: bool):
    """Download a pre-built visualization (HTML + JSON) from the HF dataset cache repo.

    Parameters
    ----------
    kind:
        ``"graph"`` or ``"timeline"`` — selects which artifact family to fetch.
    sim_method:
        Similarity method the cache was built with (part of the cache key).
    threshold:
        Similarity threshold; formatted to two decimals in the cache key.
    multimodal:
        Whether multimodal models were included; encoded as ``m0``/``m1``.

    Returns
    -------
    tuple[str, str]
        ``(iframe_html, path_to_local_json_copy)``.

    Raises
    ------
    Errors from ``hf_hub_download`` if the requested cache entry is missing.
    """
    repo_id = CACHE_REPO
    # latest.json records the transformers commit sha the cache was built from.
    latest_fp = hf_hub_download(repo_id=repo_id, filename="latest.json", repo_type="dataset")
    info = json.loads(Path(latest_fp).read_text(encoding="utf-8"))
    sha = info.get("sha")
    # Key layout must stay in sync with the writer (build_cache.py):
    # {sha}/{method}-{threshold:.2f}-m{0|1}
    key = f"{sha}/{sim_method}-{threshold:.2f}-m{int(multimodal)}"
    html_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.html", repo_type="dataset")
    json_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.json", repo_type="dataset")
    raw_html = Path(html_fp).read_text(encoding="utf-8")
    json_text = Path(json_fp).read_text(encoding="utf-8")
    iframe_html = f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
    # Bug fix: tempfile.mkstemp()[1] leaked the open file descriptor returned as
    # element [0]. NamedTemporaryFile(delete=False) closes the handle on exit
    # from the `with` block while keeping the file on disk for the caller.
    suffix = "_timeline.json" if kind == "timeline" else ".json"
    with tempfile.NamedTemporaryFile(mode="w", suffix=suffix, delete=False, encoding="utf-8") as fh:
        fh.write(json_text)
        tmp_path = fh.name
    return iframe_html, tmp_path
34
 
 
35
 
36
  def _escape_srcdoc(text: str) -> str:
37
  """Escape for inclusion inside an <iframe srcdoc="…"> attribute."""
 
45
 
46
 
47
def run_graph(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
    """Serve the dependency-graph view for the given settings from the pre-built cache."""
    # repo_url is accepted for UI-signature compatibility; artifacts are keyed
    # only by (sim_method, threshold, multimodal) inside the cache repo.
    return _fetch_from_cache_repo("graph", sim_method, threshold, multimodal)
 
 
 
 
 
 
 
 
 
 
49
 
 
 
 
 
 
 
 
 
50
 
51
def run_timeline(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
    """Serve the chronological-timeline view for the given settings from the pre-built cache."""
    # repo_url is accepted for UI-signature compatibility; artifacts are keyed
    # only by (sim_method, threshold, multimodal) inside the cache repo.
    return _fetch_from_cache_repo("timeline", sim_method, threshold, multimodal)
 
 
 
 
 
 
 
 
53
 
 
 
 
 
 
 
54
 
 
 
 
55
 
56
  # ───────────────────────────── UI ────────────────────────────────────────────────
57
 
build_cache.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # build_cache.py
2
+ import os
3
+ import io
4
+ import json
5
+ import subprocess
6
+ import tempfile
7
+ from pathlib import Path
8
+ from datetime import datetime, timezone
9
+ from huggingface_hub import HfApi
10
+ from modular_graph_and_candidates import (
11
+ build_graph_json, generate_html,
12
+ build_timeline_json, generate_timeline_html
13
+ )
14
+
15
+ REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
16
+ CACHE_REPO = "Molbap/hf_cached_embeds_log"
17
+ THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))
18
+ MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1","true","True","YES","yes"}
19
+ SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
20
+
21
def main():
    """Build graph + timeline artifacts for the current transformers HEAD and
    upload them to the HF dataset cache repo.

    Upload order matters: ``latest.json`` is written LAST, so a reader that
    resolves a sha from ``latest.json`` always finds the artifacts it points to
    already uploaded.
    """
    import shutil  # local import: only needed here for temp-dir cleanup

    tmp = Path(tempfile.mkdtemp())
    try:
        repo_path = tmp / "repo"
        subprocess.check_call(["git", "clone", "--depth", "1", REPO_URL, str(repo_path)])
        sha = subprocess.check_output(
            ["git", "rev-parse", "HEAD"], cwd=repo_path, text=True
        ).strip()

        graph = build_graph_json(repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
        timeline = build_timeline_json(repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
        graph_html = generate_html(graph)
        timeline_html = generate_timeline_html(timeline)
    finally:
        # Bug fix: mkdtemp() is never cleaned up automatically — the original
        # leaked a full shallow clone of transformers on every run.
        shutil.rmtree(tmp, ignore_errors=True)

    api = HfApi()
    api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)

    # Key layout must stay in sync with the reader (app.py):
    # {sha}/{method}-{threshold:.2f}-m{0|1}
    key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"
    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {"sim_method": SIM_METHOD, "threshold": THRESH, "multimodal": MULTIMODAL},
        "paths": {
            "graph_json": f"graph/{key}.json",
            "graph_html": f"graph/{key}.html",
            "timeline_json": f"timeline/{key}.json",
            "timeline_html": f"timeline/{key}.html",
        },
    }

    def put(path_in_repo: str, text: str) -> None:
        # Upload one text artifact; upload_file accepts raw bytes directly, so
        # the io.BytesIO wrapper from the original is unnecessary.
        api.upload_file(
            path_or_fileobj=text.encode("utf-8"),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            repo_type="dataset",
            commit_message=f"cache {path_in_repo}",
        )

    put(f"graph/{key}.json", json.dumps(graph, separators=(",", ":")))
    put(f"graph/{key}.html", graph_html)
    put(f"timeline/{key}.json", json.dumps(timeline, separators=(",", ":")))
    put(f"timeline/{key}.html", timeline_html)
    # Written last: flips readers over to the freshly uploaded artifacts.
    put("latest.json", json.dumps(latest, separators=(",", ":")))


if __name__ == "__main__":
    main()
schedule_job.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "jobSpec": {
3
+ "dockerImage": "python:3.12",
4
+ "command": [
5
+ "bash",
6
+ "-lc",
7
+ "apt-get update -y && apt-get install -y git ca-certificates && git clone --depth 1 https://github.com/Molbap/transformers-modular-refactor.git /w && pip install -U huggingface_hub numpy tqdm sentence-transformers torch spaces && python /w/build_cache.py"
8
+ ],
9
+ "flavor": "cpu-basic",
10
+ "timeout": 7200,
11
+ "env": {
12
+ "REPO_URL": "https://github.com/huggingface/transformers",
13
+ "SIM_THRESHOLD": "0.50",
14
+ "MULTIMODAL": "0",
15
+ "SIM_METHOD": "jaccard"
16
+ },
17
+ "secrets": ["HF_TOKEN"],
18
+ "arch": "amd64"
19
+ },
20
+ "schedule": "8 9 * * *",
21
+ "suspend": false,
22
+ "concurrency": false
23
+ }