Molbap (HF Staff) committed
Commit 6d106b8 · 1 Parent(s): 49600c8

setup cache
Files changed (3)
  1. app.py +44 -11
  2. build_cache.py +107 -0
  3. modular_graph_and_candidates.py +2 -33
app.py CHANGED
@@ -9,12 +9,47 @@ import tempfile
 from datetime import datetime, timedelta
 from functools import lru_cache
 from pathlib import Path
-
+import os, json, tempfile
+from pathlib import Path
+from huggingface_hub import hf_hub_download
 import gradio as gr
 
 # —— refactored helpers ——
 from modular_graph_and_candidates import build_graph_json, generate_html, build_timeline_json, generate_timeline_html
 
+def _escape_srcdoc(text: str) -> str:
+    """Escape for inclusion inside an <iframe srcdoc="…"> attribute."""
+    return (
+        text.replace("&", "&amp;")
+        .replace("\"", "&quot;")
+        .replace("'", "&#x27;")
+        .replace("<", "&lt;")
+        .replace(">", "&gt;")
+    )
+
+def _fetch_from_cache_repo(kind: str, sim_method: str, threshold: float, multimodal: bool):
+    """Fetch cached data from Molbap/hf_cached_embeds_log repo."""
+
+    repo_id = "Molbap/hf_cached_embeds_log"
+    try:
+        latest_fp = hf_hub_download(repo_id=repo_id, filename="latest.json")
+        info = json.loads(Path(latest_fp).read_text(encoding="utf-8"))
+        sha = info.get("sha")
+        key = f"{sha}/{sim_method}-{threshold:.2f}-m{int(multimodal)}"
+
+        html_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.html")
+        json_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.json")
+
+        raw_html = Path(html_fp).read_text(encoding="utf-8")
+        json_text = Path(json_fp).read_text(encoding="utf-8")
+
+        iframe_html = f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
+        tmp = Path(tempfile.mkstemp(suffix=("_timeline.json" if kind == "timeline" else ".json"))[1])
+        tmp.write_text(json_text, encoding="utf-8")
+        return iframe_html, str(tmp)
+    except Exception:
+        return None
+
 HF_MAIN_REPO = "https://github.com/huggingface/transformers"
 
 # ───────────────────────────── cache repo once per 24 h ───────────────────────────
@@ -40,19 +75,13 @@ def clone_or_cache(repo_url: str) -> Path:
 
 # ───────────────────────────── main callback ─────────────────────────────────────
 
-def _escape_srcdoc(text: str) -> str:
-    """Escape for inclusion inside an <iframe srcdoc="…"> attribute."""
-    return (
-        text.replace("&", "&amp;")
-        .replace("\"", "&quot;")
-        .replace("'", "&#x27;")
-        .replace("<", "&lt;")
-        .replace(">", "&gt;")
-    )
-
 
 def run_graph(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
     """Generate the dependency graph visualization."""
+    hit = _fetch_from_cache_repo("graph", sim_method, threshold, multimodal)
+    if hit:
+        return hit
+
     repo_path = clone_or_cache(repo_url)
 
     graph = build_graph_json(
@@ -75,6 +104,10 @@ def run_graph(repo_url: str, threshold: float, multimodal: bool, sim_method: str
 
 def run_timeline(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
     """Generate the chronological timeline visualization."""
+    hit = _fetch_from_cache_repo("timeline", sim_method, threshold, multimodal)
+    if hit:
+        return hit
+
     repo_path = clone_or_cache(repo_url)
 
     timeline = build_timeline_json(
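Note on the cache key: the reader above (_fetch_from_cache_repo) and the writer below (build_cache.py) must agree on how artifacts are addressed inside Molbap/hf_cached_embeds_log; the key combines the cached commit SHA from latest.json with the run configuration. A minimal worked example, using a hypothetical SHA purely for illustration:

    # Hypothetical SHA, only to show the key scheme used by both sides of the cache.
    sha = "abc1234def"                     # read from latest.json in the cache repo
    sim_method, threshold, multimodal = "jaccard", 0.50, False
    key = f"{sha}/{sim_method}-{threshold:.2f}-m{int(multimodal)}"
    # key == "abc1234def/jaccard-0.50-m0"
    # fetched files: graph/<key>.html, graph/<key>.json,
    #                timeline/<key>.html, timeline/<key>.json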
build_cache.py ADDED
@@ -0,0 +1,107 @@
+import os
+import json
+import subprocess
+import tempfile
+from pathlib import Path
+from datetime import datetime, timezone
+from huggingface_hub import HfApi
+
+from modular_graph_and_candidates import (
+    build_graph_json,
+    generate_html,
+    build_timeline_json,
+    generate_timeline_html
+)
+
+REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
+CACHE_REPO = "Molbap/hf_cached_embeds_log"
+THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))
+MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1", "true", "True", "YES", "yes"}
+SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
+
+def main():
+    print(f"Building cache for {REPO_URL}")
+    print(f"Config: threshold={THRESH}, multimodal={MULTIMODAL}, method={SIM_METHOD}")
+
+    tmp = Path(tempfile.mkdtemp())
+    print(f"Working in {tmp}")
+
+    print("Cloning repository...")
+    subprocess.check_call([
+        "git", "clone", "--depth", "1", REPO_URL, str(tmp / "repo")
+    ])
+
+    sha = subprocess.check_output([
+        "git", "rev-parse", "HEAD"
+    ], cwd=tmp / "repo", text=True).strip()
+
+    print(f"Repository SHA: {sha}")
+
+    repo_path = tmp / "repo"
+
+    print("Building graph...")
+    graph = build_graph_json(
+        transformers_dir=repo_path,
+        threshold=THRESH,
+        multimodal=MULTIMODAL,
+        sim_method=SIM_METHOD,
+    )
+
+    print("Building timeline...")
+    timeline = build_timeline_json(
+        transformers_dir=repo_path,
+        threshold=THRESH,
+        multimodal=MULTIMODAL,
+        sim_method=SIM_METHOD,
+    )
+
+    print("Generating HTML...")
+    graph_html = generate_html(graph)
+    timeline_html = generate_timeline_html(timeline)
+
+    print(f"Uploading to {CACHE_REPO}...")
+
+    api = HfApi()
+
+    key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"
+
+    latest = {
+        "sha": sha,
+        "updated_utc": datetime.now(timezone.utc).isoformat(),
+        "defaults": {
+            "sim_method": SIM_METHOD,
+            "threshold": THRESH,
+            "multimodal": MULTIMODAL
+        },
+        "paths": {
+            "graph_json": f"graph/{key}.json",
+            "graph_html": f"graph/{key}.html",
+            "timeline_json": f"timeline/{key}.json",
+            "timeline_html": f"timeline/{key}.html",
+        },
+    }
+
+    files_to_upload = [
+        (f"graph/{key}.json", json.dumps(graph, separators=(',', ':'))),
+        (f"graph/{key}.html", graph_html),
+        (f"timeline/{key}.json", json.dumps(timeline, separators=(',', ':'))),
+        (f"timeline/{key}.html", timeline_html),
+        ("latest.json", json.dumps(latest, separators=(',', ':'))),
+    ]
+
+    for path_in_repo, content in files_to_upload:
+        temp_file = tmp / "upload_temp"
+        temp_file.write_text(content, encoding="utf-8")
+
+        api.upload_file(
+            path_or_fileobj=str(temp_file),
+            path_in_repo=path_in_repo,
+            repo_id=CACHE_REPO,
+            commit_message=f"Cache update {sha[:7]} - {SIM_METHOD} t={THRESH} m={int(MULTIMODAL)}"
+        )
+        print(f"Uploaded {path_in_repo}")
+
+    print(f"Successfully uploaded cache for {key}")
+
+if __name__ == "__main__":
+    main()
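Design note: the upload loop in build_cache.py rewrites one temporary file and calls api.upload_file once per artifact, so each run produces five separate commits on the cache repo. If a single atomic commit is preferred, huggingface_hub also offers a batched commit API; the following is only a sketch under that assumption, reusing the files_to_upload list, api instance, and CACHE_REPO constant defined above:

    # Sketch only: push all cached artifacts in one commit instead of five.
    # Assumes huggingface_hub exposes CommitOperationAdd and HfApi.create_commit.
    from huggingface_hub import HfApi, CommitOperationAdd

    def upload_in_one_commit(api: HfApi, files_to_upload, cache_repo: str, message: str) -> None:
        ops = [
            CommitOperationAdd(path_in_repo=path, path_or_fileobj=content.encode("utf-8"))
            for path, content in files_to_upload
        ]
        api.create_commit(repo_id=cache_repo, operations=ops, commit_message=message)

    # Usage with the names from build_cache.py:
    # upload_in_one_commit(api, files_to_upload, CACHE_REPO, f"Cache update {sha[:7]}")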
modular_graph_and_candidates.py CHANGED
@@ -685,7 +685,6 @@ function updateVisibility() {
 }
 document.getElementById('toggleRed').addEventListener('change', updateVisibility);
 
-const HF_LOGO_URI = "./static/hf-logo.png";
 const graph = __GRAPH_DATA__;
 const W = innerWidth, H = innerHeight;
 const svg = d3.select('#dependency').call(d3.zoom().on('zoom', e => g.attr('transform', e.transform)));
@@ -709,23 +708,7 @@ const node = g.selectAll('g.node')
   .call(d3.drag().on('start', dragStart).on('drag', dragged).on('end', dragEnd));
 
 const baseSel = node.filter(d => d.cls === 'base');
-if (HF_LOGO_URI){
-  baseSel.append('image')
-    .attr('href', HF_LOGO_URI)
-    .attr('width', 40)
-    .attr('height', 40)
-    .attr('x', -20)
-    .attr('y', -20)
-    .on('error', function() {
-      console.log('Image failed to load:', HF_LOGO_URI);
-      // Fallback to circle
-      d3.select(this.parentNode).append('circle')
-        .attr('r', 22).attr('fill', '#ffbe0b');
-    });
-  console.log('Loading logo from:', HF_LOGO_URI);
-}else{
-  baseSel.append('circle').attr('r', d => 22*d.sz).attr('fill', '#ffbe0b');
-}
+baseSel.append('circle').attr('r', d => 22*d.sz).attr('fill', '#ffbe0b');
 node.filter(d => d.cls !== 'base').append('circle').attr('r', d => 20*d.sz);
 
 node.append('text')
@@ -922,7 +905,6 @@ function updateVisibility() {
 }
 document.getElementById('toggleRed').addEventListener('change', updateVisibility);
 
-const HF_LOGO_URI = "./static/hf-logo.png";
 const timeline = __TIMELINE_DATA__;
 const W = innerWidth, H = innerHeight;
 
@@ -1003,20 +985,7 @@ const node = g.selectAll('g.node')
   .call(d3.drag().on('start', dragStart).on('drag', dragged).on('end', dragEnd));
 
 const baseSel = node.filter(d => d.cls === 'base');
-if (HF_LOGO_URI) {
-  baseSel.append('image')
-    .attr('href', HF_LOGO_URI)
-    .attr('width', 35)
-    .attr('height', 35)
-    .attr('x', -17.5)
-    .attr('y', -17.5)
-    .on('error', function() {
-      d3.select(this.parentNode).append('circle')
-        .attr('r', 20).attr('fill', '#ffbe0b');
-    });
-} else {
-  baseSel.append('circle').attr('r', 20).attr('fill', '#ffbe0b');
-}
+baseSel.append('circle').attr('r', 20).attr('fill', '#ffbe0b');
 node.filter(d => d.cls !== 'base').append('circle').attr('r', 18);
 
 node.append('text')