# Molbap's picture
# Molbap HF Staff
# caching
# a12858e
# app.py – Gradio Space wrapper for modular_graph_and_candidates
from __future__ import annotations

import hashlib
import html
import json
import logging
import os
import shutil
import subprocess
import tempfile
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from pathlib import Path

import gradio as gr
from huggingface_hub import hf_hub_download

# —— refactored helpers ——
from modular_graph_and_candidates import (
    build_graph_json,
    build_timeline_json,
    generate_html,
    generate_timeline_html,
)
def _escape_srcdoc(text: str) -> str:
"""Escape for inclusion inside an <iframe srcdoc="…"> attribute."""
return (
text.replace("&", "&amp;")
.replace("\"", "&quot;")
.replace("'", "&#x27;")
.replace("<", "&lt;")
.replace(">", "&gt;")
)
def _fetch_from_cache_repo(kind: str, sim_method: str, threshold: float, multimodal: bool):
    """Try to load a pre-rendered visualization from the cache dataset repo.

    Reads ``latest.json`` from the ``Molbap/hf_cached_embeds_log`` dataset,
    takes its ``"sha"`` entry, and downloads
    ``{kind}/{sha}/{sim_method}-{threshold:.2f}-m{0|1}.html`` and the matching
    ``.json`` file.

    Args:
        kind: Sub-folder of the cache repo; ``"timeline"`` additionally selects
            the ``_timeline.json`` suffix for the temporary download file.
        sim_method: Similarity metric name, used verbatim in the cache key.
        threshold: Similarity threshold, formatted with two decimals in the key.
        multimodal: Multimodal flag, encoded as ``m0`` / ``m1`` in the key.

    Returns:
        ``(iframe_html, json_path)`` on a cache hit, where ``json_path`` is a
        temporary file holding the cached JSON; ``None`` on any failure
        (missing entry, network error, malformed ``latest.json``, …).
    """
    repo_id = "Molbap/hf_cached_embeds_log"
    try:
        latest_fp = hf_hub_download(repo_id=repo_id, filename="latest.json", repo_type="dataset")
        info = json.loads(Path(latest_fp).read_text(encoding="utf-8"))
        sha = info.get("sha")
        if not sha:
            # Without a sha the key would be "None/..." – treat it as a miss
            # instead of issuing two downloads for a bogus path.
            return None
        key = f"{sha}/{sim_method}-{threshold:.2f}-m{int(multimodal)}"
        html_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.html", repo_type="dataset")
        json_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.json", repo_type="dataset")
        raw_html = Path(html_fp).read_text(encoding="utf-8")
        json_text = Path(json_fp).read_text(encoding="utf-8")
        iframe_html = f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
        suffix = "_timeline.json" if kind == "timeline" else ".json"
        # mkstemp() returns an *open* OS-level descriptor; wrap it so it is
        # closed after writing instead of leaking one descriptor per hit.
        fd, tmp_path = tempfile.mkstemp(suffix=suffix)
        with os.fdopen(fd, "w", encoding="utf-8") as fh:
            fh.write(json_text)
        return iframe_html, tmp_path
    except Exception:
        # Deliberate best-effort: any failure is a cache miss and the caller
        # falls back to computing the result. Keep a trace for debugging.
        logging.getLogger(__name__).info(
            "Cache lookup failed for kind=%s; falling back to recomputation", kind, exc_info=True
        )
        return None
HF_MAIN_REPO = "https://github.com/huggingface/transformers"
# ───────────────────────────── cache repo once per 24 h ───────────────────────────
def clone_or_cache(repo_url: str) -> Path:
    """Shallow-clone *repo_url* into the temp directory and reuse it for 24 h.

    The clone lives in ``<tempdir>/repo_<digest>`` where ``<digest>`` is
    derived from the URL; a ``.cloned_at`` file inside it records the UTC time
    of the clone. A clone younger than one day is returned as-is, otherwise
    the directory is removed and cloned again.

    Args:
        repo_url: URL handed to ``git clone``.

    Returns:
        Path of the directory containing the checkout.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a non-zero
            status.
        OSError: If ``git`` cannot be executed or the stamp cannot be written.
    """
    # No in-process memoisation here on purpose: memoising the returned path
    # would pin it for the lifetime of the process, so the 24 h refresh below
    # would never run in a long-lived server. The stamp check is cheap.
    #
    # The built-in hash() of a str is salted per interpreter process, which
    # would give a different directory name after every restart; a content
    # digest keeps the on-disk cache reusable across restarts.
    digest = hashlib.sha256(repo_url.encode("utf-8")).hexdigest()[:16]
    cache_dir = Path(tempfile.gettempdir()) / f"repo_{digest}"
    stamp = cache_dir / ".cloned_at"

    if stamp.is_file():
        try:
            cloned_at = datetime.fromisoformat(stamp.read_text(encoding="utf-8").strip())
        except (OSError, ValueError):
            cloned_at = None  # unreadable or corrupt stamp → reclone
        if cloned_at is not None:
            if cloned_at.tzinfo is None:
                # Stamps written without an offset are interpreted as UTC.
                cloned_at = cloned_at.replace(tzinfo=timezone.utc)
            if datetime.now(timezone.utc) - cloned_at < timedelta(days=1):
                return cache_dir

    shutil.rmtree(cache_dir, ignore_errors=True)
    # "--" ends option parsing so a URL starting with "-" cannot be
    # interpreted by git as a command-line option.
    subprocess.check_call(["git", "clone", "--depth", "1", "--", repo_url, str(cache_dir)])
    stamp.write_text(datetime.now(timezone.utc).isoformat(), encoding="utf-8")
    return cache_dir
# ───────────────────────────── main callback ─────────────────────────────────────
def run_graph(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
    """Generate the dependency graph visualization.

    First asks the cache repo for a pre-rendered result; on a miss, clones (or
    reuses) *repo_url*, builds the graph with ``build_graph_json`` and renders
    it with ``generate_html``.

    Args:
        repo_url: Repository URL to clone when the cache has no entry.
        threshold: Similarity threshold forwarded to ``build_graph_json``.
        multimodal: Multimodal flag forwarded to ``build_graph_json``.
        sim_method: Similarity metric name forwarded to ``build_graph_json``.

    Returns:
        ``(iframe_html, json_path)``: an ``<iframe>`` snippet embedding the
        rendered page via ``srcdoc``, and the path of a temporary ``.json``
        file containing the graph data.
    """
    # NOTE(review): the cache lookup does not take repo_url into account, so a
    # fork URL may be served the cached result – confirm this is intended.
    hit = _fetch_from_cache_repo("graph", sim_method, threshold, multimodal)
    if hit:
        return hit

    repo_path = clone_or_cache(repo_url)
    graph = build_graph_json(
        transformers_dir=repo_path,
        threshold=threshold,
        multimodal=multimodal,
        sim_method=sim_method,
    )
    raw_html = generate_html(graph)
    iframe_html = (
        f'<iframe style="width:100%;height:85vh;border:none;" '
        f'srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
    )

    # Serialize before creating the file so a failure leaves nothing behind.
    payload = json.dumps(graph)
    # tempfile.mktemp() only returns a name (deprecated, race-prone);
    # NamedTemporaryFile creates the file atomically. delete=False keeps it on
    # disk so the path can be returned for download.
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False, encoding="utf-8") as fh:
        fh.write(payload)
    return iframe_html, fh.name
def run_timeline(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
    """Generate the chronological timeline visualization.

    First asks the cache repo for a pre-rendered result; on a miss, clones (or
    reuses) *repo_url*, builds the timeline with ``build_timeline_json`` and
    renders it with ``generate_timeline_html``.

    Args:
        repo_url: Repository URL to clone when the cache has no entry.
        threshold: Similarity threshold forwarded to ``build_timeline_json``.
        multimodal: Multimodal flag forwarded to ``build_timeline_json``.
        sim_method: Similarity metric name forwarded to ``build_timeline_json``.

    Returns:
        ``(iframe_html, json_path)``: an ``<iframe>`` snippet embedding the
        rendered page via ``srcdoc``, and the path of a temporary
        ``*_timeline.json`` file containing the timeline data.
    """
    # NOTE(review): the cache lookup does not take repo_url into account, so a
    # fork URL may be served the cached result – confirm this is intended.
    hit = _fetch_from_cache_repo("timeline", sim_method, threshold, multimodal)
    if hit:
        return hit

    repo_path = clone_or_cache(repo_url)
    timeline = build_timeline_json(
        transformers_dir=repo_path,
        threshold=threshold,
        multimodal=multimodal,
        sim_method=sim_method,
    )
    raw_html = generate_timeline_html(timeline)
    iframe_html = (
        f'<iframe style="width:100%;height:85vh;border:none;" '
        f'srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
    )

    # Serialize before creating the file so a failure leaves nothing behind.
    payload = json.dumps(timeline)
    # tempfile.mktemp() only returns a name (deprecated, race-prone);
    # NamedTemporaryFile creates the file atomically. delete=False keeps it on
    # disk so the path can be returned for download.
    with tempfile.NamedTemporaryFile("w", suffix="_timeline.json", delete=False, encoding="utf-8") as fh:
        fh.write(payload)
    return iframe_html, fh.name
# ───────────────────────────── UI ────────────────────────────────────────────────
# Force the embedded iframes of both tabs to fill the available space.
CUSTOM_CSS = """
#graph_html iframe, #timeline_html iframe {height:85vh !important; width:100% !important; border:none;}
"""

# Two tabs with parallel controls: each collects a repo URL, a similarity
# threshold, a multimodal flag and a similarity metric, then shows the
# rendered page plus a downloadable JSON file.
# NOTE(review): user-facing labels below had mis-decoded characters; they are
# restored to "≥" and the 🔍 / 🤗 emoji – confirm the 🔍 against the original.
with gr.Blocks(css=CUSTOM_CSS) as demo:
    gr.Markdown("## 🔍 Modular‑candidate explorer for 🤗 Transformers")

    with gr.Tabs():
        with gr.Tab("Dependency Graph"):
            with gr.Row():
                repo_in = gr.Text(value=HF_MAIN_REPO, label="Repo / fork URL")
                thresh = gr.Slider(0.50, 0.95, value=0.5, step=0.01, label="Similarity ≥")
                multi_cb = gr.Checkbox(label="Only multimodal models")
                sim_radio = gr.Radio(["jaccard", "embedding"], value="jaccard", label="Similarity metric")
                go_btn = gr.Button("Build graph")

            graph_html_out = gr.HTML(elem_id="graph_html", show_label=False)
            graph_json_out = gr.File(label="Download graph.json")

            # run_graph returns (iframe_html, json_path), matching the outputs.
            go_btn.click(
                run_graph,
                [repo_in, thresh, multi_cb, sim_radio],
                [graph_html_out, graph_json_out],
            )

        with gr.Tab("Chronological Timeline"):
            with gr.Row():
                timeline_repo_in = gr.Text(value=HF_MAIN_REPO, label="Repo / fork URL")
                timeline_thresh = gr.Slider(0.50, 0.95, value=0.5, step=0.01, label="Similarity ≥")
                timeline_multi_cb = gr.Checkbox(label="Only multimodal models")
                timeline_sim_radio = gr.Radio(["jaccard", "embedding"], value="jaccard", label="Similarity metric")
                timeline_btn = gr.Button("Build timeline")

            timeline_html_out = gr.HTML(elem_id="timeline_html", show_label=False)
            timeline_json_out = gr.File(label="Download timeline.json")

            # run_timeline returns (iframe_html, json_path), matching the outputs.
            timeline_btn.click(
                run_timeline,
                [timeline_repo_in, timeline_thresh, timeline_multi_cb, timeline_sim_radio],
                [timeline_html_out, timeline_json_out],
            )

if __name__ == "__main__":
    demo.launch(allowed_paths=["static"])