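# NOTE: the lines below are page-header residue from the web file viewer this
# script was copied from; they are kept as comments so the module parses.
# Molbap's picture | Molbap | HF Staff | caching | a12858e | raw | history blame | 2.39 kB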
import os, json, subprocess, tempfile, io
from pathlib import Path
from datetime import datetime, timezone
from huggingface_hub import HfApi
from modular_graph_and_candidates import (
build_graph_json, generate_html,
build_timeline_json, generate_timeline_html
)
def env_flag(name: str, default: str = "0") -> bool:
    """Return True when environment variable *name* holds a truthy value.

    "1", "true" and "yes" are treated as truthy, in any letter case and
    ignoring surrounding whitespace. Every other value, including an unset
    variable with the default of "0", yields False.
    """
    return os.getenv(name, default).strip().lower() in {"1", "true", "yes"}


# Git repository that main() clones; overridable through the environment.
REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
# Hub dataset repository that receives the generated artifacts.
CACHE_REPO = "Molbap/hf_cached_embeds_log"
# Passed as `threshold=` to the graph/timeline builders.
THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))
# Passed as `multimodal=` to the builders. Parsed case-insensitively so that
# "TRUE" or "Yes" behave the same as the previously accepted "True" / "YES".
MULTIMODAL = env_flag("MULTIMODAL")
# Passed as `sim_method=` to the builders.
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
def main():
    """Build the graph and timeline artifacts for REPO_URL and upload them.

    Steps:
      1. Shallow-clone REPO_URL into a temporary directory and record the
         HEAD commit sha.
      2. Build the graph and timeline JSON payloads and their HTML renderings
         using THRESH, MULTIMODAL and SIM_METHOD.
      3. Upload the four artifacts to the CACHE_REPO dataset repository under
         a key derived from the sha and the settings, then upload
         ``latest.json`` describing where they live.

    Raises:
        subprocess.CalledProcessError: if a git command exits non-zero.
        Any exception raised by the builders or by the Hub client propagates
        unchanged.
    """
    # TemporaryDirectory removes the clone when the block exits, including on
    # failure; a bare mkdtemp() would leave the checkout behind on every run.
    with tempfile.TemporaryDirectory() as tmp_dir:
        repo_path = Path(tmp_dir) / "repo"
        subprocess.check_call(
            ["git", "clone", "--depth", "1", REPO_URL, str(repo_path)]
        )
        sha = subprocess.check_output(
            ["git", "rev-parse", "HEAD"], cwd=repo_path, text=True
        ).strip()

        graph = build_graph_json(
            repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD
        )
        timeline = build_timeline_json(
            repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD
        )
        # Rendered while the checkout still exists, as in the original flow.
        graph_html = generate_html(graph)
        timeline_html = generate_timeline_html(timeline)

    api = HfApi()
    api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)

    # One key per (commit, settings) combination so runs with different
    # settings do not overwrite each other.
    key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"
    # Single source of truth for artifact locations: used both for the uploads
    # and for the pointers recorded in latest.json.
    paths = {
        "graph_json": f"graph/{key}.json",
        "graph_html": f"graph/{key}.html",
        "timeline_json": f"timeline/{key}.json",
        "timeline_html": f"timeline/{key}.html",
    }
    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {
            "sim_method": SIM_METHOD,
            "threshold": THRESH,
            "multimodal": MULTIMODAL,
        },
        "paths": paths,
    }

    def put(path_in_repo: str, text: str) -> None:
        """Upload *text*, UTF-8 encoded, to *path_in_repo* in CACHE_REPO."""
        api.upload_file(
            path_or_fileobj=io.BytesIO(text.encode("utf-8")),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            repo_type="dataset",
            commit_message=f"cache {path_in_repo}",
        )

    put(paths["graph_json"], json.dumps(graph, separators=(",", ":")))
    put(paths["graph_html"], graph_html)
    put(paths["timeline_json"], json.dumps(timeline, separators=(",", ":")))
    put(paths["timeline_html"], timeline_html)
    # Uploaded last: by the time latest.json appears, every file it points to
    # has already been uploaded.
    put("latest.json", json.dumps(latest, separators=(",", ":")))
# Script entry point: run the pipeline only on direct execution, not on import.
if __name__ == "__main__":
    main()