|
|
|
import os |
|
import io |
|
import json |
|
import subprocess |
|
import tempfile |
|
from pathlib import Path |
|
from datetime import datetime, timezone |
|
from huggingface_hub import HfApi |
|
from modular_graph_and_candidates import ( |
|
build_graph_json, generate_html, |
|
build_timeline_json, generate_timeline_html |
|
) |
|
|
|
# Runtime configuration. Every value except CACHE_REPO can be overridden
# through the environment variable named in the corresponding getenv call.

# Git URL of the repository that gets cloned and analysed.
REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")

# Hub repository id that receives the generated artefacts.
CACHE_REPO = "Molbap/hf_cached_embeds_log"

# Similarity threshold; a non-numeric SIM_THRESHOLD raises ValueError at import.
THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))

# Boolean flag, parsed case-insensitively and ignoring surrounding whitespace so
# that "TRUE", "Yes" or " 1 " are not silently treated as False.
MULTIMODAL = os.getenv("MULTIMODAL", "0").strip().lower() in {"1", "true", "yes"}

# Name of the similarity method handed to the graph/timeline builders.
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
|
|
|
def main():
    """Build the graph and timeline artefacts for the current HEAD and upload them.

    Shallow-clones ``REPO_URL`` into a temporary directory, builds the graph and
    timeline JSON together with their HTML renderings, then uploads the four
    artefacts plus a ``latest.json`` pointer file to the ``CACHE_REPO`` dataset
    repository. Artefact paths are keyed by commit SHA, similarity method,
    threshold and the multimodal flag.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` or ``git rev-parse`` fails.
    """
    # The context manager deletes the clone on exit, including when a build
    # step raises; a bare mkdtemp() would leave the checkout behind on every run.
    with tempfile.TemporaryDirectory() as tmp:
        repo_path = Path(tmp) / "repo"
        subprocess.check_call(["git", "clone", "--depth", "1", REPO_URL, str(repo_path)])
        sha = subprocess.check_output(
            ["git", "rev-parse", "HEAD"], cwd=repo_path, text=True
        ).strip()

        # Everything that reads the checkout happens before the directory is removed.
        graph = build_graph_json(
            repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD
        )
        timeline = build_timeline_json(
            repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD
        )
        graph_html = generate_html(graph)
        timeline_html = generate_timeline_html(timeline)

    api = HfApi()
    api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)

    key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"
    # Single source of truth for the artefact locations: the same dict is
    # written into latest.json and used for the uploads below.
    paths = {
        "graph_json": f"graph/{key}.json",
        "graph_html": f"graph/{key}.html",
        "timeline_json": f"timeline/{key}.json",
        "timeline_html": f"timeline/{key}.html",
    }
    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {"sim_method": SIM_METHOD, "threshold": THRESH, "multimodal": MULTIMODAL},
        "paths": paths,
    }

    def put(path_in_repo: str, text: str):
        """Upload ``text`` (UTF-8 encoded) to ``path_in_repo`` in the cache repo."""
        api.upload_file(
            path_or_fileobj=io.BytesIO(text.encode("utf-8")),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            repo_type="dataset",
            commit_message=f"cache {path_in_repo}",
        )

    put(paths["graph_json"], json.dumps(graph, separators=(",", ":")))
    put(paths["graph_html"], graph_html)
    put(paths["timeline_json"], json.dumps(timeline, separators=(",", ":")))
    put(paths["timeline_html"], timeline_html)
    # Kept last, presumably so the pointer is only updated once the artefacts
    # it references have been uploaded.
    put("latest.json", json.dumps(latest, separators=(",", ":")))
|
|
|
# Script entry point: run the build-and-upload job only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|