File size: 2,385 Bytes
a12858e
6d106b8
 
 
 
 
a12858e
 
6d106b8
 
a12858e
6d106b8
a12858e
 
6d106b8
 
 
 
a12858e
 
6d106b8
 
a12858e
 
6d106b8
 
 
 
a12858e
6d106b8
 
 
 
 
a12858e
6d106b8
a12858e
 
6d106b8
 
 
 
 
a12858e
6d106b8
a12858e
6d106b8
 
a12858e
 
6d106b8
 
a12858e
 
 
 
 
6d106b8
 
a12858e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os, json, subprocess, tempfile, io
from pathlib import Path
from datetime import datetime, timezone
from huggingface_hub import HfApi

from modular_graph_and_candidates import (
    build_graph_json, generate_html,
    build_timeline_json, generate_timeline_html
)

# Git repository that main() shallow-clones and analyses; override via the
# REPO_URL environment variable.
REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")

# Hugging Face dataset repo that receives the generated artifacts.
CACHE_REPO = "Molbap/hf_cached_embeds_log"

# Similarity threshold forwarded to the graph/timeline builders. A value that
# is not parseable as a float raises ValueError at import time.
THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))

# Boolean flag read from the environment. Matching is case-insensitive and
# ignores surrounding whitespace so that "TRUE", "Yes", " 1 " etc. behave the
# same as the previously accepted spellings ("1", "true", "True", "YES", "yes").
MULTIMODAL = os.getenv("MULTIMODAL", "0").strip().lower() in {"1", "true", "yes"}

# Name of the similarity method forwarded to the graph/timeline builders.
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")

def main():
    """Build graph/timeline artifacts for the HEAD of ``REPO_URL`` and upload them.

    Steps:
      1. Shallow-clone ``REPO_URL`` into a temporary directory and read the
         HEAD commit sha.
      2. Build the graph and timeline JSON with the configured threshold,
         multimodal flag and similarity method, and render both to HTML.
      3. Upload the four artifacts to the ``CACHE_REPO`` dataset under a key
         derived from the sha and the settings, then upload ``latest.json``
         describing where they live.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` or ``git rev-parse``
            exits with a non-zero status.
    """
    # The checkout is only needed while the artifacts are being built. The
    # context manager deletes it afterwards (also when a step fails), whereas
    # a bare mkdtemp() would leave a full clone behind on every run.
    with tempfile.TemporaryDirectory() as tmp:
        repo_path = Path(tmp) / "repo"
        subprocess.check_call(["git", "clone", "--depth", "1", REPO_URL, str(repo_path)])
        sha = subprocess.check_output(
            ["git", "rev-parse", "HEAD"], cwd=repo_path, text=True
        ).strip()

        graph = build_graph_json(
            repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD
        )
        timeline = build_timeline_json(
            repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD
        )
        # Rendered inside the block in case the generators read from the
        # checkout — NOTE(review): not visible from this file, confirm.
        graph_html = generate_html(graph)
        timeline_html = generate_timeline_html(timeline)

    api = HfApi()
    api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)

    # One key per (commit, settings) combination so different configurations
    # of the same commit do not overwrite each other.
    key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"

    # Single source of truth for the repo-relative paths: the same values are
    # recorded in latest.json and used as upload destinations.
    paths = {
        "graph_json": f"graph/{key}.json",
        "graph_html": f"graph/{key}.html",
        "timeline_json": f"timeline/{key}.json",
        "timeline_html": f"timeline/{key}.html",
    }
    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {"sim_method": SIM_METHOD, "threshold": THRESH, "multimodal": MULTIMODAL},
        "paths": paths,
    }

    def put(path_in_repo: str, text: str) -> None:
        """Upload ``text`` (UTF-8 encoded) to ``path_in_repo`` in the cache dataset."""
        api.upload_file(
            path_or_fileobj=io.BytesIO(text.encode("utf-8")),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            repo_type="dataset",
            commit_message=f"cache {path_in_repo}",
        )

    def compact(obj) -> str:
        """Serialize ``obj`` as JSON without optional whitespace."""
        return json.dumps(obj, separators=(",", ":"))

    put(paths["graph_json"], compact(graph))
    put(paths["graph_html"], graph_html)
    put(paths["timeline_json"], compact(timeline))
    put(paths["timeline_html"], timeline_html)
    # Uploaded last so latest.json never references artifacts that have not
    # been uploaded yet.
    put("latest.json", compact(latest))

# Run the build-and-upload pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()