update
Browse files

modular_graph_and_candidates.py  +34 -18

modular_graph_and_candidates.py  CHANGED
@@ -61,10 +61,10 @@ def _strip_source(code: str) -> str:
             if not re.match(r"\s*(from|import)\s+", ln))
 
 def _tokenise(code: str) -> Set[str]:
+    """Extract identifiers using regex - more robust than tokenizer for malformed code."""
     toks: Set[str] = set()
-    for …
-    …
-        toks.add(tok.string)
+    for match in re.finditer(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', code):
+        toks.add(match.group())
     return toks
 
 def build_token_bags(models_root: Path) -> Tuple[Dict[str, List[Set[str]]], Dict[str, int]]:
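
The new regex-based tokeniser can be exercised standalone. A minimal sketch (the function is copied from this diff; the sample input is made up):

    import re
    from typing import Set

    def _tokenise(code: str) -> Set[str]:
        """Extract identifiers using regex - more robust than tokenizer for malformed code."""
        toks: Set[str] = set()
        for match in re.finditer(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', code):
            toks.add(match.group())
        return toks

    # Unlike the stdlib tokenizer, this accepts malformed code (unbalanced paren):
    print(_tokenise("def forward(self, x):\n    return self.proj(x"))
    # {'def', 'forward', 'self', 'x', 'return', 'proj'}
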
@@ -124,9 +124,9 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     all_embeddings = []
 
     print("Encoding embeddings...")
-    batch_size = …
-    for i in tqdm(range(0, len(names), batch_size), desc="…
-        batch = [texts[…
+    batch_size = 1
+    for i in tqdm(range(0, len(names), batch_size), desc="Models", leave=False):
+        batch = [texts[names[i]]]
         emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
         all_embeddings.append(emb)
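
For context, a hedged sketch of how this one-model-per-step encoding loop feeds a cosine-similarity matrix. The checkpoint name, the stand-in `names`/`texts`, and the normalisation step are assumptions; only the loop body mirrors the diff:

    import numpy as np
    from sentence_transformers import SentenceTransformer
    from tqdm import tqdm

    names = ["llama", "mistral", "gemma"]                  # stand-in model names
    texts = {n: f"modeling source of {n}" for n in names}  # stand-in corpus
    model = SentenceTransformer("all-MiniLM-L6-v2")        # assumed checkpoint

    all_embeddings = []
    batch_size = 1
    for i in tqdm(range(0, len(names), batch_size), desc="Models", leave=False):
        batch = [texts[names[i]]]
        emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        all_embeddings.append(emb)

    embs = np.vstack(all_embeddings)                       # (n_models, dim)
    embs /= np.linalg.norm(embs, axis=1, keepdims=True)    # unit-normalise
    sims = embs @ embs.T                                   # pairwise cosine

Note that `batch = [texts[names[i]]]` encodes exactly one text per step, so the loop is only correct while batch_size stays 1.
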
@@ -194,6 +194,26 @@ def dependency_graph(modular_files: List[Path], models_root: Path) -> Dict[str,
 
 # modular_graph_and_candidates.py (top-level)
 
+def get_missing_models(models_root: Path, multimodal: bool = False) -> Tuple[List[str], Dict[str, List[Set[str]]], Dict[str, int]]:
+    """Get list of models missing modular implementations."""
+    bags, pix_hits = build_token_bags(models_root)
+    mod_files = modular_files(models_root)
+    models_with_modular = {p.parent.name for p in mod_files}
+    missing = [m for m in bags if m not in models_with_modular]
+
+    if multimodal:
+        missing = [m for m in missing if pix_hits[m] >= PIXEL_MIN_HITS]
+
+    return missing, bags, pix_hits
+
+def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str, List[Set[str]]],
+                         threshold: float, sim_method: str) -> Dict[Tuple[str, str], float]:
+    """Compute similarities between missing models using specified method."""
+    if sim_method == "jaccard":
+        return similarity_clusters({m: bags[m] for m in missing}, threshold)
+    else:
+        return embedding_similarity_clusters(models_root, missing, threshold)
+
 def build_graph_json(
     transformers_dir: Path,
     threshold: float = SIM_DEFAULT,
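
A hypothetical driver for the two extracted helpers (the checkout path, threshold, and flag values are placeholders; the signatures are the ones added above):

    from pathlib import Path

    models_root = Path("transformers/src/transformers/models")  # placeholder checkout
    missing, bags, pix_hits = get_missing_models(models_root, multimodal=True)
    sims = compute_similarities(models_root, missing, bags,
                                threshold=0.5, sim_method="jaccard")

    # sims maps (model_a, model_b) -> score, per the annotated return type
    for (a, b), score in sorted(sims.items(), key=lambda kv: -kv[1])[:5]:
        print(f"{a} <-> {b}: {score:.2f}")
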
@@ -202,20 +222,16 @@ def build_graph_json(
 ) -> dict:
     """Return the {nodes, links} dict that D3 needs."""
     models_root = transformers_dir / "src/transformers/models"
-    bags, pix_hits = build_token_bags(models_root)
-
+
+    # Get missing models and their data
+    missing, bags, pix_hits = get_missing_models(models_root, multimodal)
+
+    # Build dependency graph
     mod_files = modular_files(models_root)
     deps = dependency_graph(mod_files, models_root)
-    models_with_modular = {p.parent.name for p in mod_files}
-    missing = [m for m in bags if m not in models_with_modular]
-
-    if multimodal:
-        missing = [m for m in missing if pix_hits[m] >= PIXEL_MIN_HITS]
-
-    if sim_method == "jaccard":
-        sims = similarity_clusters({m: bags[m] for m in missing}, threshold)
-    else:
-        sims = embedding_similarity_clusters(models_root, missing, threshold)
+
+    # Compute similarities
+    sims = compute_similarities(models_root, missing, bags, threshold, sim_method)
 
     # ---- assemble nodes & links ----
     nodes: Set[str] = set()