update
Browse files

modular_graph_and_candidates.py  +34 -18

modular_graph_and_candidates.py  CHANGED
@@ -61,10 +61,10 @@ def _strip_source(code: str) -> str:
             if not re.match(r"\s*(from|import)\s+", ln))
 
 def _tokenise(code: str) -> Set[str]:
+    """Extract identifiers using regex - more robust than tokenizer for malformed code."""
     toks: Set[str] = set()
-    for …
-    …
-        toks.add(tok.string)
+    for match in re.finditer(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', code):
+        toks.add(match.group())
     return toks
 
 def build_token_bags(models_root: Path) -> Tuple[Dict[str, List[Set[str]]], Dict[str, int]]:
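
The new regex-based tokeniser can be exercised standalone. A minimal sketch (the function is copied from this diff; the sample input is made up):

    import re
    from typing import Set

    def _tokenise(code: str) -> Set[str]:
        """Extract identifiers using regex - more robust than tokenizer for malformed code."""
        toks: Set[str] = set()
        for match in re.finditer(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', code):
            toks.add(match.group())
        return toks

    # Unlike the stdlib tokenizer, this accepts malformed code (unbalanced paren):
    print(_tokenise("def forward(self, x):\n    return self.proj(x"))
    # {'def', 'forward', 'self', 'x', 'return', 'proj'}
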
@@ -124,9 +124,9 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     all_embeddings = []
 
     print("Encoding embeddings...")
-    batch_size = …
-    for i in tqdm(range(0, len(names), batch_size), desc="…
-        batch = [texts[…
+    batch_size = 1
+    for i in tqdm(range(0, len(names), batch_size), desc="Models", leave=False):
+        batch = [texts[names[i]]]
         emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
         all_embeddings.append(emb)
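
For context, a hedged sketch of how this one-model-per-step encoding loop feeds a cosine-similarity matrix. The checkpoint name, the stand-in `names`/`texts`, and the normalisation step are assumptions; only the loop body mirrors the diff:

    import numpy as np
    from sentence_transformers import SentenceTransformer
    from tqdm import tqdm

    names = ["llama", "mistral", "gemma"]                  # stand-in model names
    texts = {n: f"modeling source of {n}" for n in names}  # stand-in corpus
    model = SentenceTransformer("all-MiniLM-L6-v2")        # assumed checkpoint

    all_embeddings = []
    batch_size = 1
    for i in tqdm(range(0, len(names), batch_size), desc="Models", leave=False):
        batch = [texts[names[i]]]
        emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        all_embeddings.append(emb)

    embs = np.vstack(all_embeddings)                       # (n_models, dim)
    embs /= np.linalg.norm(embs, axis=1, keepdims=True)    # unit-normalise
    sims = embs @ embs.T                                   # pairwise cosine

Note that `batch = [texts[names[i]]]` encodes exactly one text per step, so the loop is only correct while batch_size stays 1.
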
@@ -194,6 +194,26 @@ def dependency_graph(modular_files: List[Path], models_root: Path) -> Dict[str,
 
 # modular_graph_and_candidates.py (top-level)
 
+def get_missing_models(models_root: Path, multimodal: bool = False) -> Tuple[List[str], Dict[str, List[Set[str]]], Dict[str, int]]:
+    """Get list of models missing modular implementations."""
+    bags, pix_hits = build_token_bags(models_root)
+    mod_files = modular_files(models_root)
+    models_with_modular = {p.parent.name for p in mod_files}
+    missing = [m for m in bags if m not in models_with_modular]
+
+    if multimodal:
+        missing = [m for m in missing if pix_hits[m] >= PIXEL_MIN_HITS]
+
+    return missing, bags, pix_hits
+
+def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str, List[Set[str]]],
+                         threshold: float, sim_method: str) -> Dict[Tuple[str, str], float]:
+    """Compute similarities between missing models using specified method."""
+    if sim_method == "jaccard":
+        return similarity_clusters({m: bags[m] for m in missing}, threshold)
+    else:
+        return embedding_similarity_clusters(models_root, missing, threshold)
+
 def build_graph_json(
     transformers_dir: Path,
     threshold: float = SIM_DEFAULT,
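
A hypothetical driver for the two extracted helpers (the checkout path, threshold, and flag values are placeholders; the signatures are the ones added above):

    from pathlib import Path

    models_root = Path("transformers/src/transformers/models")  # placeholder checkout
    missing, bags, pix_hits = get_missing_models(models_root, multimodal=True)
    sims = compute_similarities(models_root, missing, bags,
                                threshold=0.5, sim_method="jaccard")

    # sims maps (model_a, model_b) -> score, per the annotated return type
    for (a, b), score in sorted(sims.items(), key=lambda kv: -kv[1])[:5]:
        print(f"{a} <-> {b}: {score:.2f}")
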
@@ -202,20 +222,16 @@ def build_graph_json(
 ) -> dict:
     """Return the {nodes, links} dict that D3 needs."""
     models_root = transformers_dir / "src/transformers/models"
-    bags, pix_hits = build_token_bags(models_root)
-
+
+    # Get missing models and their data
+    missing, bags, pix_hits = get_missing_models(models_root, multimodal)
+
+    # Build dependency graph
     mod_files = modular_files(models_root)
     deps = dependency_graph(mod_files, models_root)
-    models_with_modular = {p.parent.name for p in mod_files}
-    missing = [m for m in bags if m not in models_with_modular]
-
-    if multimodal:
-        missing = [m for m in missing if pix_hits[m] >= PIXEL_MIN_HITS]
-
-    if sim_method == "jaccard":
-        sims = similarity_clusters({m: bags[m] for m in missing}, threshold)
-    else:
-        sims = embedding_similarity_clusters(models_root, missing, threshold)
+
+    # Compute similarities
+    sims = compute_similarities(models_root, missing, bags, threshold, sim_method)
 
     # ---- assemble nodes & links ----
     nodes: Set[str] = set()