Sarthak committed
Commit 1bc7e54 · 1 Parent(s): 454e47c

chore: remove unused scripts and update dependencies

This commit removes the MTEB evaluation script, the distillation script, and the evaluation script, as they are no longer needed. It also updates pyproject.toml to drop the dependencies used only by those scripts and adds typing-extensions to the dependencies.

Files changed (34)
  1. .codemap.yml +294 -0
  2. MTEB_evaluate.py +0 -343
  3. REPORT.md +299 -0
  4. Taskfile.yml +23 -0
  5. analysis_charts/batch_size_scaling.png +3 -0
  6. analysis_charts/benchmark_performance.png +3 -0
  7. analysis_charts/code_performance_radar.png +3 -0
  8. analysis_charts/comparative_radar.png +3 -0
  9. analysis_charts/efficiency_analysis.png +3 -0
  10. analysis_charts/language_heatmap.png +3 -0
  11. analysis_charts/memory_scaling.png +3 -0
  12. analysis_charts/model_comparison.png +3 -0
  13. analysis_charts/model_specifications.png +3 -0
  14. analysis_charts/peer_comparison.png +3 -0
  15. analysis_charts/radar_code_model2vec_Linq_Embed_Mistral.png +3 -0
  16. analysis_charts/radar_code_model2vec_Qodo_Embed_1_15B.png +3 -0
  17. analysis_charts/radar_code_model2vec_Reason_ModernColBERT.png +3 -0
  18. analysis_charts/radar_code_model2vec_all_MiniLM_L6_v2.png +3 -0
  19. analysis_charts/radar_code_model2vec_all_mpnet_base_v2.png +3 -0
  20. analysis_charts/radar_code_model2vec_bge_m3.png +3 -0
  21. analysis_charts/radar_code_model2vec_codebert_base.png +3 -0
  22. analysis_charts/radar_code_model2vec_graphcodebert_base.png +3 -0
  23. analysis_charts/radar_code_model2vec_gte_Qwen2_15B_instruct.png +3 -0
  24. analysis_charts/radar_code_model2vec_gte_Qwen2_7B_instruct.png +3 -0
  25. analysis_charts/radar_code_model2vec_jina_embeddings_v2_base_code.png +3 -0
  26. analysis_charts/radar_code_model2vec_jina_embeddings_v3.png +3 -0
  27. analysis_charts/radar_code_model2vec_nomic_embed_text_v2_moe.png +3 -0
  28. analysis_charts/radar_code_model2vec_paraphrase_MiniLM_L6_v2.png +3 -0
  29. distill.py +0 -116
  30. evaluate.py +0 -422
  31. pyproject.toml +37 -5
  32. src/distiller/distill.py +419 -159
  33. src/distiller/evaluate.py +371 -43
  34. train_code_classification.py +0 -365
.codemap.yml ADDED
@@ -0,0 +1,294 @@
1
+ # CodeMap Configuration File
2
+ # -------------------------
3
+ # This file configures CodeMap's behavior. Uncomment and modify settings as needed.
4
+
5
+ # LLM Configuration - Controls which model is used for AI operations
6
+ llm:
7
+ # Format: "provider:model-name", e.g., "openai:gpt-4o", "anthropic:claude-3-opus"
8
+ model: "google-gla:gemini-2.0-flash"
9
+ temperature: 0.5 # Lower for more deterministic outputs, higher for creativity
10
+ max_input_tokens: 1000000 # Maximum tokens in input
11
+ max_output_tokens: 10000 # Maximum tokens in responses
12
+ max_requests: 25 # Maximum number of requests
13
+
14
+ # Embedding Configuration - Controls vector embedding behavior
15
+ embedding:
16
+ # Recommended model: "minishlab/potion-base-8M". Only Model2Vec static models are supported.
17
+ model_name: "minishlab/potion-base-8M"
18
+ dimension: 256
19
+ # dimension_metric: "cosine" # Metric for dimension calculation (e.g., "cosine", "euclidean")
20
+ # max_retries: 3 # Maximum retries for embedding requests
21
+ # retry_delay: 5 # Delay in seconds between retries
22
+ # max_content_length: 5000 # Maximum characters per file chunk
23
+ # Qdrant (Vector DB) settings
24
+ # qdrant_batch_size: 100 # Batch size for Qdrant uploads
25
+ # url: "http://localhost:6333" # Qdrant server URL
26
+ # timeout: 30 # Qdrant client timeout in seconds
27
+ # prefer_grpc: true # Prefer gRPC for Qdrant communication
28
+
29
+ # Advanced chunking settings - controls how code is split
30
+ # chunking:
31
+ # max_hierarchy_depth: 2 # Maximum depth of code hierarchy to consider
32
+ # max_file_lines: 1000 # Maximum lines per file before splitting
33
+
34
+ # Clustering settings for embeddings
35
+ # clustering:
36
+ # method: "agglomerative" # Clustering method: "agglomerative", "dbscan"
37
+ # agglomerative: # Settings for Agglomerative Clustering
38
+ # metric: "precomputed" # Metric: "cosine", "euclidean", "manhattan", "l1", "l2", "precomputed"
39
+ # distance_threshold: 0.3 # Distance threshold for forming clusters
40
+ # linkage: "complete" # Linkage criterion: "ward", "complete", "average", "single"
41
+ # dbscan: # Settings for DBSCAN Clustering
42
+ # eps: 0.3 # The maximum distance between two samples for one to be considered as in the neighborhood of the other
43
+ # min_samples: 2 # The number of samples in a neighborhood for a point to be considered as a core point
44
+ # algorithm: "auto" # Algorithm to compute pointwise distances: "auto", "ball_tree", "kd_tree", "brute"
45
+ # metric: "precomputed" # Metric for distance computation: "cityblock", "cosine", "euclidean", "l1", "l2", "manhattan", "precomputed"
46
+
47
+ # RAG (Retrieval Augmented Generation) Configuration
48
+ rag:
49
+ max_context_length: 8000 # Maximum context length for the LLM
50
+ max_context_results: 100 # Maximum number of context results to return
51
+ similarity_threshold: 0.75 # Minimum similarity score (0-1) for relevance
52
+ # system_prompt: null # Optional system prompt to guide the RAG model (leave commented or set if needed)
53
+ include_file_content: true # Include file content in context
54
+ include_metadata: true # Include file metadata in context
55
+
56
+ # Sync Configuration - Controls which files are excluded from processing
57
+ sync:
58
+ exclude_patterns:
59
+ - "^node_modules/"
60
+ - "^\\.venv/"
61
+ - "^venv/"
62
+ - "^env/"
63
+ - "^__pycache__/"
64
+ - "^\\.mypy_cache/"
65
+ - "^\\.pytest_cache/"
66
+ - "^\\.ruff_cache/"
67
+ - "^dist/"
68
+ - "^build/"
69
+ - "^\\.git/"
70
+ - "^typings/"
71
+ - "^\\.pyc$"
72
+ - "^\\.pyo$"
73
+ - "^\\.so$"
74
+ - "^\\.dll$"
75
+ - "^\\.lib$"
76
+ - "^\\.a$"
77
+ - "^\\.o$"
78
+ - "^\\.class$"
79
+ - "^\\.jar$"
80
+
81
+ # Generation Configuration - Controls documentation generation
82
+ gen:
83
+ max_content_length: 5000 # Maximum content length per file for generation
84
+ use_gitignore: true # Use .gitignore patterns to exclude files
85
+ output_dir: "documentation" # Directory to store generated documentation
86
+ include_tree: true # Include directory tree in output
87
+ include_entity_graph: true # Include entity relationship graph
88
+ semantic_analysis: true # Enable semantic analysis
89
+ lod_level: "skeleton" # Level of detail: "signatures", "structure", "docs", "skeleton", "full"
90
+
91
+ # Mermaid diagram configuration for entity graphs
92
+ # mermaid_entities:
93
+ # - "module"
94
+ # - "class"
95
+ # - "function"
96
+ # - "method"
97
+ # - "constant"
98
+ # - "variable"
99
+ # - "import"
100
+ # mermaid_relationships:
101
+ # - "declares"
102
+ # - "imports"
103
+ # - "calls"
104
+ mermaid_show_legend: false
105
+ mermaid_remove_unconnected: true # Remove nodes with no connections from the diagram
106
+ mermaid_styled: false # Style the mermaid diagram
107
+
108
+ # Processor Configuration - Controls code processing behavior
109
+ processor:
110
+ enabled: true # Enable the processor
111
+ max_workers: 4 # Maximum number of parallel workers
112
+ ignored_patterns: # Patterns to ignore during processing
113
+ - "**/.git/**"
114
+ - "**/__pycache__/**"
115
+ - "**/.venv/**"
116
+ - "**/node_modules/**"
117
+ - "**/*.pyc"
118
+ - "**/dist/**"
119
+ - "**/build/**"
120
+ default_lod_level: "signatures" # Default level of detail: "signatures", "structure", "docs", "full"
121
+
122
+ # File watcher configuration
123
+ # watcher:
124
+ # enabled: true # Enable file watching
125
+ # debounce_delay: 1.0 # Delay in seconds before processing changes
126
+
127
+ # Commit Command Configuration
128
+ commit:
129
+ strategy: "semantic" # Strategy for splitting diffs: "file", "hunk", "semantic"
130
+ bypass_hooks: false # Whether to bypass git hooks
131
+ use_lod_context: true # Use level of detail context
132
+ is_non_interactive: false # Run in non-interactive mode
133
+
134
+ # Diff splitter configuration
135
+ # diff_splitter:
136
+ # similarity_threshold: 0.6 # Similarity threshold for grouping related changes
137
+ # directory_similarity_threshold: 0.3 # Threshold for considering directories similar (e.g., for renames)
138
+ # file_move_similarity_threshold: 0.85 # Threshold for detecting file moves/renames based on content
139
+ # min_chunks_for_consolidation: 2 # Minimum number of small chunks to consider for consolidation
140
+ # max_chunks_before_consolidation: 20 # Maximum number of chunks before forcing consolidation
141
+ # max_file_size_for_llm: 50000 # Maximum file size (bytes) for LLM processing of individual files
142
+ # max_log_diff_size: 1000 # Maximum size (lines) of diff log to pass to LLM for context
143
+ # default_code_extensions: # File extensions considered as code for semantic splitting
144
+ # - "js"
145
+ # - "jsx"
146
+ # - "ts"
147
+ # - "tsx"
148
+ # - "py"
149
+ # - "java"
150
+ # - "c"
151
+ # - "cpp"
152
+ # - "h"
153
+ # - "hpp"
154
+ # - "cc"
155
+ # - "cs"
156
+ # - "go"
157
+ # - "rb"
158
+ # - "php"
159
+ # - "rs"
160
+ # - "swift"
161
+ # - "scala"
162
+ # - "kt"
163
+ # - "sh"
164
+ # - "pl"
165
+ # - "pm"
166
+
167
+ # Commit convention configuration (Conventional Commits)
168
+ convention:
169
+ types: # Allowed commit types
170
+ - "feat"
171
+ - "fix"
172
+ - "docs"
173
+ - "style"
174
+ - "refactor"
175
+ - "perf"
176
+ - "test"
177
+ - "build"
178
+ - "ci"
179
+ - "chore"
180
+ scopes: [] # Add project-specific scopes here, e.g., ["api", "ui", "db"]
181
+ max_length: 72 # Maximum length of commit message header
182
+
183
+ # Commit linting configuration (based on conventional-changelog-lint rules)
184
+ # lint:
185
+ # # Rules are defined as: {level: "ERROR"|"WARNING"|"DISABLED", rule: "always"|"never", value: <specific_value_if_any>}
186
+ # header_max_length:
187
+ # level: "ERROR"
188
+ # rule: "always"
189
+ # value: 100
190
+ # header_case: # e.g., 'lower-case', 'upper-case', 'camel-case', etc.
191
+ # level: "DISABLED"
192
+ # rule: "always"
193
+ # value: "lower-case"
194
+ # header_full_stop:
195
+ # level: "ERROR"
196
+ # rule: "never"
197
+ # value: "."
198
+ # type_enum: # Types must be from the 'convention.types' list
199
+ # level: "ERROR"
200
+ # rule: "always"
201
+ # type_case:
202
+ # level: "ERROR"
203
+ # rule: "always"
204
+ # value: "lower-case"
205
+ # type_empty:
206
+ # level: "ERROR"
207
+ # rule: "never"
208
+ # scope_case:
209
+ # level: "ERROR"
210
+ # rule: "always"
211
+ # value: "lower-case"
212
+ # scope_empty: # Set to "ERROR" if scopes are mandatory
213
+ # level: "DISABLED"
214
+ # rule: "never"
215
+ # scope_enum: # Scopes must be from the 'convention.scopes' list if enabled
216
+ # level: "DISABLED"
217
+ # rule: "always"
218
+ # # value: [] # Add allowed scopes here if rule is "always" and level is not DISABLED
219
+ # subject_case: # Forbids specific cases in the subject
220
+ # level: "ERROR"
221
+ # rule: "never"
222
+ # value: ["sentence-case", "start-case", "pascal-case", "upper-case"]
223
+ # subject_empty:
224
+ # level: "ERROR"
225
+ # rule: "never"
226
+ # subject_full_stop:
227
+ # level: "ERROR"
228
+ # rule: "never"
229
+ # value: "."
230
+ # subject_exclamation_mark:
231
+ # level: "DISABLED"
232
+ # rule: "never"
233
+ # body_leading_blank: # Body must start with a blank line after subject
234
+ # level: "WARNING"
235
+ # rule: "always"
236
+ # body_empty:
237
+ # level: "DISABLED"
238
+ # rule: "never"
239
+ # body_max_line_length:
240
+ # level: "ERROR"
241
+ # rule: "always"
242
+ # value: 100
243
+ # footer_leading_blank: # Footer must start with a blank line after body
244
+ # level: "WARNING"
245
+ # rule: "always"
246
+ # footer_empty:
247
+ # level: "DISABLED"
248
+ # rule: "never"
249
+ # footer_max_line_length:
250
+ # level: "ERROR"
251
+ # rule: "always"
252
+ # value: 100
253
+
254
+ # Pull Request Configuration
255
+ pr:
256
+ defaults:
257
+ base_branch: null # Default base branch (null = auto-detect, e.g., main, master, develop)
258
+ feature_prefix: "feature/" # Default feature branch prefix
259
+
260
+ strategy: "github-flow" # Git workflow: "github-flow", "gitflow", "trunk-based"
261
+
262
+ # Branch mapping for different PR types (primarily used in gitflow strategy)
263
+ # branch_mapping:
264
+ # feature:
265
+ # base: "develop"
266
+ # prefix: "feature/"
267
+ # release:
268
+ # base: "main"
269
+ # prefix: "release/"
270
+ # hotfix:
271
+ # base: "main"
272
+ # prefix: "hotfix/"
273
+ # bugfix:
274
+ # base: "develop"
275
+ # prefix: "bugfix/"
276
+
277
+ # PR generation configuration
278
+ generate:
279
+ title_strategy: "llm" # Strategy for generating PR titles: "commits" (from commit messages), "llm" (AI generated)
280
+ description_strategy: "llm" # Strategy for descriptions: "commits", "llm"
281
+ # description_template: | # Template for PR description when using 'llm' strategy. Placeholders: {changes}, {testing_instructions}, {screenshots}
282
+ # ## Changes
283
+ # {changes}
284
+ #
285
+ # ## Testing
286
+ # {testing_instructions}
287
+ #
288
+ # ## Screenshots
289
+ # {screenshots}
290
+ use_workflow_templates: true # Use workflow-specific templates if available (e.g., for GitHub PR templates)
291
+
292
+ # Ask Command Configuration
293
+ ask:
294
+ interactive_chat: false # Enable interactive chat mode for the 'ask' command
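
The `embedding:` section above selects a Model2Vec static model. A minimal sketch of what that setting implies, assuming the `model2vec` package is available (the code chunk below is made up for illustration):

```python
# Minimal sketch (not part of this commit): load the static embedding model
# named in the `embedding:` section and embed one code chunk.
# Assumes the `model2vec` package is installed; the chunk text is illustrative.
from model2vec import StaticModel

model = StaticModel.from_pretrained("minishlab/potion-base-8M")

chunk = "def dfs_traversal(root):\n    ..."   # hypothetical code chunk
vectors = model.encode([chunk])               # numpy array, one row per input
print(vectors.shape)                          # e.g. (1, 256) for 256-dim embeddings
```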
MTEB_evaluate.py DELETED
@@ -1,343 +0,0 @@
1
- #!/usr/bin/env python
2
- """
3
- MTEB Evaluation Script with Subprocess Isolation (Code Information Retrieval Tasks).
4
-
5
- This script evaluates models using MTEB with subprocess isolation to prevent
6
- memory issues and process killing.
7
-
8
- Features:
9
- - Each task runs in a separate subprocess to isolate memory
10
- - 30-second timeout per task
11
- - No retries - if task fails or times out, move to next one
12
- - Memory monitoring and cleanup
13
-
14
- Note: Multi-threading is NOT used here because:
15
- 1. Memory is the main bottleneck, not CPU
16
- 2. Running multiple tasks simultaneously would increase memory pressure
17
- 3. Many tasks are being killed (return code -9) due to OOM conditions
18
- 4. Sequential processing with subprocess isolation is more stable
19
- """
20
-
21
- import contextlib
22
- import json
23
- import logging
24
- import subprocess
25
- import sys
26
- import tempfile
27
- import time
28
- from pathlib import Path
29
-
30
- import psutil
31
-
32
- # =============================================================================
33
- # CONFIGURATION
34
- # =============================================================================
35
-
36
- MODEL_PATH = "."
37
- MODEL_NAME = "gte-Qwen2-7B-instruct-M2V-Distilled"
38
- OUTPUT_DIR = "mteb_results"
39
- TASK_TIMEOUT = 30 # 30 seconds timeout per task
40
- MAX_RETRIES = 0 # No retries - move to next task if failed/timeout
41
-
42
- # Constants
43
- SIGKILL_RETURN_CODE = -9 # Process killed by SIGKILL (usually OOM)
44
-
45
- # Configure logging
46
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
47
- logger = logging.getLogger(__name__)
48
-
49
- # =============================================================================
50
- # SINGLE TASK RUNNER SCRIPT
51
- # =============================================================================
52
-
53
- TASK_RUNNER_SCRIPT = """
54
- import sys
55
- import os
56
- import json
57
- import tempfile
58
- import traceback
59
- from pathlib import Path
60
-
61
- # Add current directory to path
62
- sys.path.insert(0, ".")
63
-
64
- try:
65
- import mteb
66
- from model2vec import StaticModel
67
- from mteb import ModelMeta
68
- from evaluation import CustomMTEB
69
-
70
- def run_single_task():
71
- # Get arguments
72
- model_path = sys.argv[1]
73
- task_name = sys.argv[2]
74
- output_dir = sys.argv[3]
75
- model_name = sys.argv[4]
76
-
77
- # Load model
78
- model = StaticModel.from_pretrained(model_path)
79
- model.mteb_model_meta = ModelMeta(
80
- name=model_name, revision="distilled", release_date=None, languages=["eng"]
81
- )
82
-
83
- # Get and run task
84
- task = mteb.get_task(task_name, languages=["eng"])
85
- evaluation = CustomMTEB(tasks=[task])
86
-
87
- results = evaluation.run(
88
- model,
89
- eval_splits=["test"],
90
- output_folder=output_dir,
91
- verbosity=0
92
- )
93
-
94
- # Save results to temp file for parent process
95
- with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f:
96
- json.dump({
97
- "success": True,
98
- "task_name": task_name,
99
- "results": results
100
- }, f)
101
- temp_file = f.name
102
-
103
- print(f"RESULT_FILE:{temp_file}")
104
- return 0
105
-
106
- if __name__ == "__main__":
107
- exit(run_single_task())
108
-
109
- except Exception as e:
110
- print(f"ERROR: {str(e)}")
111
- print(f"TRACEBACK: {traceback.format_exc()}")
112
- exit(1)
113
- """
114
-
115
-
116
- def get_available_tasks() -> list[str]:
117
- """Get list of available tasks."""
118
- try:
119
- import mteb
120
- import mteb.benchmarks
121
-
122
- # Use main MTEB benchmark for comprehensive evaluation
123
- benchmark = mteb.benchmarks.CoIR
124
- return [str(task) for task in benchmark.tasks] # All tasks
125
- except Exception:
126
- logger.exception("Failed to get tasks")
127
- return []
128
-
129
-
130
- def check_existing_results(output_path: Path, task_names: list[str]) -> list[str]:
131
- """Check for existing results and return remaining tasks."""
132
- remaining_tasks = []
133
-
134
- for task_name in task_names:
135
- result_file = output_path / MODEL_NAME / "distilled" / f"{task_name}.json"
136
- if result_file.exists():
137
- logger.info(f"Skipping {task_name} - results already exist")
138
- else:
139
- remaining_tasks.append(task_name)
140
-
141
- return remaining_tasks
142
-
143
-
144
- def run_task_subprocess(task_name: str, output_dir: str) -> tuple[bool, str, float]:
145
- """Run a single task in a subprocess with memory and time limits."""
146
- # Create temporary script file
147
- with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
148
- f.write(TASK_RUNNER_SCRIPT)
149
- script_path = f.name
150
-
151
- try:
152
- logger.info(f"Running task: {task_name}")
153
- start_time = time.time()
154
-
155
- # Run subprocess with timeout
156
- # subprocess security: We control all inputs (script path and known arguments)
157
- cmd = [sys.executable, script_path, MODEL_PATH, task_name, output_dir, MODEL_NAME]
158
-
159
- process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) # noqa: S603
160
-
161
- try:
162
- stdout, stderr = process.communicate(timeout=TASK_TIMEOUT)
163
- duration = time.time() - start_time
164
-
165
- if process.returncode == 0:
166
- # Check for result file
167
- for line in stdout.split("\n"):
168
- if line.startswith("RESULT_FILE:"):
169
- result_file = line.split(":", 1)[1]
170
- try:
171
- with Path(result_file).open() as f:
172
- json.load(f)
173
- Path(result_file).unlink() # Clean up temp file
174
- logger.info(f"✓ Completed {task_name} in {duration:.2f}s")
175
- return True, task_name, duration
176
- except (json.JSONDecodeError, OSError):
177
- logger.exception("Failed to read result file")
178
-
179
- logger.info(f"✓ Completed {task_name} in {duration:.2f}s")
180
- return True, task_name, duration
181
- if process.returncode == SIGKILL_RETURN_CODE:
182
- logger.error(f"✗ Task {task_name} killed (OOM) - return code {process.returncode}")
183
- else:
184
- logger.error(f"✗ Task {task_name} failed with return code {process.returncode}")
185
- if stderr:
186
- logger.error(f"Error output: {stderr}")
187
- return False, task_name, duration
188
-
189
- except subprocess.TimeoutExpired:
190
- logger.warning(f"⏱ Task {task_name} timed out after {TASK_TIMEOUT}s")
191
- process.kill()
192
- process.wait()
193
- return False, task_name, TASK_TIMEOUT
194
-
195
- except Exception:
196
- logger.exception(f"✗ Failed to run task {task_name}")
197
- return False, task_name, 0.0
198
-
199
- finally:
200
- # Clean up script file
201
- with contextlib.suppress(Exception):
202
- Path(script_path).unlink()
203
-
204
-
205
- def collect_results(output_path: Path) -> dict:
206
- """Collect all results from completed tasks."""
207
- results_dir = output_path / MODEL_NAME / "distilled"
208
- if not results_dir.exists():
209
- return {}
210
-
211
- task_results = {}
212
- for result_file in results_dir.glob("*.json"):
213
- if result_file.name == "model_meta.json":
214
- continue
215
-
216
- try:
217
- with result_file.open() as f:
218
- data = json.load(f)
219
- task_name = result_file.stem
220
- task_results[task_name] = data
221
- except (json.JSONDecodeError, OSError) as e:
222
- logger.warning(f"Could not load {result_file}: {e}")
223
-
224
- return task_results
225
-
226
-
227
- def save_summary(output_path: Path, results: dict, stats: dict) -> None:
228
- """Save evaluation summary."""
229
- summary = {
230
- "model_name": MODEL_NAME,
231
- "timestamp": time.time(),
232
- "task_timeout": TASK_TIMEOUT,
233
- "stats": stats,
234
- "task_results": results,
235
- }
236
-
237
- summary_file = output_path / "mteb_summary.json"
238
- with summary_file.open("w") as f:
239
- json.dump(summary, f, indent=2, default=str)
240
-
241
- logger.info(f"Summary saved to {summary_file}")
242
-
243
-
244
- def main() -> None:
245
- """Main evaluation function."""
246
- logger.info(f"Starting MTEB evaluation for {MODEL_NAME}")
247
- logger.info(f"Task timeout: {TASK_TIMEOUT}s (no retries)")
248
- logger.info("Memory isolation: Each task runs in separate subprocess")
249
-
250
- # Log system info
251
- memory_info = psutil.virtual_memory()
252
- logger.info(f"System memory: {memory_info.total / (1024**3):.1f} GB total")
253
-
254
- output_path = Path(OUTPUT_DIR)
255
- output_path.mkdir(parents=True, exist_ok=True)
256
-
257
- # Get tasks
258
- all_tasks = get_available_tasks()
259
- if not all_tasks:
260
- logger.error("No tasks found!")
261
- return
262
-
263
- logger.info(f"Found {len(all_tasks)} tasks")
264
-
265
- # Check existing results
266
- remaining_tasks = check_existing_results(output_path, all_tasks)
267
- logger.info(f"Will evaluate {len(remaining_tasks)} remaining tasks")
268
-
269
- if not remaining_tasks:
270
- logger.info("All tasks already completed!")
271
- return
272
-
273
- # Process tasks sequentially (no retries)
274
- start_time = time.time()
275
- successful_tasks = []
276
- failed_tasks = []
277
- timed_out_tasks = []
278
-
279
- for i, task_name in enumerate(remaining_tasks):
280
- logger.info(f"[{i + 1}/{len(remaining_tasks)}] Processing: {task_name}")
281
-
282
- # Run task once (no retries)
283
- success, name, duration = run_task_subprocess(task_name, str(output_path))
284
-
285
- if success:
286
- successful_tasks.append((name, duration))
287
- elif duration == TASK_TIMEOUT:
288
- timed_out_tasks.append(name)
289
- else:
290
- failed_tasks.append(name)
291
- # Check if it was OOM killed (this is logged in run_task_subprocess)
292
-
293
- # Progress update
294
- progress = ((i + 1) / len(remaining_tasks)) * 100
295
- logger.info(f"Progress: {i + 1}/{len(remaining_tasks)} ({progress:.1f}%)")
296
-
297
- # Brief pause between tasks
298
- time.sleep(1)
299
-
300
- total_time = time.time() - start_time
301
-
302
- # Log final summary
303
- logger.info("=" * 80)
304
- logger.info("EVALUATION SUMMARY")
305
- logger.info("=" * 80)
306
- logger.info(f"Total tasks: {len(remaining_tasks)}")
307
- logger.info(f"Successful: {len(successful_tasks)}")
308
- logger.info(f"Failed: {len(failed_tasks)}")
309
- logger.info(f"Timed out: {len(timed_out_tasks)}")
310
- logger.info(f"Total time: {total_time:.2f}s")
311
-
312
- if successful_tasks:
313
- avg_time = sum(duration for _, duration in successful_tasks) / len(successful_tasks)
314
- logger.info(f"Average successful task time: {avg_time:.2f}s")
315
-
316
- if failed_tasks:
317
- logger.warning(f"Failed tasks: {failed_tasks}")
318
-
319
- if timed_out_tasks:
320
- logger.warning(f"Timed out tasks: {timed_out_tasks}")
321
-
322
- logger.info("=" * 80)
323
-
324
- # Collect and save results
325
- all_results = collect_results(output_path)
326
- stats = {
327
- "total_tasks": len(remaining_tasks),
328
- "successful": len(successful_tasks),
329
- "failed": len(failed_tasks),
330
- "timed_out": len(timed_out_tasks),
331
- "total_time": total_time,
332
- "avg_time": avg_time if successful_tasks else 0,
333
- "successful_task_details": successful_tasks,
334
- "failed_tasks": failed_tasks,
335
- "timed_out_tasks": timed_out_tasks,
336
- }
337
-
338
- save_summary(output_path, all_results, stats)
339
- logger.info("Evaluation completed!")
340
-
341
-
342
- if __name__ == "__main__":
343
- main()
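
The deleted script's core pattern is to run each MTEB task in its own subprocess with a hard timeout, so an out-of-memory kill (return code -9) only costs that one task. A condensed sketch of that pattern, with a placeholder worker script name and no MTEB dependency:

```python
# Condensed sketch of the subprocess-isolation pattern used by the deleted
# script; "worker.py" and its argument are placeholders, not real files.
import subprocess
import sys

TASK_TIMEOUT = 30      # seconds per task, mirroring the deleted script
SIGKILL = -9           # return code when the OS kills the child (usually OOM)

def run_isolated(task_name: str) -> bool:
    cmd = [sys.executable, "worker.py", task_name]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=TASK_TIMEOUT)
    except subprocess.TimeoutExpired:
        return False                      # timed out: skip, no retry
    if proc.returncode == SIGKILL:
        return False                      # killed by the kernel, likely OOM
    return proc.returncode == 0
```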
REPORT.md ADDED
@@ -0,0 +1,299 @@
1
+ # Code-Specialized Model2Vec Distillation Analysis
2
+
3
+ ## 🎯 Executive Summary
4
+
5
+ This report presents a comprehensive analysis of Model2Vec distillation experiments using different teacher models for code-specialized embedding generation.
6
+
7
+ ### Evaluated Models Overview
8
+
9
+ **Simplified Distillation Models:** 13
10
+ **Peer Comparison Models:** 19
11
+ **Total Models Analyzed:** 32
12
+
13
+ ### Best Performing Simplified Model: code_model2vec_all_mpnet_base_v2
14
+
15
+ **Overall CodeSearchNet Performance:**
16
+ - **NDCG@10**: 0.7387
17
+ - **Mean Reciprocal Rank (MRR)**: 0.7010
18
+ - **Recall@5**: 0.8017
19
+ - **Mean Rank**: 6.4
20
+
21
+ ## 📊 Comprehensive Model Comparison
22
+
23
+ ### All Simplified Distillation Models Performance
24
+
25
+ | Model | Teacher | NDCG@10 | MRR | Recall@5 | Status |
26
+ |-------|---------|---------|-----|----------|--------|
27
+ | code_model2vec_all_mpnet_base_v2 | [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) | 0.7387 | 0.7010 | 0.8017 | 🥇 Best |
28
+ | code_model2vec_all_MiniLM_L6_v2 | [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 0.7385 | 0.7049 | 0.7910 | 🥈 2nd |
29
+ | code_model2vec_jina_embeddings_v2_base_code | [jina-embeddings-v2-base-code](https://huggingface.co/jina-embeddings-v2-base-code) | 0.7381 | 0.6996 | 0.8130 | 🥉 3rd |
30
+ | code_model2vec_paraphrase_MiniLM_L6_v2 | [sentence-transformers/paraphrase-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2) | 0.7013 | 0.6638 | 0.7665 | #4 |
31
+ | code_model2vec_Reason_ModernColBERT | [lightonai/Reason-ModernColBERT](https://huggingface.co/lightonai/Reason-ModernColBERT) | 0.6598 | 0.6228 | 0.7260 | #5 |
32
+ | code_model2vec_bge_m3 | [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) | 0.4863 | 0.4439 | 0.5514 | #6 |
33
+ | code_model2vec_jina_embeddings_v3 | [jinaai/jina-embeddings-v3](https://huggingface.co/jinaai/jina-embeddings-v3) | 0.4755 | 0.4416 | 0.5456 | #7 |
34
+ | code_model2vec_nomic_embed_text_v2_moe | [nomic-ai/nomic-embed-text-v2-moe](https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe) | 0.4532 | 0.4275 | 0.5094 | #8 |
35
+ | code_model2vec_gte_Qwen2_1.5B_instruct | [Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct) | 0.4238 | 0.3879 | 0.4719 | #9 |
36
+ | code_model2vec_Qodo_Embed_1_1.5B | [Qodo/Qodo-Embed-1-1.5B](https://huggingface.co/Qodo/Qodo-Embed-1-1.5B) | 0.4101 | 0.3810 | 0.4532 | #10 |
37
+ | code_model2vec_graphcodebert_base | [microsoft/graphcodebert-base](https://huggingface.co/microsoft/graphcodebert-base) | 0.3420 | 0.3140 | 0.3704 | #11 |
38
+ | code_model2vec_Linq_Embed_Mistral | [Linq-AI-Research/Linq-Embed-Mistral](https://huggingface.co/Linq-AI-Research/Linq-Embed-Mistral) | 0.2868 | 0.2581 | 0.3412 | #12 |
39
+ | code_model2vec_codebert_base | [microsoft/codebert-base](https://huggingface.co/microsoft/codebert-base) | 0.2779 | 0.2534 | 0.3136 | #13 |
40
+
41
+
42
+ ### 📊 Model Specifications Analysis
43
+
44
+ The distilled models share a common 256-dimensional embedding space, while vocabulary size, parameter count, and disk footprint vary with the teacher model's tokenizer:
45
+
46
+ | Model | Vocabulary Size | Parameters | Embedding Dim | Disk Size |
47
+ |-------|----------------|------------|---------------|-----------|
48
+ | all_mpnet_base_v2 | 29,528 | 7.6M | 256 | 14.4MB |
49
+ | all_MiniLM_L6_v2 | 29,525 | 7.6M | 256 | 14.4MB |
50
+ | jina_embeddings_v2_base_code | 61,053 | 15.6M | 256 | 29.8MB |
51
+ | paraphrase_MiniLM_L6_v2 | 29,525 | 7.6M | 256 | 14.4MB |
52
+ | Reason_ModernColBERT | 50,254 | 12.9M | 256 | 24.5MB |
53
+ | bge_m3 | 249,999 | 64.0M | 256 | 122.1MB |
54
+ | jina_embeddings_v3 | 249,999 | 64.0M | 256 | 122.1MB |
55
+ | nomic_embed_text_v2_moe | 249,999 | 64.0M | 256 | 122.1MB |
56
+ | gte_Qwen2_1.5B_instruct | 151,644 | 38.8M | 256 | 74.0MB |
57
+ | Qodo_Embed_1_1.5B | 151,644 | 38.8M | 256 | 74.0MB |
58
+ | graphcodebert_base | 50,262 | 12.9M | 256 | 24.5MB |
59
+ | Linq_Embed_Mistral | 31,999 | 8.2M | 256 | 15.6MB |
60
+ | codebert_base | 50,262 | 12.9M | 256 | 24.5MB |
61
+
62
+
63
+ ![Model Specifications](analysis_charts/model_specifications.png)
64
+
65
+ *Comprehensive analysis of our distilled models showing vocabulary size, parameter count, embedding dimensions, and storage requirements.*
66
+
67
+ #### Key Insights from Model Specifications:
68
+
69
+
70
+ - **Vocabulary Size**: Vocabulary sizes range from 29,525 to 249,999 tokens (avg: 106,592), following each teacher's tokenizer
71
+ - **Parameter Efficiency**: Models range from 7.6M to 64.0M parameters (avg: 27.3M)
72
+ - **Storage Efficiency**: Disk usage ranges from 14.4MB to 122.1MB (avg: 52.0MB)
73
+ - **Embedding Dimensions**: Consistent 256 dimensions across all models (optimized for efficiency)
74
+
75
+
76
+ ### Key Findings
77
+
78
+
79
+ - **Best Teacher Model**: code_model2vec_all_mpnet_base_v2 (NDCG@10: 0.7387)
80
+ - **Least Effective Teacher**: code_model2vec_codebert_base (NDCG@10: 0.2779)
81
+ - **Performance Range**: The weakest model scores 62.4% below the best (0.2779 vs. 0.7387 NDCG@10)
82
+ - **Average Performance**: 0.5178 NDCG@10
83
+
84
+
85
+ ## 🎯 Language Performance Radar Charts
86
+
87
+ ### Best Model vs Peer Models Comparison
88
+
89
+ ![Comparative Radar Chart](analysis_charts/comparative_radar.png)
90
+
91
+ *Comparative view showing how the best simplified distillation model performs against top peer models across programming languages.*
92
+
93
+ ### Individual Model Performance by Language
94
+
95
+ #### code_model2vec_all_mpnet_base_v2 (Teacher: [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)) - NDCG@10: 0.7387
96
+
97
+ ![code_model2vec_all_mpnet_base_v2 Radar Chart](analysis_charts/radar_code_model2vec_all_mpnet_base_v2.png)
98
+
99
+ #### code_model2vec_all_MiniLM_L6_v2 (Teacher: [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)) - NDCG@10: 0.7385
100
+
101
+ ![code_model2vec_all_MiniLM_L6_v2 Radar Chart](analysis_charts/radar_code_model2vec_all_MiniLM_L6_v2.png)
102
+
103
+ #### code_model2vec_jina_embeddings_v2_base_code (Teacher: [jina-embeddings-v2-base-code](https://huggingface.co/jina-embeddings-v2-base-code)) - NDCG@10: 0.7381
104
+
105
+ ![code_model2vec_jina_embeddings_v2_base_code Radar Chart](analysis_charts/radar_code_model2vec_jina_embeddings_v2_base_code.png)
106
+
107
+ #### code_model2vec_paraphrase_MiniLM_L6_v2 (Teacher: [sentence-transformers/paraphrase-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2)) - NDCG@10: 0.7013
108
+
109
+ ![code_model2vec_paraphrase_MiniLM_L6_v2 Radar Chart](analysis_charts/radar_code_model2vec_paraphrase_MiniLM_L6_v2.png)
110
+
111
+ #### code_model2vec_Reason_ModernColBERT (Teacher: [lightonai/Reason-ModernColBERT](https://huggingface.co/lightonai/Reason-ModernColBERT)) - NDCG@10: 0.6598
112
+
113
+ ![code_model2vec_Reason_ModernColBERT Radar Chart](analysis_charts/radar_code_model2vec_Reason_ModernColBERT.png)
114
+
115
+ #### code_model2vec_bge_m3 (Teacher: [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3)) - NDCG@10: 0.4863
116
+
117
+ ![code_model2vec_bge_m3 Radar Chart](analysis_charts/radar_code_model2vec_bge_m3.png)
118
+
119
+ #### code_model2vec_jina_embeddings_v3 (Teacher: [jinaai/jina-embeddings-v3](https://huggingface.co/jinaai/jina-embeddings-v3)) - NDCG@10: 0.4755
120
+
121
+ ![code_model2vec_jina_embeddings_v3 Radar Chart](analysis_charts/radar_code_model2vec_jina_embeddings_v3.png)
122
+
123
+ #### code_model2vec_nomic_embed_text_v2_moe (Teacher: [nomic-ai/nomic-embed-text-v2-moe](https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe)) - NDCG@10: 0.4532
124
+
125
+ ![code_model2vec_nomic_embed_text_v2_moe Radar Chart](analysis_charts/radar_code_model2vec_nomic_embed_text_v2_moe.png)
126
+
127
+ #### code_model2vec_gte_Qwen2_1.5B_instruct (Teacher: [Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct)) - NDCG@10: 0.4238
128
+
129
+ ![code_model2vec_gte_Qwen2_1.5B_instruct Radar Chart](analysis_charts/radar_code_model2vec_gte_Qwen2_15B_instruct.png)
130
+
131
+ #### code_model2vec_Qodo_Embed_1_1.5B (Teacher: [Qodo/Qodo-Embed-1-1.5B](https://huggingface.co/Qodo/Qodo-Embed-1-1.5B)) - NDCG@10: 0.4101
132
+
133
+ ![code_model2vec_Qodo_Embed_1_1.5B Radar Chart](analysis_charts/radar_code_model2vec_Qodo_Embed_1_15B.png)
134
+
135
+ #### code_model2vec_graphcodebert_base (Teacher: [microsoft/graphcodebert-base](https://huggingface.co/microsoft/graphcodebert-base)) - NDCG@10: 0.3420
136
+
137
+ ![code_model2vec_graphcodebert_base Radar Chart](analysis_charts/radar_code_model2vec_graphcodebert_base.png)
138
+
139
+ #### code_model2vec_Linq_Embed_Mistral (Teacher: [Linq-AI-Research/Linq-Embed-Mistral](https://huggingface.co/Linq-AI-Research/Linq-Embed-Mistral)) - NDCG@10: 0.2868
140
+
141
+ ![code_model2vec_Linq_Embed_Mistral Radar Chart](analysis_charts/radar_code_model2vec_Linq_Embed_Mistral.png)
142
+
143
+ #### code_model2vec_codebert_base (Teacher: [microsoft/codebert-base](https://huggingface.co/microsoft/codebert-base)) - NDCG@10: 0.2779
144
+
145
+ ![code_model2vec_codebert_base Radar Chart](analysis_charts/radar_code_model2vec_codebert_base.png)
146
+
147
+
148
+
149
+ ## 🏆 Peer Model Comparison
150
+
151
+ ![Peer Comparison](analysis_charts/peer_comparison.png)
152
+
153
+ *Comparison with established code-specialized embedding models using actual evaluation results.*
154
+
155
+ ### Complete Model Ranking
156
+
157
+ | Rank | Model | Type | NDCG@10 | MRR | Recall@5 |
158
+ |------|-------|------|---------|-----|----------|
159
+ | 1 | Alibaba-NLP/gte-Qwen2-1.5B-instruct | General | 0.9729 | 0.9676 | 0.9825 |
160
+ | 2 | Qodo/Qodo-Embed-1-1.5B | General | 0.9715 | 0.9659 | 0.9875 |
161
+ | 3 | jina-embeddings-v2-base-code | General | 0.9677 | 0.9618 | 0.9849 |
162
+ | 4 | jinaai/jina-embeddings-v3 | General | 0.9640 | 0.9573 | 0.9839 |
163
+ | 5 | sentence-transformers/all-mpnet-base-v2 | General | 0.9477 | 0.9358 | 0.9732 |
164
+ | 6 | nomic-ai/nomic-embed-text-v2-moe | General | 0.9448 | 0.9357 | 0.9659 |
165
+ | 7 | sentence-transformers/all-MiniLM-L12-v2 | General | 0.9398 | 0.9265 | 0.9732 |
166
+ | 8 | BAAI/bge-m3 | General | 0.9383 | 0.9295 | 0.9643 |
167
+ | 9 | sentence-transformers/all-MiniLM-L6-v2 | General | 0.9255 | 0.9099 | 0.9642 |
168
+ | 10 | lightonai/Reason-ModernColBERT | General | 0.9188 | 0.9036 | 0.9486 |
169
+ | 11 | Linq-AI-Research/Linq-Embed-Mistral | General | 0.9080 | 0.8845 | 0.9650 |
170
+ | 12 | sentence-transformers/paraphrase-MiniLM-L6-v2 | General | 0.8297 | 0.8016 | 0.8828 |
171
+ | 13 | minishlab/potion-base-8M | Model2Vec | 0.8162 | 0.7817 | 0.8931 |
172
+ | 14 | minishlab/potion-retrieval-32M | Model2Vec | 0.8137 | 0.7810 | 0.8792 |
173
+ | 15 | code_model2vec_all_mpnet_base_v2 | **🔥 Simplified Distillation** | 0.7387 | 0.7010 | 0.8017 |
174
+ | 16 | code_model2vec_all_MiniLM_L6_v2 | **🔥 Simplified Distillation** | 0.7385 | 0.7049 | 0.7910 |
175
+ | 17 | code_model2vec_jina_embeddings_v2_base_code | **🔥 Simplified Distillation** | 0.7381 | 0.6996 | 0.8130 |
176
+ | 18 | code_model2vec_paraphrase_MiniLM_L6_v2 | **🔥 Simplified Distillation** | 0.7013 | 0.6638 | 0.7665 |
177
+ | 19 | code_model2vec_Reason_ModernColBERT | **🔥 Simplified Distillation** | 0.6598 | 0.6228 | 0.7260 |
178
+ | 20 | potion-multilingual-128M | Model2Vec | 0.6124 | 0.5683 | 0.7017 |
179
+ | 21 | huggingface/CodeBERTa-small-v1 | Code-Specific | 0.5903 | 0.5350 | 0.6779 |
180
+ | 22 | Salesforce/codet5-base | Code-Specific | 0.4872 | 0.4500 | 0.5742 |
181
+ | 23 | code_model2vec_bge_m3 | **🔥 Simplified Distillation** | 0.4863 | 0.4439 | 0.5514 |
182
+ | 24 | code_model2vec_jina_embeddings_v3 | **🔥 Simplified Distillation** | 0.4755 | 0.4416 | 0.5456 |
183
+ | 25 | code_model2vec_nomic_embed_text_v2_moe | **🔥 Simplified Distillation** | 0.4532 | 0.4275 | 0.5094 |
184
+ | 26 | code_model2vec_gte_Qwen2_1.5B_instruct | **🔥 Simplified Distillation** | 0.4238 | 0.3879 | 0.4719 |
185
+ | 27 | code_model2vec_Qodo_Embed_1_1.5B | **🔥 Simplified Distillation** | 0.4101 | 0.3810 | 0.4532 |
186
+ | 28 | microsoft/graphcodebert-base | Code-Specific | 0.4039 | 0.3677 | 0.4650 |
187
+ | 29 | code_model2vec_graphcodebert_base | **🔥 Simplified Distillation** | 0.3420 | 0.3140 | 0.3704 |
188
+ | 30 | code_model2vec_Linq_Embed_Mistral | **🔥 Simplified Distillation** | 0.2868 | 0.2581 | 0.3412 |
189
+ | 31 | code_model2vec_codebert_base | **🔥 Simplified Distillation** | 0.2779 | 0.2534 | 0.3136 |
190
+ | 32 | microsoft/codebert-base | Code-Specific | 0.1051 | 0.1058 | 0.1105 |
191
+
192
+
193
+ ## 📈 Performance Analysis
194
+
195
+ ### Multi-Model Comparison Charts
196
+
197
+ ![Model Comparison](analysis_charts/model_comparison.png)
198
+
199
+ *Comprehensive comparison across all evaluation metrics.*
200
+
201
+ ### Language Performance Analysis
202
+
203
+ ![Language Heatmap](analysis_charts/language_heatmap.png)
204
+
205
+ *Performance heatmap showing how different models perform across programming languages.*
206
+
207
+ ### Efficiency Analysis
208
+
209
+ ![Efficiency Analysis](analysis_charts/efficiency_analysis.png)
210
+
211
+ *Performance vs model size analysis showing the efficiency benefits of distillation.*
212
+
213
+
214
+
215
+ ## ⚡ Operational Performance Analysis
216
+
217
+ ![Benchmark Performance](analysis_charts/benchmark_performance.png)
218
+
219
+ *Comprehensive performance benchmarking across multiple operational metrics.*
220
+
221
+ ### Performance Scaling Analysis
222
+
223
+ ![Batch Size Scaling](analysis_charts/batch_size_scaling.png)
224
+
225
+ *How performance scales with different batch sizes for optimal throughput.*
226
+
227
+ ![Memory Scaling](analysis_charts/memory_scaling.png)
228
+
229
+ *Memory usage patterns across different batch sizes.*
230
+
231
+
232
+
233
+ ## 🔍 Language-Specific Analysis
234
+
235
+ ### Performance by Programming Language
236
+
237
+ | Language | Best Model Performance | Average Performance | Language Difficulty |
238
+ |----------|------------------------|--------------------|--------------------|
239
+ | Go | 0.9780 | 0.6950 | Easy |
240
+ | Java | 0.9921 | 0.6670 | Easy |
241
+ | JavaScript | 0.9550 | 0.5847 | Easy |
242
+ | PHP | 1.0000 | 0.6379 | Easy |
243
+ | Python | 1.0000 | 0.8604 | Easy |
244
+ | Ruby | 0.9493 | 0.6372 | Easy |
245
+
246
+
247
+ ## 🎯 Conclusions and Recommendations
248
+
249
+ ### Teacher Model Analysis
250
+
251
+ Based on the evaluation results across all simplified distillation models:
252
+
253
+
254
+ 1. **Best Teacher Model**: sentence-transformers/all-mpnet-base-v2 (NDCG@10: 0.7387)
255
+ 2. **Least Effective Teacher**: microsoft/codebert-base (NDCG@10: 0.2779)
256
+ 3. **Teacher Model Impact**: The choice of teacher changes NDCG@10 by up to 62.4% relative to the best result
257
+
258
+ ### Recommendations
259
+
260
+ - **For Production**: Use sentence-transformers/all-mpnet-base-v2 as teacher model for best performance
261
+ - **For Efficiency**: Model2Vec distillation provides significant size reduction with competitive performance
262
+ - **For Code Tasks**: In these results, strong general-purpose teachers outperform the older code-specific encoders (CodeBERT, GraphCodeBERT) on CodeSearchNet retrieval
263
+
264
+
265
+ ## 📄 Methodology
266
+
267
+ ### Evaluation Protocol
268
+ - **Dataset**: CodeSearchNet test sets for 6 programming languages
269
+ - **Metrics**: NDCG@k, MRR, Recall@k following CodeSearchNet methodology
270
+ - **Query Format**: Natural language documentation strings
271
+ - **Corpus Format**: Function code strings
272
+ - **Evaluation**: Retrieval of correct code for each documentation query
273
+
274
+ ### Teacher Models Tested
275
+ - [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) (proven baseline)
276
+ - [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) (general purpose)
277
+ - [sentence-transformers/paraphrase-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2) (paraphrase model)
278
+ - [microsoft/codebert-base](https://huggingface.co/microsoft/codebert-base) (code-specialized)
279
+ - [microsoft/graphcodebert-base](https://huggingface.co/microsoft/graphcodebert-base) (graph-aware code model)
280
+ - [Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct) (instruction model)
281
+ - [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) (multilingual model)
282
+ - [jinaai/jina-embeddings-v3](https://huggingface.co/jinaai/jina-embeddings-v3) (modern embedding model)
283
+ - [nomic-ai/nomic-embed-text-v2-moe](https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe) (mixture of experts)
284
+ - [Qodo/Qodo-Embed-1-1.5B](https://huggingface.co/Qodo/Qodo-Embed-1-1.5B) (code-specialized)
285
+ - [lightonai/Reason-ModernColBERT](https://huggingface.co/lightonai/Reason-ModernColBERT) (ColBERT architecture)
286
+ - [Linq-AI-Research/Linq-Embed-Mistral](https://huggingface.co/Linq-AI-Research/Linq-Embed-Mistral) (Mistral-based)
287
+ - [BAAI/bge-code-v1](https://huggingface.co/BAAI/bge-code-v1) (code-specialized BGE)
288
+ - [Salesforce/SFR-Embedding-Code-2B_R](https://huggingface.co/Salesforce/SFR-Embedding-Code-2B_R) (large code model)
289
+
290
+ ### Distillation Method
291
+ - **Technique**: Model2Vec static embedding generation
292
+ - **Parameters**: PCA dims=256, SIF coefficient=1e-3, Zipf weighting=True
293
+ - **Training Data**: CodeSearchNet comment-code pairs
294
+ - **Languages**: Python, JavaScript, Java, PHP, Ruby, Go
295
+
296
+ ---
297
+
298
+ *Report generated on 2025-05-31 11:39:39 using automated analysis pipeline.*
299
+ *For questions about methodology or results, please refer to the CodeSearchNet documentation.*
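
The NDCG@10, MRR, and Recall@5 figures above are standard retrieval metrics computed from the rank of the correct code snippet for each documentation query. A small self-contained sketch of those formulas, with made-up example ranks:

```python
# Sketch of the retrieval metrics reported in REPORT.md, computed from the
# 1-based rank of the correct code snippet for each query. Example ranks
# below are illustrative only.
import math

def mrr(ranks):                 # Mean Reciprocal Rank
    return sum(1.0 / r for r in ranks) / len(ranks)

def recall_at_k(ranks, k=5):    # fraction of queries whose answer is in the top k
    return sum(r <= k for r in ranks) / len(ranks)

def ndcg_at_k(ranks, k=10):     # binary-relevance NDCG; ideal DCG is 1.0
    return sum((1.0 / math.log2(r + 1)) if r <= k else 0.0 for r in ranks) / len(ranks)

ranks = [1, 3, 2, 12, 1]        # hypothetical ranks of the correct snippet
print(mrr(ranks), recall_at_k(ranks), ndcg_at_k(ranks))
```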
Taskfile.yml ADDED
@@ -0,0 +1,23 @@
1
+ version: "3"
2
+
3
+ tasks:
4
+ default:
5
+ desc: List all available tasks
6
+ cmds:
7
+ - task -l
8
+
9
+ lint:
10
+ desc: Run all linting checks
11
+ cmds:
12
+ - uv run ruff check src --fix --unsafe-fixes
13
+
14
+ type:
15
+ desc: Run type checker
16
+ cmds:
17
+ - find src/distiller -name "*.py" | xargs uv run mypy
18
+
19
+ format:
20
+ desc: Run all formatters
21
+ cmds:
22
+ - uv run ruff format src
23
+
analysis_charts/batch_size_scaling.png ADDED

Git LFS Details

  • SHA256: ef5210a43a6c09db766650ba45a577916557f05384d127541eceebe1a8f42615
  • Pointer size: 132 Bytes
  • Size of remote file: 1.02 MB
analysis_charts/benchmark_performance.png ADDED

Git LFS Details

  • SHA256: 04f5572ddb072b8085e3d22d621c0497de35623703eb5e3698a0bd0b32e2ed9a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.99 MB
analysis_charts/code_performance_radar.png ADDED

Git LFS Details

  • SHA256: 3acee93605e989a6cd8fa31cf3f4e3708df977fcf56ce08da40dd687096d9101
  • Pointer size: 131 Bytes
  • Size of remote file: 190 kB
analysis_charts/comparative_radar.png ADDED

Git LFS Details

  • SHA256: 902f53602bc6780da9a136b38fc8ea6b90d13c0fd2005e02cbd6f58e6f8f9b05
  • Pointer size: 131 Bytes
  • Size of remote file: 285 kB
analysis_charts/efficiency_analysis.png ADDED

Git LFS Details

  • SHA256: a8c5628d39cdd19ee9e77b38545647b80e1092d8cad9a0c7da1cfdb506651e35
  • Pointer size: 131 Bytes
  • Size of remote file: 221 kB
analysis_charts/language_heatmap.png ADDED

Git LFS Details

  • SHA256: a408507a764878d0959da3ce4a972380cdcdba952515aef46292addb4052e182
  • Pointer size: 132 Bytes
  • Size of remote file: 1.17 MB
analysis_charts/memory_scaling.png ADDED

Git LFS Details

  • SHA256: d4506ecf38a454ffd5cac28b96a1fb1b287370bb576a74cd34aea8e8eb0b36e5
  • Pointer size: 131 Bytes
  • Size of remote file: 620 kB
analysis_charts/model_comparison.png ADDED

Git LFS Details

  • SHA256: 2e5d458ff85a706db56ff7774adbc26fa6645e94e9154352e3b45e0db890051c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.18 MB
analysis_charts/model_specifications.png ADDED

Git LFS Details

  • SHA256: 685fe86b51d7175d783948ae1928a03959e9ed67073da720cd4cc881869140d7
  • Pointer size: 131 Bytes
  • Size of remote file: 617 kB
analysis_charts/peer_comparison.png ADDED

Git LFS Details

  • SHA256: a95bc3dc495b1e6ef637b5719611581ca9890b689d72bdf3399a29fca23619ce
  • Pointer size: 131 Bytes
  • Size of remote file: 679 kB
analysis_charts/radar_code_model2vec_Linq_Embed_Mistral.png ADDED

Git LFS Details

  • SHA256: 2b229a777e69b37402b2a0ac77b878436d3a6321912767c20da84bfb6ab893f1
  • Pointer size: 131 Bytes
  • Size of remote file: 190 kB
analysis_charts/radar_code_model2vec_Qodo_Embed_1_15B.png ADDED

Git LFS Details

  • SHA256: 5d124f47dc851222959b8160313eeec01d0fb9071cb5c8b987536e4478d190e7
  • Pointer size: 131 Bytes
  • Size of remote file: 194 kB
analysis_charts/radar_code_model2vec_Reason_ModernColBERT.png ADDED

Git LFS Details

  • SHA256: ae23be629335dd36c443cd60ef96ced638c411b6c348433a605d16e3a55212ec
  • Pointer size: 131 Bytes
  • Size of remote file: 213 kB
analysis_charts/radar_code_model2vec_all_MiniLM_L6_v2.png ADDED

Git LFS Details

  • SHA256: 3a63eefc044329f66b19e3ec6f289d41863949c769642f195ec340456ad0ced2
  • Pointer size: 131 Bytes
  • Size of remote file: 193 kB
analysis_charts/radar_code_model2vec_all_mpnet_base_v2.png ADDED

Git LFS Details

  • SHA256: 3acee93605e989a6cd8fa31cf3f4e3708df977fcf56ce08da40dd687096d9101
  • Pointer size: 131 Bytes
  • Size of remote file: 190 kB
analysis_charts/radar_code_model2vec_bge_m3.png ADDED

Git LFS Details

  • SHA256: 530395605197f1c3e69454eafd61a944ffddb16a203478dbc672fb013a4a4685
  • Pointer size: 131 Bytes
  • Size of remote file: 215 kB
analysis_charts/radar_code_model2vec_codebert_base.png ADDED

Git LFS Details

  • SHA256: b016f7e05ceec769707b54ba62d523d731996a2ecd66d4d119474b34bbd63e41
  • Pointer size: 131 Bytes
  • Size of remote file: 194 kB
analysis_charts/radar_code_model2vec_graphcodebert_base.png ADDED

Git LFS Details

  • SHA256: 41bbe7c6aee8133e6caa0195b124474767dfa678663925fd19d671db58503a39
  • Pointer size: 131 Bytes
  • Size of remote file: 198 kB
analysis_charts/radar_code_model2vec_gte_Qwen2_15B_instruct.png ADDED

Git LFS Details

  • SHA256: 5d1ce04352fad4653d0854c7045f4d02ae720c6f504313b7ffc9ab6b570fe3f6
  • Pointer size: 131 Bytes
  • Size of remote file: 206 kB
analysis_charts/radar_code_model2vec_gte_Qwen2_7B_instruct.png ADDED

Git LFS Details

  • SHA256: 4b8399fe5ad1247ac4a69d730510d1fd668ae774739b81a437f3ca27fadc1749
  • Pointer size: 131 Bytes
  • Size of remote file: 180 kB
analysis_charts/radar_code_model2vec_jina_embeddings_v2_base_code.png ADDED

Git LFS Details

  • SHA256: 3ea35308256248d44dd9b3a00fbee3bfcfb24bfd282ce2826d55d4dc46c62661
  • Pointer size: 131 Bytes
  • Size of remote file: 182 kB
analysis_charts/radar_code_model2vec_jina_embeddings_v3.png ADDED

Git LFS Details

  • SHA256: f8434a8cebda7e3fc1455b7b3225ca4af945508a72b42af0a54b3810dacd5c3a
  • Pointer size: 131 Bytes
  • Size of remote file: 215 kB
analysis_charts/radar_code_model2vec_nomic_embed_text_v2_moe.png ADDED

Git LFS Details

  • SHA256: 7a16b12430f75580d8003d1e888f6f2b9c8ee84e5f4aba492c2cce5f9594652c
  • Pointer size: 131 Bytes
  • Size of remote file: 205 kB
analysis_charts/radar_code_model2vec_paraphrase_MiniLM_L6_v2.png ADDED

Git LFS Details

  • SHA256: b0ea6532f6e426d474a5f7826c66b350928c230e41855fc3f2fcbdbcfd3b34c4
  • Pointer size: 131 Bytes
  • Size of remote file: 183 kB
distill.py DELETED
@@ -1,116 +0,0 @@
1
- #!/usr/bin/env python
2
- """
3
- Script to distill Alibaba-NLP/gte-Qwen2-7B-instruct using Model2Vec.
4
-
5
- This script performs the following operations:
6
- 1. Downloads the Alibaba-NLP/gte-Qwen2-7B-instruct model
7
- 2. Distills it using Model2Vec to create a smaller, faster static model
8
- 3. Saves the distilled model for further use
9
- """
10
-
11
- import logging
12
- import shutil
13
- import time
14
- from pathlib import Path
15
-
16
- from model2vec.distill import distill
17
-
18
- # =============================================================================
19
- # CONFIGURATION CONSTANTS
20
- # =============================================================================
21
-
22
- # Model Configuration
23
- MODEL_NAME = "Alibaba-NLP/gte-Qwen2-7B-instruct" # Model name or path for the source model
24
- OUTPUT_DIR = "." # Directory to save the distilled model (current directory)
25
- PCA_DIMS = 256 # Dimensions for PCA reduction (smaller = faster but less accurate)
26
-
27
- # Hub Configuration
28
- SAVE_TO_HUB = False # Whether to push the model to HuggingFace Hub
29
- HUB_MODEL_ID = None # Model ID for HuggingFace Hub (if saving to hub)
30
-
31
- # Generation Configuration
32
- SKIP_README = True # Skip generating the README file
33
-
34
- # =============================================================================
35
-
36
- # Configure logging
37
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
38
- logger = logging.getLogger(__name__)
39
-
40
-
41
- def main() -> None:
42
- """Run the distillation process for Alibaba-NLP/gte-Qwen2-7B-instruct."""
43
- # Create output directory if it doesn't exist
44
- output_dir = Path(OUTPUT_DIR)
45
- output_dir.mkdir(parents=True, exist_ok=True)
46
-
47
- logger.info(f"Starting distillation of {MODEL_NAME}")
48
- logger.info(f"Distilled model will be saved to {output_dir}")
49
- logger.info(f"Using PCA dimensions: {PCA_DIMS}")
50
- logger.info(f"Skipping README generation: {SKIP_README}")
51
-
52
- # Record start time for benchmarking
53
- start_time = time.time()
54
-
55
- # Run the distillation
56
- try:
57
- logger.info("Starting Model2Vec distillation...")
58
- m2v_model = distill(
59
- model_name=MODEL_NAME,
60
- pca_dims=PCA_DIMS,
61
- )
62
-
63
- distill_time = time.time() - start_time
64
- logger.info(f"Distillation completed in {distill_time:.2f} seconds")
65
-
66
- # Save the distilled model
67
- m2v_model.save_pretrained(OUTPUT_DIR)
68
- logger.info(f"Model saved to {OUTPUT_DIR}")
69
-
70
- # Remove README.md if it was created and we want to skip it
71
- if SKIP_README and (output_dir / "README.md").exists():
72
- (output_dir / "README.md").unlink()
73
- logger.info("Removed auto-generated README.md")
74
-
75
- # Get model size information
76
- model_size_mb = sum(
77
- f.stat().st_size for f in output_dir.glob("**/*") if f.is_file() and f.name != "README.md"
78
- ) / (1024 * 1024)
79
- logger.info(f"Distilled model size: {model_size_mb:.2f} MB")
80
-
81
- # Push to hub if requested
82
- if SAVE_TO_HUB:
83
- if HUB_MODEL_ID:
84
- logger.info(f"Pushing model to HuggingFace Hub as {HUB_MODEL_ID}")
85
-
86
- # Create a temporary README for Hub upload if needed
87
- readme_path = output_dir / "README.md"
88
- had_readme = readme_path.exists()
89
-
90
- if SKIP_README and had_readme:
91
- # Backup the README
92
- shutil.move(readme_path, output_dir / "README.md.bak")
93
-
94
- # Push to Hub
95
- m2v_model.push_to_hub(HUB_MODEL_ID)
96
-
97
- # Restore state
98
- if SKIP_README:
99
- if had_readme:
100
- # Restore the backup
101
- shutil.move(output_dir / "README.md.bak", readme_path)
102
- elif (output_dir / "README.md").exists():
103
- # Remove README created during push_to_hub
104
- (output_dir / "README.md").unlink()
105
- else:
106
- logger.error("HUB_MODEL_ID must be specified when SAVE_TO_HUB is True")
107
-
108
- logger.info("Distillation process completed successfully!")
109
-
110
- except Exception:
111
- logger.exception("Error during distillation")
112
- raise
113
-
114
-
115
- if __name__ == "__main__":
116
- main()
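
The deleted distill.py reduces to a single distill() call followed by save_pretrained(). A minimal sketch of that flow plus a load-back sanity check, with a placeholder output directory and test sentence; only APIs that already appear in the repository's scripts are used:

```python
# Minimal sketch of the distillation flow the deleted script performed,
# plus a load-back sanity check. "distilled_model/" and the test sentence
# are placeholders.
from model2vec import StaticModel
from model2vec.distill import distill

m2v = distill(model_name="Alibaba-NLP/gte-Qwen2-7B-instruct", pca_dims=256)
m2v.save_pretrained("distilled_model")

reloaded = StaticModel.from_pretrained("distilled_model")
print(reloaded.encode(["binary search tree"]).shape)  # expect (1, 256)
```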
evaluate.py DELETED
@@ -1,422 +0,0 @@
1
- #!/usr/bin/env python
2
- """
3
- Script to evaluate the performance of the distilled Qodo-Embed model.
4
-
5
- This script performs the following:
6
- 1. Loads both the original Qodo-Embed-1-1.5B model and the distilled version
7
- 2. Compares them on:
8
- - Embedding similarity
9
- - Inference speed
10
- - Memory usage
11
- 3. Outputs a comprehensive evaluation report
12
- """
13
-
14
- import argparse
15
- import gc
16
- import logging
17
- import os
18
- import time
19
- from pathlib import Path
20
- from typing import Any, cast
21
-
22
- import matplotlib.pyplot as plt
23
- import numpy as np
24
- import psutil # type: ignore [import]
25
- import torch
26
- from model2vec import StaticModel
27
- from sentence_transformers import SentenceTransformer
28
- from sklearn.metrics.pairwise import cosine_similarity # type: ignore [import]
29
-
30
- # For transformer models
31
- from transformers import AutoModel, AutoTokenizer
32
- from transformers.modeling_utils import PreTrainedModel
33
-
34
- # =============================================================================
35
- # CONFIGURATION CONSTANTS
36
- # =============================================================================
37
-
38
- # Model Configuration
39
- ORIGINAL_MODEL = "Alibaba-NLP/gte-Qwen2-7B-instruct" # Original model name or path
40
- DISTILLED_MODEL = "." # Path to the distilled model (current directory)
41
- OUTPUT_DIR = "evaluation" # Directory to save evaluation results
42
-
43
- # =============================================================================
44
-
45
- # Constants
46
- BYTES_PER_KB = 1024.0
47
- TEXT_TRUNCATE_LENGTH = 20
48
-
49
- # Configure logging
50
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
51
- logger = logging.getLogger(__name__)
52
-
53
- # Sample texts for evaluation
54
- SAMPLE_TEXTS = [
55
- "def process_data_stream(source_iterator):",
56
- "implement binary search tree",
57
- "how to handle memory efficient data streaming",
58
- """class LazyLoader:
59
- def __init__(self, source):
60
- self.generator = iter(source)
61
- self._cache = []""",
62
- """def dfs_traversal(root):
63
- if not root:
64
- return []
65
- visited = []
66
- stack = [root]
67
- while stack:
68
- node = stack.pop()
69
- visited.append(node.val)
70
- if node.right:
71
- stack.append(node.right)
72
- if node.left:
73
- stack.append(node.left)
74
- return visited""",
75
- ]
76
-
77
-
78
- def load_models(
79
- original_model_name: str, distilled_model_path: str
80
- ) -> tuple[tuple[SentenceTransformer | PreTrainedModel, str], StaticModel]:
81
- """Load both the original and distilled models."""
82
- logger.info(f"Loading original model: {original_model_name}")
83
-
84
- try:
85
- # Try to load as a sentence transformer first
86
- original_model = SentenceTransformer(original_model_name)
87
- model_type = "sentence_transformer"
88
- except (ValueError, OSError, ImportError) as e:
89
- # If that fails, try loading as a Hugging Face transformer
90
- logger.info(f"Failed to load as SentenceTransformer: {e}")
91
- AutoTokenizer.from_pretrained(original_model_name)
92
- original_model = AutoModel.from_pretrained(original_model_name)
93
- model_type = "huggingface"
94
-
95
- logger.info(f"Loading distilled model from: {distilled_model_path}")
96
- distilled_model = StaticModel.from_pretrained(distilled_model_path)
97
-
98
- return (original_model, model_type), distilled_model
99
-
100
-
101
- def measure_memory_usage(model: SentenceTransformer | PreTrainedModel | StaticModel) -> float:
102
- """Measure memory usage of a model in MB."""
103
- gc.collect()
104
- torch.cuda.empty_cache() if torch.cuda.is_available() else None
105
-
106
- process = psutil.Process(os.getpid())
107
- memory_before = process.memory_info().rss / (1024 * 1024) # MB
108
-
109
- # Force model to allocate memory if it hasn't already
110
- if isinstance(model, StaticModel | SentenceTransformer):
111
- _ = model.encode(["Test"])
112
- else:
113
- # For HF models, we need to handle differently
114
- pass
115
-
116
- gc.collect()
117
- torch.cuda.empty_cache() if torch.cuda.is_available() else None
118
-
119
- process = psutil.Process(os.getpid())
120
- memory_after = process.memory_info().rss / (1024 * 1024) # MB
121
-
122
- return memory_after - memory_before
123
-
124
-
125
- def compute_embeddings(
126
- original_model: SentenceTransformer | PreTrainedModel,
127
- original_model_type: str,
128
- distilled_model: StaticModel,
129
- texts: list[str],
130
- original_model_name: str = "unknown",
131
- ) -> tuple[np.ndarray, np.ndarray]:
132
- """Compute embeddings using both models."""
133
- # Original model embeddings
134
- if original_model_type == "sentence_transformer":
135
- # Type narrowing: we know it's a SentenceTransformer here
136
- sentence_model = cast("SentenceTransformer", original_model)
137
- original_embeddings = sentence_model.encode(texts)
138
- else:
139
- # Type narrowing: we know it's a PreTrainedModel here
140
- auto_model = original_model # AutoModel.from_pretrained returns a PreTrainedModel instance
141
-
142
- # For HF models, we need more custom code
143
- # Simple mean pooling function for HF models
144
- def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
145
- token_embeddings = model_output
146
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
147
- return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
148
- input_mask_expanded.sum(1), min=1e-9
149
- )
150
-
151
- # Get model name for tokenizer
152
- model_name = getattr(auto_model.config, "name_or_path", original_model_name)
153
- tokenizer = AutoTokenizer.from_pretrained(model_name)
154
- encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
155
-
156
- with torch.no_grad():
157
- model_output = auto_model(**encoded_input)
158
- original_embeddings = mean_pooling(model_output.last_hidden_state, encoded_input["attention_mask"]).numpy()
159
-
160
- # Distilled model embeddings
161
- distilled_embeddings = distilled_model.encode(texts)
162
-
163
- return original_embeddings, distilled_embeddings
164
-
165
-
166
- def measure_inference_speed(
167
- model: SentenceTransformer | PreTrainedModel | StaticModel, model_type: str, texts: list[str], n_runs: int = 5
168
- ) -> float:
169
- """Measure inference speed in texts/second."""
170
- # Warmup
171
- if model_type in {"sentence_transformer", "static_model"}:
172
- # Type narrowing: we know it has encode method here
173
- encode_model = cast("SentenceTransformer | StaticModel", model)
174
- _ = encode_model.encode(texts[:1])
175
- else:
176
- # Type narrowing: we know it's a PreTrainedModel here
177
- auto_model = cast("PreTrainedModel", model)
178
- # Warmup for HF models
179
- model_name = getattr(auto_model.config, "name_or_path", "unknown")
180
- tokenizer = AutoTokenizer.from_pretrained(model_name)
181
- encoded_input = tokenizer(texts[:1], padding=True, truncation=True, return_tensors="pt")
182
- with torch.no_grad():
183
- _ = auto_model(**encoded_input)
184
-
185
- # Measure speed
186
- start_time = time.time()
187
-
188
- if model_type in {"sentence_transformer", "static_model"}:
189
- # Type narrowing: we know it has encode method here
190
- encode_model = cast("SentenceTransformer | StaticModel", model)
191
- for _ in range(n_runs):
192
- _ = encode_model.encode(texts)
193
- else:
194
- # Type narrowing: we know it's a PreTrainedModel here
195
- auto_model = cast("PreTrainedModel", model)
196
- # For HF models
197
- model_name = getattr(auto_model.config, "name_or_path", "unknown")
198
- tokenizer = AutoTokenizer.from_pretrained(model_name)
199
- for _ in range(n_runs):
200
- encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
201
- with torch.no_grad():
202
- _ = auto_model(**encoded_input)
203
-
204
- total_time = time.time() - start_time
205
- return (len(texts) * n_runs) / total_time
206
-
207
-
208
- def compute_cosine_similarity(embeddings1: np.ndarray, embeddings2: np.ndarray) -> np.ndarray:
209
- """Compute cosine similarity between embeddings, handling different dimensions.
210
-
211
- For embeddings with different dimensions, we compute similarity by comparing
212
- how they rank the same texts (semantically equivalent).
213
- """
214
- # Ensure embeddings1 and embeddings2 are 2D arrays with shapes (n_samples, n_features)
215
- if embeddings1.ndim == 1:
216
- embeddings1 = embeddings1.reshape(1, -1)
217
- if embeddings2.ndim == 1:
218
- embeddings2 = embeddings2.reshape(1, -1)
219
-
220
- # Check and transpose if needed to ensure samples are in rows
221
- if embeddings2.shape[0] != len(SAMPLE_TEXTS) and embeddings2.shape[1] == len(SAMPLE_TEXTS):
222
- embeddings2 = embeddings2.T
223
-
224
- logger.info(f"Embeddings shapes: original={embeddings1.shape}, distilled={embeddings2.shape}")
225
-
226
- # If dimensions differ, we compute similarity matrix based on how each model ranks text pairs
227
- # This is a form of semantic similarity evaluation rather than direct vector comparison
228
- similarity_matrix = np.zeros((len(SAMPLE_TEXTS), len(SAMPLE_TEXTS)))
229
-
230
- # Compute similarity matrices within each embedding space
231
- sim1 = cosine_similarity(embeddings1)
232
- sim2 = cosine_similarity(embeddings2)
233
-
234
- # The similarity between samples i and j is the correlation between how they rank other samples
235
- for i in range(len(SAMPLE_TEXTS)):
236
- for j in range(len(SAMPLE_TEXTS)):
237
- # For diagonal elements (same sample), use a direct measure of how similar
238
- # the two models rank that sample against all others
239
- if i == j:
240
- # Pearson correlation between the rankings (excluding self-comparison)
241
- rankings1 = np.delete(sim1[i], i)
242
- rankings2 = np.delete(sim2[i], i)
243
- # Higher correlation means the models agree on the semantic similarity
244
- similarity_matrix[i, j] = np.corrcoef(rankings1, rankings2)[0, 1]
245
- else:
246
- # For off-diagonal elements, compare how similarly both models relate samples i and j
247
- similarity_matrix[i, j] = 1 - abs(sim1[i, j] - sim2[i, j])
248
-
249
- return similarity_matrix
250
-
251
-
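# Illustrative sketch (not part of the original script): the ranking-based comparison
# described in compute_cosine_similarity() above. Two models with different embedding
# dimensions are compared by how similarly each one ranks the same texts, using the
# within-space cosine-similarity rows. All names and numbers below are made up.
def _example_rank_agreement() -> None:
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    rng = np.random.default_rng(0)
    teacher = rng.normal(size=(5, 768))   # hypothetical teacher embeddings
    student = rng.normal(size=(5, 256))   # hypothetical student embeddings

    sim_t = cosine_similarity(teacher)    # 5x5 similarities within each space
    sim_s = cosine_similarity(student)

    # Agreement for text 0: correlate how each space ranks text 0 against the rest
    agreement = np.corrcoef(np.delete(sim_t[0], 0), np.delete(sim_s[0], 0))[0, 1]
    print(f"rank agreement for text 0: {agreement:.3f}")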
252
- def format_size(size_bytes: float) -> str:
253
- """Format size in bytes to human-readable format."""
254
- for unit in ["B", "KB", "MB", "GB"]:
255
- if size_bytes < BYTES_PER_KB:
256
- return f"{size_bytes:.2f} {unit}"
257
- size_bytes /= BYTES_PER_KB
258
- return f"{size_bytes:.2f} TB"
259
-
260
-
261
- def plot_comparison(results: dict[str, Any], output_dir: str) -> None:
262
- """Generate comparison plots and save them."""
263
- output_path = Path(output_dir)
264
- output_path.mkdir(exist_ok=True, parents=True)
265
-
266
- # Speed comparison
267
- plt.figure(figsize=(10, 6))
268
- models = ["Original", "Distilled"]
269
- speeds = [results["original_speed"], results["distilled_speed"]]
270
- plt.bar(models, speeds, color=["#1f77b4", "#ff7f0e"])
271
- plt.ylabel("Texts per second")
272
- plt.title("Inference Speed Comparison")
273
- plt.savefig(output_path / "speed_comparison.png", dpi=300, bbox_inches="tight")
274
-
275
- # Memory comparison
276
- plt.figure(figsize=(10, 6))
277
- memories = [results["original_memory"], results["distilled_memory"]]
278
- plt.bar(models, memories, color=["#1f77b4", "#ff7f0e"])
279
- plt.ylabel("Memory Usage (MB)")
280
- plt.title("Memory Usage Comparison")
281
- plt.savefig(output_path / "memory_comparison.png", dpi=300, bbox_inches="tight")
282
-
283
- # Size comparison
284
- plt.figure(figsize=(10, 6))
285
- sizes = [results["original_size"], results["distilled_size"]]
286
- plt.bar(models, sizes, color=["#1f77b4", "#ff7f0e"])
287
- plt.ylabel("Model Size (MB)")
288
- plt.title("Model Size Comparison")
289
- plt.savefig(output_path / "size_comparison.png", dpi=300, bbox_inches="tight")
290
-
291
- # Similarity matrix heatmap
292
- plt.figure(figsize=(8, 6))
293
- plt.imshow(results["similarity_matrix"], cmap="viridis", interpolation="nearest")
294
- plt.colorbar(label="Cosine Similarity")
295
- plt.title("Embedding Similarity Between Original and Distilled Models")
296
- plt.xticks([])
297
- plt.yticks(
298
- range(len(SAMPLE_TEXTS)),
299
- [t[:TEXT_TRUNCATE_LENGTH] + "..." if len(t) > TEXT_TRUNCATE_LENGTH else t for t in SAMPLE_TEXTS],
300
- )
301
- plt.savefig(output_path / "similarity_matrix.png", dpi=300, bbox_inches="tight")
302
-
303
-
304
- def evaluate_models(original_model_name: str, distilled_model_path: str, output_dir: str) -> dict[str, Any]:
305
- """Evaluate the original and distilled models."""
306
- # Load models
307
- (original_model, original_model_type), distilled_model = load_models(original_model_name, distilled_model_path)
308
-
309
- # Measure model sizes
310
- if isinstance(original_model, SentenceTransformer):
311
- # For SentenceTransformer, get parameters from all modules
312
- total_params = 0
313
- for module in original_model.modules():
314
- if hasattr(module, "parameters"):
315
- for param in module.parameters():
316
- total_params += param.numel()
317
- original_model_size = total_params * 4 / (1024 * 1024) # MB (assuming float32)
318
- else:
319
- # For PreTrainedModel
320
- auto_model = original_model # AutoModel.from_pretrained returns a PreTrainedModel instance
321
- original_model_size = sum(p.numel() * 4 for p in auto_model.parameters()) / (
322
- 1024 * 1024
323
- ) # MB (assuming float32)
324
-
325
- # Calculate distilled model size - only count actual model files
326
- model_files = ["model.safetensors", "config.json", "modules.json", "tokenizer.json"]
327
- distilled_model_size = 0.0
328
- for file_name in model_files:
329
- file_path = Path(distilled_model_path) / file_name
330
- if file_path.exists():
331
- distilled_model_size += file_path.stat().st_size
332
- distilled_model_size = distilled_model_size / (1024 * 1024) # Convert to MB
333
-
334
- # Measure memory usage
335
- original_memory = measure_memory_usage(original_model)
336
- distilled_memory = measure_memory_usage(distilled_model)
337
-
338
- # Compute embeddings
339
- original_embeddings, distilled_embeddings = compute_embeddings(
340
- original_model, original_model_type, distilled_model, SAMPLE_TEXTS, original_model_name
341
- )
342
-
343
- # Compute similarity between embeddings
344
- similarity_matrix = compute_cosine_similarity(original_embeddings, distilled_embeddings)
345
- similarity_diagonal = np.diag(similarity_matrix)
346
- avg_similarity = np.mean(similarity_diagonal)
347
-
348
- # Measure inference speed
349
- original_speed = measure_inference_speed(original_model, original_model_type, SAMPLE_TEXTS, n_runs=5)
350
- distilled_speed = measure_inference_speed(distilled_model, "static_model", SAMPLE_TEXTS, n_runs=5)
351
-
352
- # Collect results
353
- results = {
354
- "original_size": original_model_size,
355
- "distilled_size": distilled_model_size,
356
- "original_memory": original_memory,
357
- "distilled_memory": distilled_memory,
358
- "similarity_matrix": similarity_matrix,
359
- "avg_similarity": avg_similarity,
360
- "original_speed": original_speed,
361
- "distilled_speed": distilled_speed,
362
- "speed_improvement": distilled_speed / original_speed if original_speed > 0 else float("inf"),
363
- "size_reduction": original_model_size / distilled_model_size if distilled_model_size > 0 else float("inf"),
364
- "memory_reduction": original_memory / distilled_memory if distilled_memory > 0 else float("inf"),
365
- }
366
-
367
- # Generate plots
368
- plot_comparison(results, output_dir)
369
-
370
- # Print results
371
- separator = "=" * 50
372
- logger.info("\n%s", separator)
373
- logger.info("Model Evaluation Results")
374
- logger.info("%s", separator)
375
- logger.info(f"Original Model Size: {results['original_size']:.2f} MB")
376
- logger.info(f"Distilled Model Size: {results['distilled_size']:.2f} MB")
377
- logger.info(f"Size Reduction Factor: {results['size_reduction']:.2f}x")
378
- logger.info("\n")
379
- logger.info(f"Original Model Memory: {results['original_memory']:.2f} MB")
380
- logger.info(f"Distilled Model Memory: {results['distilled_memory']:.2f} MB")
381
- logger.info(f"Memory Reduction Factor: {results['memory_reduction']:.2f}x")
382
- logger.info("\n")
383
- logger.info(f"Original Model Speed: {results['original_speed']:.2f} texts/second")
384
- logger.info(f"Distilled Model Speed: {results['distilled_speed']:.2f} texts/second")
385
- logger.info(f"Speed Improvement Factor: {results['speed_improvement']:.2f}x")
386
- logger.info("\n")
387
- logger.info(f"Average Embedding Similarity: {results['avg_similarity']:.4f}")
388
- logger.info("%s", separator)
389
-
390
- return results
391
-
392
-
393
- def main() -> None:
394
- """Run the evaluation process."""
395
- parser = argparse.ArgumentParser(description="Evaluate the distilled model against the original")
396
- parser.add_argument("--original_model", default=ORIGINAL_MODEL, help="Original model name or path")
397
- parser.add_argument("--distilled_model", default=DISTILLED_MODEL, help="Path to the distilled model")
398
- parser.add_argument("--output_dir", default=OUTPUT_DIR, help="Directory to save evaluation results")
399
-
400
- args = parser.parse_args()
401
-
402
- # Validate configuration
403
- if not args.distilled_model:
404
- logger.error("Distilled model path must be provided")
405
- logger.error("Use --distilled_model to specify the path or set DISTILLED_MODEL constant")
406
- return
407
-
408
- # Create output directory
409
- output_dir = Path(args.output_dir)
410
- output_dir.mkdir(parents=True, exist_ok=True)
411
-
412
- # Run evaluation
413
- try:
414
- evaluate_models(args.original_model, args.distilled_model, args.output_dir)
415
- logger.info(f"Evaluation completed. Results saved to {args.output_dir}")
416
- except Exception:
417
- logger.exception("Error during evaluation")
418
- raise
419
-
420
-
421
- if __name__ == "__main__":
422
- main()
 
 
pyproject.toml CHANGED
@@ -1,30 +1,57 @@
 
 
 
 
1
  [project]
2
  name = "gte-qwen2-7b-instruct-m2v"
3
  version = "0.1.0"
4
- description = "Add your description here"
5
  readme = "README.md"
6
  requires-python = ">=3.12"
7
  dependencies = [
 
 
 
8
  "datasets>=3.6.0",
9
- "evaluation",
 
 
 
 
10
  "iso639>=0.1.4",
 
11
  "lightning>=2.5.1.post0",
12
  "matplotlib>=3.10.3",
13
  "model2vec[train]>=0.5.0",
14
  "mteb>=1.14.15",
15
  "numpy>=1.26.4",
 
16
  "psutil>=7.0.0",
 
 
17
  "scikit-learn>=1.6.1",
 
18
  "sentence-transformers>=4.1.0",
 
 
 
 
19
  "torch>=2.7.0",
 
20
  ]
21
 
 
 
 
22
  [dependency-groups]
23
  dev = [
24
  "mypy>=1.15.0",
25
  "ruff>=0.11.6",
26
  ]
27
 
 
 
 
28
  [tool.mypy]
29
  exclude = [
30
  ".git",
@@ -79,6 +106,14 @@ ignore = [
79
  "E101", # Indentation contains mixed spaces and tabs
80
  "W191", # indentation contains tabs
81
  "D206", # indent with spaces, not tabs
 
 
 
 
 
 
 
 
82
  ]
83
 
84
  [tool.ruff.lint.mccabe]
@@ -97,6 +132,3 @@ quote-style = "double"
97
  indent-style = "tab"
98
  skip-magic-trailing-comma = false
99
  line-ending = "auto"
100
-
101
- [tool.uv.sources]
102
- evaluation = { git = "https://github.com/MinishLab/evaluation.git", rev = "main" }
 
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
  [project]
6
  name = "gte-qwen2-7b-instruct-m2v"
7
  version = "0.1.0"
8
+ description = "Model2Vec distillation pipeline for gte-Qwen2-7B-instruct"
9
  readme = "README.md"
10
  requires-python = ">=3.12"
11
  dependencies = [
12
+ "accelerate>=1.7.0",
13
+ "beam-client>=0.2.155",
14
+ "boto3>=1.38.23",
15
  "datasets>=3.6.0",
16
+ "dotenv>=0.9.9",
17
+ "editables>=0.5",
18
+ "einops>=0.8.1",
19
+ "flash-attn>=2.7.4.post1",
20
+ "hatchling>=1.27.0",
21
  "iso639>=0.1.4",
22
+ "kaleido==1.0.0rc13",
23
  "lightning>=2.5.1.post0",
24
  "matplotlib>=3.10.3",
25
  "model2vec[train]>=0.5.0",
26
  "mteb>=1.14.15",
27
  "numpy>=1.26.4",
28
+ "plotly>=6.1.1",
29
  "psutil>=7.0.0",
30
+ "pydantic>=2.11.5",
31
+ "requests>=2.32.3",
32
  "scikit-learn>=1.6.1",
33
+ "seaborn>=0.13.2",
34
  "sentence-transformers>=4.1.0",
35
+ "setuptools>=80.8.0",
36
+ "smart-open[s3]>=7.1.0",
37
+ "statsmodels>=0.14.4",
38
+ "tokenlearn>=0.2.0",
39
  "torch>=2.7.0",
40
+ "typer>=0.16.0",
41
  ]
42
 
43
+ [project.scripts]
44
+ distiller = "distiller.__main__:app"
45
+
46
  [dependency-groups]
47
  dev = [
48
  "mypy>=1.15.0",
49
  "ruff>=0.11.6",
50
  ]
51
 
52
+ [tool.hatch.build.targets.wheel]
53
+ packages = ["src/distiller"]
54
+
55
  [tool.mypy]
56
  exclude = [
57
  ".git",
 
106
  "E101", # Indentation contains mixed spaces and tabs
107
  "W191", # indentation contains tabs
108
  "D206", # indent with spaces, not tabs
109
+ "PD901", # Avoid using the generic variable name `df` for DataFrames
110
+ "ANN401", # Dynamically typed expressions (typing.Any) are disallowed
111
+ "D103", # Missing docstring in public function
112
+ "BLE001", # Do not catch blind exception: `Exception`
113
+ "T201", # Use `logger.info` instead of `print`
114
+ "E501", # Line too long
115
+ "PLR2004",
116
+ "RUF001",
117
  ]
118
 
119
  [tool.ruff.lint.mccabe]
 
132
  indent-style = "tab"
133
  skip-magic-trailing-comma = false
134
  line-ending = "auto"
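The new `[project.scripts]` entry points the `distiller` command at a Typer app object. A minimal sketch of what `src/distiller/__main__.py` is assumed to expose follows; that module is not shown in this diff, so the command below is illustrative only:

import typer

app = typer.Typer(help="Model2Vec distillation pipeline")

@app.command()
def distill(use_beam: bool = False, train: bool = False) -> None:
    """Run distillation locally or on Beam (illustrative stub)."""
    typer.echo(f"use_beam={use_beam}, train={train}")

if __name__ == "__main__":
    app()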
 
 
 
src/distiller/distill.py CHANGED
@@ -20,23 +20,22 @@ Usage:
20
  distiller distill [--use-beam] [--train] # Basic distillation or with training
21
  """
22
 
 
23
  import json
24
  import logging
 
25
  import time
26
  from pathlib import Path
27
  from typing import Annotated, Any
28
 
29
- import numpy as np
30
  import torch
31
  import typer
32
- from beam import Volume, function
33
  from datasets import load_dataset
34
  from model2vec.distill import distill
35
- from model2vec.train.base import FinetunableStaticModel, TextDataset
36
  from sentence_transformers import SentenceTransformer
37
- from sklearn.model_selection import train_test_split
38
- from torch import nn, optim
39
 
 
40
  from .beam_utils import (
41
  BeamCheckpointManager,
42
  create_beam_utilities,
@@ -46,16 +45,17 @@ from .beam_utils import (
46
  upload_model_to_beam,
47
  )
48
  from .config import (
49
- BEAM_ENV_SETTINGS,
50
- GPU_NAME,
51
- IMAGE,
52
  codesearchnet_config,
53
  directories,
54
  distillation_config,
 
55
  get_volume_config,
56
  languages_config,
57
  )
58
 
 
 
 
59
  # =============================================================================
60
  # CONFIGURATION
61
  # =============================================================================
@@ -70,6 +70,75 @@ logger = logging.getLogger(__name__)
70
  # Teacher models for distillation
71
  DEFAULT_TEACHER_MODELS = list(distillation_config.code_teacher_models)
72
 
 
73
  # =============================================================================
74
  # UTILITY FUNCTIONS
75
  # =============================================================================
@@ -106,13 +175,11 @@ def get_current_config_hash(enable_training: bool) -> str:
106
  }
107
 
108
  if enable_training:
109
- config_params.update(
110
- {
111
- "training_epochs": distillation_config.training_epochs,
112
- "learning_rate": distillation_config.learning_rate,
113
- "max_samples": distillation_config.max_training_samples,
114
- }
115
  )
 
116
 
117
  config_str = str(sorted(config_params.items()))
118
  return hashlib.md5(config_str.encode()).hexdigest()[:12] # noqa: S324
@@ -345,13 +412,10 @@ def simple_distillation(
345
 
346
 
347
  def load_codesearchnet_dataset(
348
- max_samples: int | None = None,
349
  checkpoint_manager: BeamCheckpointManager | None = None,
350
  ) -> list[str]:
351
- """Load and format the CodeSearchNet dataset for training with balanced language distribution."""
352
- if max_samples is None:
353
- max_samples = int(distillation_config.max_training_samples)
354
-
355
  logger.info(f"Loading CodeSearchNet dataset from {codesearchnet_config.dataset_name}")
356
  logger.info(f"Limiting to {max_samples} samples for training efficiency")
357
  logger.info(f"Languages: {', '.join(languages_config.all)}")
@@ -542,7 +606,7 @@ def generate_teacher_embeddings(
542
  # Generate embeddings from scratch
543
  logger.info("Generating fresh teacher embeddings...")
544
 
545
- batch_size = int(distillation_config.teacher_model_config.get("batch_size", 16))
546
  embeddings_list = []
547
 
548
  for i in range(0, len(texts), batch_size):
@@ -614,146 +678,351 @@ def generate_teacher_embeddings(
614
  return teacher_embeddings
615
 
616
 
617
- def advanced_training(
 
 
618
  student_model: Any,
619
  teacher_model: SentenceTransformer,
620
- checkpoint_manager: BeamCheckpointManager | None = None,
621
  ) -> Any:
622
- """Perform advanced code specialization training."""
623
- logger.info("🎓 Starting advanced code specialization training...")
624
-
625
- # Load CodeSearchNet training data
626
- training_texts = load_codesearchnet_dataset(checkpoint_manager=checkpoint_manager)
627
-
628
- if not training_texts:
629
- logger.warning("No training data available, skipping advanced training")
630
- return student_model
631
-
632
- # Generate teacher embeddings
633
- teacher_embeddings = generate_teacher_embeddings(teacher_model, training_texts, checkpoint_manager)
634
-
635
- # Create trainable model
636
- student_embedding_dim = student_model.embedding.shape[1]
637
- teacher_embedding_dim = teacher_embeddings.shape[1]
638
-
639
- # Project teacher embeddings if needed
640
- if teacher_embedding_dim != student_embedding_dim:
641
- from sklearn.decomposition import PCA
642
-
643
- logger.info("Performing PCA projection for dimension matching...")
644
- pca = PCA(n_components=student_embedding_dim)
645
- teacher_embeddings_np = teacher_embeddings.cpu().numpy().astype(np.float64)
646
- teacher_embeddings_projected = pca.fit_transform(teacher_embeddings_np)
647
- teacher_embeddings = torch.tensor(teacher_embeddings_projected.astype(np.float32), dtype=torch.float32)
648
-
649
- # Create trainable model
650
- trainable_model = FinetunableStaticModel.from_static_model(
651
- model=student_model,
652
- out_dim=student_embedding_dim,
653
- )
654
- trainable_model = trainable_model.float()
655
-
656
- # Tokenize texts
657
- tokenized_texts = []
658
- for text in training_texts:
659
- tokens = trainable_model.tokenize([text])
660
- if tokens.shape[1] > 0:
661
- tokenized_texts.append(tokens[0].tolist())
662
-
663
- # Prepare training data
664
- targets = teacher_embeddings[: len(tokenized_texts)].to(torch.float32)
665
- train_texts, val_texts, train_targets, val_targets = train_test_split(
666
- tokenized_texts, targets, test_size=0.2, random_state=42
667
- )
668
-
669
- # Training setup
670
- train_dataset = TextDataset(train_texts, train_targets)
671
- val_dataset = TextDataset(val_texts, val_targets)
672
-
673
- optimizer = optim.Adam(trainable_model.parameters(), lr=float(distillation_config.learning_rate))
674
- mse_loss = nn.MSELoss()
675
-
676
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
677
- trainable_model = trainable_model.to(device)
678
-
679
- batch_size = int(distillation_config.batch_size)
680
- epochs = int(distillation_config.training_epochs)
681
-
682
- # Training loop
683
- for epoch in range(epochs):
684
- trainable_model.train()
685
 
686
  try:
687
- train_loader = train_dataset.to_dataloader(shuffle=True, batch_size=batch_size)
688
- epoch_loss = 0.0
689
- num_batches = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
690
 
691
- for _batch_idx, (tokens, targets_batch) in enumerate(train_loader):
692
- batch_tokens = tokens.to(device)
693
- batch_targets = targets_batch.to(device).to(torch.float32)
 
 
 
 
 
 
 
 
 
 
 
694
 
695
- optimizer.zero_grad()
696
- _, student_embeddings = trainable_model(batch_tokens)
697
- student_embeddings = student_embeddings.to(torch.float32)
 
 
698
 
699
- loss = mse_loss(student_embeddings, batch_targets)
700
- loss.backward()
701
- optimizer.step()
702
 
703
- epoch_loss += loss.item()
704
- num_batches += 1
 
705
 
706
- except torch.cuda.OutOfMemoryError:
707
- logger.warning(f"Training OOM with batch size {batch_size}, reducing...")
708
- batch_size = max(1, batch_size // 2)
709
- torch.cuda.empty_cache()
710
- continue
711
 
712
- avg_train_loss = epoch_loss / num_batches if num_batches > 0 else 0.0
 
713
 
714
- # Validation
715
- trainable_model.eval()
716
- val_loader = val_dataset.to_dataloader(shuffle=False, batch_size=batch_size)
717
- val_loss = 0.0
718
- val_batches = 0
 
 
 
 
 
 
 
719
 
720
- with torch.no_grad():
721
- for tokens, targets_batch in val_loader:
722
- batch_tokens = tokens.to(device)
723
- batch_targets = targets_batch.to(device).to(torch.float32)
724
 
725
- _, student_embeddings = trainable_model(batch_tokens)
726
- student_embeddings = student_embeddings.to(torch.float32)
 
 
 
 
727
 
728
- loss = mse_loss(student_embeddings, batch_targets)
729
- val_loss += loss.item()
730
- val_batches += 1
 
 
731
 
732
- avg_val_loss = val_loss / val_batches if val_batches > 0 else 0.0
733
 
734
- logger.info(f"Epoch {epoch + 1}/{epochs} - Train: {avg_train_loss:.6f}, Val: {avg_val_loss:.6f}")
 
 
 
 
735
 
736
- # Save checkpoint
737
- if checkpoint_manager:
738
- checkpoint_data = {
739
- "config_hash": get_current_config_hash(enable_training=True),
740
- "stage": "training",
741
- "step": epoch + 1,
742
- "timestamp": time.time(),
743
- "data": {
744
- "model_state": trainable_model.state_dict(),
745
- "optimizer_state": optimizer.state_dict(),
746
- "train_loss": avg_train_loss,
747
- "val_loss": avg_val_loss,
748
- },
749
- }
750
- checkpoint_manager.save_checkpoint("training", checkpoint_data, epoch + 1)
751
 
752
- # Convert back to static model
753
- refined_model = trainable_model.to_static_model()
754
- logger.info("✅ Advanced training completed")
 
 
755
 
756
- return refined_model
 
 
 
 
 
 
 
 
757
 
758
 
759
  def distill_single_teacher(
@@ -884,15 +1153,15 @@ def distill_single_teacher(
884
 
885
  # Step 3: Handle final model creation
886
  if enable_training and base_model is not None:
887
- # Perform advanced training
888
- logger.info(f"🎓 Starting advanced training for {teacher_name}")
889
 
890
  # Load teacher model for training
891
  device = "cuda" if torch.cuda.is_available() else "cpu"
892
- teacher_st_model = SentenceTransformer(teacher_model, device=device, trust_remote_code=True)
893
 
894
- # Perform advanced training
895
- final_model = advanced_training(base_model, teacher_st_model, checkpoint_mgr)
896
 
897
  # Save final model
898
  final_dir.mkdir(parents=True, exist_ok=True)
@@ -1031,14 +1300,7 @@ def run_local_distillation(
1031
  return results_summary
1032
 
1033
 
1034
- @function(
1035
- gpu=GPU_NAME,
1036
- volumes=[Volume(name=VOLUME_CONFIG.name, mount_path=VOLUME_CONFIG.mount_path)],
1037
- image=IMAGE,
1038
- secrets=["HF_ACCESS_TOKEN"],
1039
- env=BEAM_ENV_SETTINGS,
1040
- timeout=3600 * 12, # 12 hours
1041
- )
1042
  def _beam_distill_models(
1043
  teacher_models: list[str] | None = None,
1044
  enable_training: bool = False,
@@ -1194,7 +1456,8 @@ def main(
1194
  ) -> None:
1195
  """Unified distillation command with optional training."""
1196
  logger.info("🚀 Starting unified Model2Vec distillation workflow")
1197
- logger.info(f"🎓 Training mode: {'Advanced (CodeSearchNet fine-tuning)' if train else 'Basic distillation only'}")
 
1198
  logger.info(f"☁️ Execution: {'Beam' if use_beam else 'Local'}")
1199
 
1200
  # Use default models if none specified
@@ -1355,7 +1618,6 @@ def salesforce_model_distillation(
1355
 
1356
  try:
1357
  import torch
1358
- from sentence_transformers import SentenceTransformer
1359
  from transformers import AutoModel, AutoTokenizer
1360
 
1361
  # Enhanced custom model loading for Salesforce models
@@ -1395,9 +1657,8 @@ def salesforce_model_distillation(
1395
 
1396
  # Method 2: Try SentenceTransformer with specific settings
1397
  logger.info("🔄 Falling back to SentenceTransformer method...")
1398
- sentence_model = SentenceTransformer(
1399
  teacher_model,
1400
- trust_remote_code=True,
1401
  device="cpu", # Force CPU loading first
1402
  )
1403
 
@@ -1470,7 +1731,6 @@ def baai_bge_model_distillation(
1470
 
1471
  try:
1472
  import torch
1473
- from sentence_transformers import SentenceTransformer
1474
  from transformers import AutoModel, AutoTokenizer
1475
 
1476
  logger.info("🔧 Loading BAAI model with tokenizer workaround...")
@@ -1481,7 +1741,7 @@ def baai_bge_model_distillation(
1481
  # Method 1: Try SentenceTransformer first (often handles tokenizer issues better)
1482
  try:
1483
  logger.info("🔄 Attempting with SentenceTransformer wrapper...")
1484
- sentence_model = SentenceTransformer(teacher_model, trust_remote_code=True)
1485
 
1486
  # Extract components
1487
  model = sentence_model[0].auto_model
 
20
  distiller distill [--use-beam] [--train] # Basic distillation or with training
21
  """
22
 
23
+ import importlib.util
24
  import json
25
  import logging
26
+ import os
27
  import time
28
  from pathlib import Path
29
  from typing import Annotated, Any
30
 
 
31
  import torch
32
  import typer
33
+ from beam import function
34
  from datasets import load_dataset
35
  from model2vec.distill import distill
 
36
  from sentence_transformers import SentenceTransformer
 
 
37
 
38
+ # Try to import flash_attn to check if it's available
39
  from .beam_utils import (
40
  BeamCheckpointManager,
41
  create_beam_utilities,
 
45
  upload_model_to_beam,
46
  )
47
  from .config import (
 
 
 
48
  codesearchnet_config,
49
  directories,
50
  distillation_config,
51
+ get_distillation_function_kwargs,
52
  get_volume_config,
53
  languages_config,
54
  )
55
 
56
+ # Check if flash_attn is available and compatible
57
+ FLASH_ATTN_AVAILABLE = importlib.util.find_spec("flash_attn") is not None
58
+
59
  # =============================================================================
60
  # CONFIGURATION
61
  # =============================================================================
 
70
  # Teacher models for distillation
71
  DEFAULT_TEACHER_MODELS = list(distillation_config.code_teacher_models)
72
 
73
+ # =============================================================================
74
+ # FLASH ATTENTION UTILITIES
75
+ # =============================================================================
76
+
77
+
78
+ def configure_flash_attention() -> dict[str, Any]:
79
+ """Configure flash attention settings and return model kwargs."""
80
+ model_kwargs: dict[str, Any] = {}
81
+
82
+ if not FLASH_ATTN_AVAILABLE:
83
+ logger.info("⚠️ Flash attention not available - using standard attention")
84
+ return model_kwargs
85
+
86
+ # Set environment variables for flash attention
87
+ os.environ["FLASH_ATTENTION_FORCE_USE"] = "1"
88
+ # Disable torch compile for flash attention compatibility
89
+ os.environ["TORCH_COMPILE_DISABLE"] = "1"
90
+ # Enable flash attention in transformers
91
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
92
+
93
+ # Check if we're on a compatible GPU
94
+ try:
95
+ if torch.cuda.is_available():
96
+ device_capability = torch.cuda.get_device_capability()
97
+ # Flash attention requires compute capability >= 7.5 (Turing, Ampere, Ada, Hopper)
98
+ if device_capability[0] >= 7 and (device_capability[0] > 7 or device_capability[1] >= 5):
99
+ logger.info("✅ Flash attention enabled - compatible GPU detected")
100
+ model_kwargs.update(
101
+ {
102
+ "model_kwargs": {
103
+ "attn_implementation": "flash_attention_2",
104
+ "torch_dtype": torch.float16, # Flash attention works best with fp16
105
+ "use_flash_attention_2": True,
106
+ "_attn_implementation": "flash_attention_2", # Alternative key for some models
107
+ }
108
+ }
109
+ )
110
+ else:
111
+ logger.info(f"⚠️ GPU compute capability {device_capability} < 7.5 - flash attention disabled")
112
+ else:
113
+ logger.info("⚠️ No CUDA available - flash attention disabled")
114
+ except Exception as e:
115
+ logger.warning(f"⚠️ Failed to check GPU compatibility: {e} - flash attention disabled")
116
+
117
+ return model_kwargs
118
+
119
+
120
+ def load_model_with_flash_attention(model_path: str, device: str = "auto") -> SentenceTransformer:
121
+ """Load a SentenceTransformer model with flash attention if available."""
122
+ flash_kwargs = configure_flash_attention()
123
+
124
+ try:
125
+ # Try loading with flash attention first
126
+ if flash_kwargs and "model_kwargs" in flash_kwargs:
127
+ logger.info(f"🚀 Loading model with flash attention: {Path(model_path).name}")
128
+ model = SentenceTransformer(model_path, device=device, trust_remote_code=True, **flash_kwargs)
129
+ logger.info("✅ Model loaded successfully with flash attention")
130
+ return model
131
+ except Exception as e:
132
+ logger.warning(f"⚠️ Failed to load with flash attention: {e}")
133
+ logger.info("🔄 Falling back to standard attention")
134
+
135
+ # Fallback to standard loading
136
+ logger.info(f"📂 Loading model with standard attention: {Path(model_path).name}")
137
+ model = SentenceTransformer(model_path, device=device, trust_remote_code=True)
138
+ logger.info("✅ Model loaded successfully with standard attention")
139
+ return model
140
+
141
+
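# Usage sketch (not from the diff): how a teacher model can be loaded with the helper
# above. Flash attention is tried first, and the helper falls back to standard attention
# when flash-attn is missing or the GPU compute capability is below 7.5. The model name
# here is only an example.
def _example_load_teacher() -> None:
    teacher = load_model_with_flash_attention("BAAI/bge-base-en-v1.5", device="cpu")
    vectors = teacher.encode(["def binary_search(arr, target):"], convert_to_tensor=False)
    print(vectors.shape)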
142
  # =============================================================================
143
  # UTILITY FUNCTIONS
144
  # =============================================================================
 
175
  }
176
 
177
  if enable_training:
178
+ # Add a simple hash of tokenlearn parameters for config validation
179
+ tokenlearn_hash = hash(
180
+ f"{distillation_config.tokenlearn_dataset}_{distillation_config.tokenlearn_dataset_name}_{distillation_config.tokenlearn_text_key}"
 
 
 
181
  )
182
+ config_params["tokenlearn_hash"] = float(abs(tokenlearn_hash) % 1000000) # Convert to float for consistency
183
 
184
  config_str = str(sorted(config_params.items()))
185
  return hashlib.md5(config_str.encode()).hexdigest()[:12] # noqa: S324
 
412
 
413
 
414
  def load_codesearchnet_dataset(
415
+ max_samples: int = 50000,
416
  checkpoint_manager: BeamCheckpointManager | None = None,
417
  ) -> list[str]:
418
+ """Load and format the CodeSearchNet dataset for token frequency computation."""
 
 
 
419
  logger.info(f"Loading CodeSearchNet dataset from {codesearchnet_config.dataset_name}")
420
  logger.info(f"Limiting to {max_samples} samples for training efficiency")
421
  logger.info(f"Languages: {', '.join(languages_config.all)}")
 
606
  # Generate embeddings from scratch
607
  logger.info("Generating fresh teacher embeddings...")
608
 
609
+ batch_size = 16 # Fixed batch size for teacher embedding generation
610
  embeddings_list = []
611
 
612
  for i in range(0, len(texts), batch_size):
 
678
  return teacher_embeddings
679
 
680
 
681
+ def compute_token_frequencies_for_sif(
682
+ teacher_model: SentenceTransformer,
683
+ features_dir: Path,
684
+ ) -> None:
685
+ """
686
+ Compute token frequencies from the training corpus for SIF weighting.
687
+
688
+ This follows the POTION approach for post-training re-regularization.
689
+ """
690
+ import json
691
+ from collections import Counter
692
+
693
+ logger.info("📊 Computing token frequencies for SIF weighting...")
694
+
695
+ try:
696
+ # Load CodeSearchNet dataset to compute frequencies (limited sample for efficiency)
697
+ dataset_texts = load_codesearchnet_dataset(max_samples=10000)
698
+
699
+ logger.info(f"📊 Computing frequencies on {len(dataset_texts)} texts...")
700
+
701
+ # Tokenize all texts and count token frequencies
702
+ tokenizer = teacher_model.tokenizer
703
+ token_counts: Counter[int] = Counter()
704
+
705
+ # Process in batches to avoid memory issues
706
+ batch_size = 100
707
+ for i in range(0, len(dataset_texts), batch_size):
708
+ batch_texts = dataset_texts[i : i + batch_size]
709
+
710
+ for text in batch_texts:
711
+ # Tokenize the text
712
+ tokens = tokenizer.encode(text, add_special_tokens=False)
713
+ token_counts.update(tokens)
714
+
715
+ if i % (batch_size * 10) == 0:
716
+ logger.info(f" Processed {i + len(batch_texts)}/{len(dataset_texts)} texts...")
717
+
718
+ # Convert to frequencies (token_id -> count)
719
+ token_frequencies = dict(token_counts)
720
+
721
+ # Save token frequencies to features directory for post-training regularization
722
+ freq_file = features_dir / "token_frequencies.json"
723
+ with freq_file.open("w") as f:
724
+ json.dump(token_frequencies, f, indent=2)
725
+
726
+ logger.info(f"✅ Token frequencies saved to {freq_file}")
727
+ logger.info(f"📊 Total unique tokens: {len(token_frequencies)}")
728
+ logger.info(f"📊 Total token occurrences: {sum(token_frequencies.values())}")
729
+
730
+ except Exception as e:
731
+ logger.warning(f"⚠️ Failed to compute token frequencies: {e}")
732
+ logger.warning("⚠️ Post-training re-regularization will use default Zipf weighting")
733
+
734
+
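# Small illustration (not part of the original code): json stores the Counter's integer
# token ids as string keys, which is why apply_post_training_regularization() below looks
# tokens up with str(i) after reloading token_frequencies.json.
def _example_frequency_roundtrip() -> None:
    import json
    from collections import Counter

    counts = Counter({101: 7, 2005: 3})               # int token ids
    restored = json.loads(json.dumps(dict(counts)))
    print(restored)                                    # {'101': 7, '2005': 3} - keys are now str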
735
+ def apply_post_training_regularization(
736
+ model: Any,
737
+ features_dir: Path,
738
+ pca_dims: int = 256,
739
+ ) -> Any:
740
+ """
741
+ Apply post-training re-regularization following the POTION approach.
742
+
743
+ This includes:
744
+ 1. Token frequency weighting using corpus frequencies
745
+ 2. PCA application
746
+ 3. SIF weighting using formula: w = 1e-3 / (1e-3 + proba)
747
+ """
748
+ import json
749
+
750
+ import numpy as np
751
+ from sklearn.decomposition import PCA
752
+
753
+ logger.info("🔧 Starting post-training re-regularization (POTION Step 4)")
754
+
755
+ # Step 4a: Load token frequencies from the training corpus
756
+ logger.info("📊 Computing token frequencies from training corpus...")
757
+
758
+ # Try to load token frequencies from features directory
759
+ freq_file = features_dir / "token_frequencies.json"
760
+
761
+ if freq_file.exists():
762
+ with freq_file.open("r") as f:
763
+ token_frequencies = json.load(f)
764
+ logger.info(f"✅ Loaded token frequencies from {freq_file}")
765
+ else:
766
+ logger.warning("⚠️ Token frequencies not found - using default Zipf weighting")
767
+ # Fallback to basic frequency estimation based on rank
768
+ vocab_size = model.embedding.shape[0]
769
+ token_frequencies = {str(i): 1.0 / (i + 1) for i in range(vocab_size)}
770
+
771
+ # Step 4b: Apply PCA to the embeddings
772
+ logger.info(f"🔄 Applying PCA with {pca_dims} dimensions...")
773
+
774
+ # Get current embeddings
775
+ embeddings = model.embedding.cpu().numpy().astype(np.float64)
776
+ original_shape = embeddings.shape
777
+ logger.info(f"Original embedding shape: {original_shape}")
778
+
779
+ # Apply PCA if dimensions don't match
780
+ if original_shape[1] != pca_dims:
781
+ pca = PCA(n_components=pca_dims, random_state=42)
782
+ embeddings_pca = pca.fit_transform(embeddings)
783
+ logger.info(f"PCA applied: {original_shape} → {embeddings_pca.shape}")
784
+
785
+ # Explained variance ratio
786
+ explained_var = pca.explained_variance_ratio_.sum()
787
+ logger.info(f"PCA explained variance ratio: {explained_var:.4f}")
788
+ else:
789
+ embeddings_pca = embeddings
790
+ logger.info("PCA dimensions match - no PCA transformation needed")
791
+
792
+ # Step 4c: Apply SIF weighting using corpus frequencies
793
+ logger.info("⚖️ Applying SIF weighting based on token frequencies...")
794
+
795
+ # Convert token frequencies to probabilities
796
+ total_tokens = sum(token_frequencies.values())
797
+ token_probs = {token: freq / total_tokens for token, freq in token_frequencies.items()}
798
+
799
+ # Apply SIF weighting: w = 1e-3 / (1e-3 + proba)
800
+ sif_coefficient = 1e-3 # Standard SIF coefficient
801
+
802
+ for i in range(embeddings_pca.shape[0]):
803
+ token_id = str(i)
804
+ prob = token_probs[token_id] if token_id in token_probs else 1.0 / len(token_probs)
805
+
806
+ # Apply SIF weighting formula
807
+ sif_weight = sif_coefficient / (sif_coefficient + prob)
808
+ embeddings_pca[i] *= sif_weight
809
+
810
+ logger.info("✅ SIF weighting applied successfully")
811
+
812
+ # Step 4d: Create new model with re-regularized embeddings
813
+ logger.info("📦 Creating final model with re-regularized embeddings...")
814
+
815
+ # Convert back to float32 numpy array
816
+ final_embeddings = embeddings_pca.astype(np.float32)
817
+
818
+ # Create new model with updated embeddings
819
+ from model2vec.model import StaticModel
820
+
821
+ # Save tokenizer and config from original model
822
+ tokenizer = model.tokenizer
823
+ config = model.config
824
+
825
+ # Create new model with re-regularized embeddings
826
+ final_model = StaticModel(vectors=final_embeddings, tokenizer=tokenizer, config=config)
827
+
828
+ logger.info("✅ Post-training re-regularization completed successfully")
829
+ logger.info(f"Final model embedding shape: {final_model.embedding.shape}")
830
+
831
+ return final_model
832
+
833
+
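# Worked example (illustrative only) of the SIF formula applied above,
# w = 1e-3 / (1e-3 + p): very frequent tokens are damped strongly while rare tokens
# keep weights close to 1. The probabilities below are made up.
def _example_sif_weights() -> None:
    sif_coefficient = 1e-3
    for token, prob in [("def", 0.02), ("return", 0.01), ("quicksort", 1e-6)]:
        weight = sif_coefficient / (sif_coefficient + prob)
        print(f"{token:10s} p={prob:.6f} -> w={weight:.4f}")
    # def        p=0.020000 -> w=0.0476
    # return     p=0.010000 -> w=0.0909
    # quicksort  p=0.000001 -> w=0.9990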
834
+ def tokenlearn_training(
835
  student_model: Any,
836
  teacher_model: SentenceTransformer,
837
+ checkpoint_manager: BeamCheckpointManager | None = None, # noqa: ARG001
838
  ) -> Any:
839
+ """
840
+ Perform tokenlearn training following the official POTION approach.
841
+
842
+ This follows the 4-step process:
843
+ 1. Model2Vec distillation (already done - student_model)
844
+ 2. Sentence transformer inference (create features)
845
+ 3. Tokenlearn training
846
+ 4. Post-training re-regularization (PCA + SIF weighting)
847
+ """
848
+ import subprocess
849
+ import tempfile
850
+ from pathlib import Path
851
+
852
+ logger.info("🧪 Starting tokenlearn training (POTION approach)...")
853
+
854
+ # Create temporary directories for tokenlearn workflow
855
+ with tempfile.TemporaryDirectory() as temp_dir:
856
+ temp_path = Path(temp_dir)
857
+ features_dir = temp_path / "features"
858
+ model_dir = temp_path / "base_model"
859
+ trained_dir = temp_path / "trained_model"
860
+
861
+ features_dir.mkdir(exist_ok=True)
862
+ model_dir.mkdir(exist_ok=True)
863
+ trained_dir.mkdir(exist_ok=True)
864
+
865
+ # Save the base distilled model for tokenlearn
866
+ student_model.save_pretrained(str(model_dir))
867
+ logger.info(f"💾 Saved base model to {model_dir}")
868
+
869
+ # Step 2: Create features using tokenlearn-featurize
870
+ logger.info("🔍 Step 2: Creating features using sentence transformer...")
871
+
872
+ # Get teacher model name/path for tokenlearn
873
+ teacher_model_name = getattr(teacher_model, "model_name", None)
874
+ if not teacher_model_name and hasattr(teacher_model, "_modules") and len(teacher_model._modules) > 0: # noqa: SLF001
875
+ # Try to extract from the first module if it's a SentenceTransformer
876
+ # _modules is a dict-like container, get the first module by iterating
877
+ first_module = next(iter(teacher_model._modules.values())) # noqa: SLF001
878
+ if hasattr(first_module, "auto_model") and hasattr(first_module.auto_model, "name_or_path"):
879
+ teacher_model_name = first_module.auto_model.name_or_path
880
+
881
+ if not teacher_model_name:
882
+ logger.warning("⚠️ Could not determine teacher model name, using fallback")
883
+ teacher_model_name = "BAAI/bge-base-en-v1.5" # Fallback to a common model
884
+
885
+ logger.info(f"📊 Using teacher model: {teacher_model_name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
886
 
887
  try:
888
+ # Use configured dataset for code specialization
889
+ featurize_cmd = [
890
+ "python",
891
+ "-m",
892
+ "tokenlearn.featurize",
893
+ "--model-name",
894
+ str(teacher_model_name),
895
+ "--output-dir",
896
+ str(features_dir),
897
+ "--dataset-path",
898
+ str(distillation_config.tokenlearn_dataset),
899
+ "--dataset-name",
900
+ str(distillation_config.tokenlearn_dataset_name),
901
+ "--dataset-split",
902
+ "train",
903
+ "--key",
904
+ str(distillation_config.tokenlearn_text_key), # Use configured text field
905
+ "--batch-size",
906
+ "1024", # Optimized batch size for A100-40G
907
+ ]
908
 
909
+ logger.info("🔄 Running tokenlearn featurization...")
910
+ logger.info(
911
+ f"📊 Dataset: {distillation_config.tokenlearn_dataset} (config: {distillation_config.tokenlearn_dataset_name})"
912
+ )
913
+ logger.info(f"📝 Text field: {distillation_config.tokenlearn_text_key}")
914
+ logger.info(f"Command: {' '.join(featurize_cmd)}")
915
+ print(f"\n🔄 Executing: {' '.join(featurize_cmd)}\n")
916
+
917
+ result = subprocess.run( # noqa: S603
918
+ featurize_cmd,
919
+ text=True,
920
+ timeout=distillation_config.tokenlearn_timeout_featurize,
921
+ check=False,
922
+ )
923
 
924
+ if result.returncode != 0:
925
+ logger.error(f"❌ Featurization failed with return code: {result.returncode}")
926
+ logger.error("💥 Tokenlearn featurization is required for training - cannot proceed")
927
+ msg = f"Tokenlearn featurization failed with return code: {result.returncode}"
928
+ raise RuntimeError(msg)
929
 
930
+ logger.info("✅ Featurization completed successfully")
 
 
931
 
932
+ # Generate token frequencies for post-training re-regularization
933
+ logger.info("📊 Computing token frequencies for SIF weighting...")
934
+ compute_token_frequencies_for_sif(teacher_model, features_dir)
935
 
936
+ except Exception as e:
937
+ logger.exception("💥 Tokenlearn featurization failed")
938
+ logger.exception("💥 Tokenlearn featurization is required for training - cannot proceed")
939
+ msg = f"Tokenlearn featurization failed: {e}"
940
+ raise RuntimeError(msg) from e
941
 
942
+ # Step 3: Train using tokenlearn-train
943
+ logger.info("🎓 Step 3: Training using tokenlearn...")
944
 
945
+ try:
946
+ train_cmd = [
947
+ "python",
948
+ "-m",
949
+ "tokenlearn.train",
950
+ "--model-name",
951
+ str(teacher_model_name),
952
+ "--data-path",
953
+ str(features_dir),
954
+ "--save-path",
955
+ str(trained_dir),
956
+ ]
957
 
958
+ logger.info("🔄 Running tokenlearn training...")
959
+ logger.info(f"Command: {' '.join(train_cmd)}")
960
+ print(f"\n🎓 Executing: {' '.join(train_cmd)}\n")
 
961
 
962
+ result = subprocess.run( # noqa: S603
963
+ train_cmd,
964
+ text=True,
965
+ timeout=distillation_config.tokenlearn_timeout_train,
966
+ check=False,
967
+ )
968
 
969
+ if result.returncode != 0:
970
+ logger.error(f"❌ Tokenlearn training failed with return code: {result.returncode}")
971
+ logger.error("💥 Tokenlearn training is required - cannot proceed")
972
+ msg = f"Tokenlearn training failed with return code: {result.returncode}"
973
+ raise RuntimeError(msg)
974
 
975
+ logger.info("✅ Tokenlearn training completed successfully")
976
 
977
+ except Exception as e:
978
+ logger.exception("💥 Tokenlearn training failed")
979
+ logger.exception("💥 Tokenlearn training is required - cannot proceed")
980
+ msg = f"Tokenlearn training failed: {e}"
981
+ raise RuntimeError(msg) from e
982
 
983
+ # Step 4: Load the trained model and apply post-training re-regularization
984
+ logger.info("📦 Step 4: Loading trained model and applying post-training re-regularization...")
 
 
 
 
 
 
 
 
 
 
 
 
 
985
 
986
+ try:
987
+ from model2vec.model import StaticModel
988
+
989
+ # Load the trained model from tokenlearn
990
+ trained_model_path = trained_dir / "model"
991
+ if not trained_model_path.exists():
992
+ # Try alternative paths
993
+ possible_paths = [
994
+ trained_dir / "model_weighted",
995
+ trained_dir,
996
+ ]
997
+
998
+ for path in possible_paths:
999
+ if path.exists() and any(path.glob("*.json")):
1000
+ trained_model_path = path
1001
+ break
1002
+ else:
1003
+ logger.error(f"❌ Could not find trained model in {trained_dir}")
1004
+ msg = f"Tokenlearn training failed - no model found in {trained_dir}"
1005
+ raise RuntimeError(msg)
1006
+
1007
+ # Load the model before re-regularization
1008
+ logger.info("🔄 Loading model from tokenlearn training...")
1009
+ trained_model = StaticModel.from_pretrained(str(trained_model_path))
1010
+
1011
+ # Apply post-training re-regularization (POTION Step 4)
1012
+ logger.info("🔧 Applying post-training re-regularization (PCA + SIF weighting)...")
1013
+ final_model = apply_post_training_regularization(
1014
+ trained_model, features_dir, pca_dims=distillation_config.optimal_pca_dims
1015
+ )
1016
 
1017
+ logger.info("✅ Tokenlearn training pipeline with post-training re-regularization completed successfully")
1018
+
1019
+ return final_model
1020
+
1021
+ except Exception as e:
1022
+ logger.exception("💥 Failed to load tokenlearn trained model")
1023
+ logger.exception("💥 Cannot load trained model - training failed")
1024
+ msg = f"Failed to load tokenlearn trained model: {e}"
1025
+ raise RuntimeError(msg) from e
1026
 
1027
 
1028
  def distill_single_teacher(
 
1153
 
1154
  # Step 3: Handle final model creation
1155
  if enable_training and base_model is not None:
1156
+ # Perform tokenlearn training (POTION approach)
1157
+ logger.info(f"🧪 Starting tokenlearn training for {teacher_name}")
1158
 
1159
  # Load teacher model for training
1160
  device = "cuda" if torch.cuda.is_available() else "cpu"
1161
+ teacher_st_model = load_model_with_flash_attention(teacher_model, device)
1162
 
1163
+ # Perform tokenlearn training (POTION approach)
1164
+ final_model = tokenlearn_training(base_model, teacher_st_model, checkpoint_mgr)
1165
 
1166
  # Save final model
1167
  final_dir.mkdir(parents=True, exist_ok=True)
 
1300
  return results_summary
1301
 
1302
 
1303
+ @function(**get_distillation_function_kwargs())
 
 
 
 
 
 
 
1304
  def _beam_distill_models(
1305
  teacher_models: list[str] | None = None,
1306
  enable_training: bool = False,
 
1456
  ) -> None:
1457
  """Unified distillation command with optional training."""
1458
  logger.info("🚀 Starting unified Model2Vec distillation workflow")
1459
+
1460
+ logger.info(f"🎓 Training mode: {'Tokenlearn (POTION) training' if train else 'Basic distillation only'}")
1461
  logger.info(f"☁️ Execution: {'Beam' if use_beam else 'Local'}")
1462
 
1463
  # Use default models if none specified
 
1618
 
1619
  try:
1620
  import torch
 
1621
  from transformers import AutoModel, AutoTokenizer
1622
 
1623
  # Enhanced custom model loading for Salesforce models
 
1657
 
1658
  # Method 2: Try SentenceTransformer with specific settings
1659
  logger.info("🔄 Falling back to SentenceTransformer method...")
1660
+ sentence_model = load_model_with_flash_attention(
1661
  teacher_model,
 
1662
  device="cpu", # Force CPU loading first
1663
  )
1664
 
 
1731
 
1732
  try:
1733
  import torch
 
1734
  from transformers import AutoModel, AutoTokenizer
1735
 
1736
  logger.info("🔧 Loading BAAI model with tokenizer workaround...")
 
1741
  # Method 1: Try SentenceTransformer first (often handles tokenizer issues better)
1742
  try:
1743
  logger.info("🔄 Attempting with SentenceTransformer wrapper...")
1744
+ sentence_model = load_model_with_flash_attention(teacher_model)
1745
 
1746
  # Extract components
1747
  model = sentence_model[0].auto_model
src/distiller/evaluate.py CHANGED
@@ -17,8 +17,12 @@ Usage:
17
  distiller evaluate [--use-beam] [--skip-benchmark] # Run evaluation locally or on Beam
18
  """
19
 
 
 
 
20
  import json
21
  import logging
 
22
  import time
23
  import traceback
24
  from pathlib import Path
@@ -29,7 +33,7 @@ import pandas as pd
29
  import psutil
30
  import torch
31
  import typer
32
- from beam import Volume, function
33
  from datasets import Dataset, load_dataset
34
  from sentence_transformers import SentenceTransformer
35
  from sklearn.metrics.pairwise import cosine_similarity
@@ -37,24 +41,25 @@ from tqdm import tqdm
37
 
38
  from .beam_utils import download_specific_evaluation_file
39
  from .config import (
40
- BEAM_ENV_SETTINGS,
41
  DEFAULT_EVALUATION_MODELS,
42
- GPU_NAME,
43
- IMAGE,
44
  codesearchnet_config,
45
  directories,
 
46
  get_safe_model_name,
47
  get_volume_config,
48
  languages_config,
49
  )
50
 
 
 
 
51
  logger = logging.getLogger(__name__)
52
 
53
  # =============================================================================
54
  # EVALUATION CONFIGURATION
55
  # =============================================================================
56
 
57
- BATCH_SIZE = 32
58
  LOCAL_EVALUATION_DIR = directories.evaluation_results
59
  LOCAL_BENCHMARK_DIR = directories.benchmark_results
60
  LOCAL_MODELS_DIR = directories.final
@@ -92,6 +97,200 @@ def complex_algorithm(data, config):
92
  }
93
 
94
 
 
 
95
  class PerformanceBenchmark:
96
  """Comprehensive performance benchmarking for embedding models."""
97
 
@@ -109,7 +308,7 @@ class PerformanceBenchmark:
109
  start_time = time.time()
110
 
111
  try:
112
- self.model = SentenceTransformer(self.model_path, device=self.device, trust_remote_code=True)
113
  load_time = time.time() - start_time
114
 
115
  logger.info(f"✅ Model loaded in {load_time:.2f}s on {self.device}")
@@ -321,7 +520,7 @@ class PerformanceBenchmark:
321
  logger.info(f" 📊 Testing on {device.upper()}")
322
 
323
  try:
324
- model = SentenceTransformer(self.model_path, device=device, trust_remote_code=True)
325
 
326
  # Warmup
327
  _ = model.encode(test_texts[:4], convert_to_tensor=False)
@@ -401,36 +600,71 @@ class CodeSearchNetEvaluator:
401
  """Load the embedding model."""
402
  logger.info(f"Loading model from {self.model_path}")
403
  try:
404
- self.model = SentenceTransformer(self.model_path, trust_remote_code=True)
405
  logger.info(f"Successfully loaded model: {self.model_name}")
406
  except Exception:
407
  logger.exception(f"Failed to load model from {self.model_path}")
408
  raise
409
 
410
  def encode_texts(self, texts: list[str], desc: str = "Encoding") -> np.ndarray:
411
- """Encode texts into embeddings with batching."""
412
  if self.model is None:
413
  msg = "Model not loaded"
414
  raise RuntimeError(msg)
415
 
416
  embeddings = []
417
- for i in tqdm(range(0, len(texts), BATCH_SIZE), desc=desc):
418
- batch = texts[i : i + BATCH_SIZE]
419
- batch_embeddings = self.model.encode(batch, convert_to_tensor=False, normalize_embeddings=True)
420
- embeddings.append(batch_embeddings)
 
 
 
 
 
 
 
421
 
422
  return np.vstack(embeddings)
423
 
424
- def evaluate_language(self, language: str, max_queries: int = 1000) -> dict[str, Any]:
425
  """Evaluate on a specific programming language."""
426
  logger.info(f"Evaluating on {language} language (max {max_queries} queries)")
427
 
428
  try:
429
- # Load test split for the language
 
430
  dataset = load_dataset(
431
  codesearchnet_config.dataset_name,
432
  language,
433
- split="test",
434
  trust_remote_code=True,
435
  )
436
 
@@ -438,17 +672,17 @@ class CodeSearchNetEvaluator:
438
  logger.error(f"Unexpected dataset type for {language}: {type(dataset)}")
439
  return {}
440
 
441
- # Sample queries for evaluation
442
- if len(dataset) > max_queries:
443
- rng = np.random.default_rng(42)
444
- indices = rng.choice(len(dataset), max_queries, replace=False)
445
- dataset = dataset.select(indices)
446
 
447
- queries = []
448
- codes = []
449
- query_ids = []
450
 
 
451
  for i, example in enumerate(dataset):
 
 
 
452
  doc_string = example.get("func_documentation_string", "").strip()
453
  code_string = example.get("func_code_string", "").strip()
454
 
@@ -461,8 +695,23 @@ class CodeSearchNetEvaluator:
461
  logger.warning(f"No valid query-code pairs found for {language}")
462
  return {}
463
 
 
 
 
 
 
 
464
  logger.info(f"Found {len(queries)} valid query-code pairs for {language}")
465
 
 
 
 
466
  # Encode queries and codes
467
  start_time = time.time()
468
  query_embeddings = self.encode_texts(queries, f"Encoding {language} queries")
@@ -548,7 +797,7 @@ class CodeSearchNetEvaluator:
548
  return 0.0
549
 
550
  def evaluate_all_languages(
551
- self, max_queries_per_lang: int = 1000, languages: list[str] | None = None
552
  ) -> dict[str, Any]:
553
  """Evaluate on all specified languages."""
554
  eval_languages = languages or languages_config.all
@@ -613,7 +862,7 @@ class ComprehensiveModelEvaluator:
613
 
614
  def run_comprehensive_evaluation(
615
  self,
616
- max_queries_per_lang: int = 1000,
617
  languages: list[str] | None = None,
618
  skip_benchmark: bool = False,
619
  ) -> dict[str, Any]:
@@ -901,7 +1150,7 @@ def create_comparison_report(all_results: list[dict[str, Any]], output_dir: str
901
 
902
  def run_evaluation(
903
  models: list[str],
904
- max_queries: int = 1000,
905
  languages: list[str] | None = None,
906
  use_beam: bool = False,
907
  skip_benchmark: bool = False,
@@ -962,7 +1211,7 @@ def run_evaluation(
962
 
963
  def _run_local_evaluation(
964
  models: list[str],
965
- max_queries: int = 1000,
966
  languages: list[str] | None = None,
967
  skip_benchmark: bool = False,
968
  ) -> list[dict[str, Any]]:
@@ -993,30 +1242,52 @@ def _run_local_evaluation(
993
  return results
994
 
995
 
996
- @function(
997
- gpu=GPU_NAME,
998
- volumes=[Volume(name=VOLUME_CONFIG.name, mount_path=VOLUME_CONFIG.mount_path)],
999
- image=IMAGE,
1000
- secrets=["HF_ACCESS_TOKEN"],
1001
- env=BEAM_ENV_SETTINGS,
1002
- timeout=3600 * 8, # 8 hours for comprehensive evaluation
1003
- )
1004
  def _beam_evaluate_single_model(
1005
  model_path: str,
1006
- max_queries: int = 1000,
1007
  languages: list[str] | None = None,
1008
  skip_benchmark: bool = False,
1009
  ) -> dict[str, Any]:
1010
  """Beam function to comprehensively evaluate a single model."""
1011
- model_name = Path(model_path).name
 
 
 
 
1012
 
 
1013
  logger.info(f"🚀 Beam comprehensive evaluation starting for {model_name}")
1014
 
 
 
 
 
 
1015
  try:
 
1016
  evaluator = ComprehensiveModelEvaluator(model_path, model_name)
 
 
 
1017
  results = evaluator.run_comprehensive_evaluation(max_queries, languages, skip_benchmark)
 
 
 
 
 
 
1018
 
1019
  # Save to Beam volume as single comprehensive file
 
1020
  volume_results_dir = Path(VOLUME_CONFIG.mount_path) / "evaluation_results"
1021
  volume_results_dir.mkdir(parents=True, exist_ok=True)
1022
 
@@ -1026,17 +1297,65 @@ def _beam_evaluate_single_model(
1026
  with result_file.open("w") as f:
1027
  json.dump(results, f, indent=2, default=str)
1028
 
1029
- logger.info(f"💾 Saved Beam comprehensive evaluation results for {model_name}")
 
 
1030
  return results
1031
 
1032
- except Exception:
 
 
 
 
 
 
1033
  logger.exception(f"❌ Beam comprehensive evaluation failed for {model_name}")
1034
- return {}
 
 
 
 
 
 
1035
 
1036
 
1037
  def _run_beam_evaluation(
1038
  models: list[str],
1039
- max_queries: int = 1000,
1040
  languages: list[str] | None = None,
1041
  skip_benchmark: bool = False,
1042
  ) -> list[dict[str, Any]]:
@@ -1054,6 +1373,13 @@ def _run_beam_evaluation(
1054
  result = _beam_evaluate_single_model.remote(model_path, max_queries, languages, skip_benchmark)
1055
 
1056
  if result:
 
 
 
 
 
 
 
1057
  # Download the comprehensive result file from Beam
1058
  success = download_specific_evaluation_file(
1059
  VOLUME_CONFIG.name,
@@ -1069,6 +1395,8 @@ def _run_beam_evaluation(
1069
  results.append(result)
1070
  else:
1071
  logger.warning(f"⚠️ Could not download results for {model_name}")
 
 
1072
 
1073
  except Exception:
1074
  logger.exception(f"❌ Beam comprehensive evaluation failed for {model_name}")
@@ -1086,7 +1414,7 @@ def main(
1086
  use_beam: bool = typer.Option(default=False, help="Use Beam for evaluation"),
1087
  skip_third_party: bool = typer.Option(default=False, help="Skip third-party models"),
1088
  skip_benchmark: bool = typer.Option(default=False, help="Skip performance benchmarking"),
1089
- max_queries: int = typer.Option(default=1000, help="Maximum queries per language"),
1090
  ) -> None:
1091
  """Main comprehensive evaluation function."""
1092
  logger.info("🚀 Starting comprehensive model evaluation (CodeSearchNet + Performance)")
 
17
  distiller evaluate [--use-beam] [--skip-benchmark] # Run evaluation locally or on Beam
18
  """
19
 
20
+ # Standard-library helpers used to check whether the optional flash_attn package is installed
21
+ import contextlib
22
+ import importlib.util
23
  import json
24
  import logging
25
+ import os
26
  import time
27
  import traceback
28
  from pathlib import Path
 
33
  import psutil
34
  import torch
35
  import typer
36
+ from beam import function
37
  from datasets import Dataset, load_dataset
38
  from sentence_transformers import SentenceTransformer
39
  from sklearn.metrics.pairwise import cosine_similarity
 
41
 
42
  from .beam_utils import download_specific_evaluation_file
43
  from .config import (
 
44
  DEFAULT_EVALUATION_MODELS,
 
 
45
  codesearchnet_config,
46
  directories,
47
+ get_evaluation_function_kwargs,
48
  get_safe_model_name,
49
  get_volume_config,
50
  languages_config,
51
  )
52
 
53
+ # Check if flash_attn is available and compatible
54
+ FLASH_ATTN_AVAILABLE = importlib.util.find_spec("flash_attn") is not None
55
+
56
  logger = logging.getLogger(__name__)
57
 
58
  # =============================================================================
59
  # EVALUATION CONFIGURATION
60
  # =============================================================================
61
 
62
+ BATCH_SIZE = 10
63
  LOCAL_EVALUATION_DIR = directories.evaluation_results
64
  LOCAL_BENCHMARK_DIR = directories.benchmark_results
65
  LOCAL_MODELS_DIR = directories.final
 
97
  }
98
 
99
 
100
+ def reset_cuda_state() -> None:
101
+ """Aggressively reset CUDA state after memory allocation errors."""
102
+ if not torch.cuda.is_available():
103
+ return
104
+
105
+ try:
106
+ # Clear all CUDA caches
107
+ torch.cuda.empty_cache()
108
+ torch.cuda.ipc_collect()
109
+ torch.cuda.reset_peak_memory_stats()
110
+
111
+ # Try to force garbage collection
112
+ import gc
113
+
114
+ gc.collect()
115
+
116
+ logger.info("🧹 CUDA state reset completed")
117
+ except Exception as e:
118
+ logger.warning(f"⚠️ Could not fully reset CUDA state: {e}")
119
+
120
+
121
+ def configure_flash_attention() -> dict[str, Any]:
122
+ """Configure flash attention settings and return model kwargs."""
123
+ model_kwargs: dict[str, Any] = {}
124
+
125
+ if not FLASH_ATTN_AVAILABLE:
126
+ logger.info("⚠️ Flash attention not available - using standard attention")
127
+ return model_kwargs
128
+
129
+ # Set environment variables for flash attention and CUDA memory management
130
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
131
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
132
+
133
+ # Check if we're on a compatible GPU
134
+ try:
135
+ if torch.cuda.is_available():
136
+ device_capability = torch.cuda.get_device_capability()
137
+ # Flash attention requires compute capability >= 7.5 (Turing, Ampere, Ada, Hopper)
138
+ if device_capability[0] >= 7 and (device_capability[0] > 7 or device_capability[1] >= 5):
139
+ logger.info("✅ Flash attention available - compatible GPU detected")
140
+ # For SentenceTransformer, we'll use environment variables to enable flash attention
141
+ os.environ["TRANSFORMERS_FLASH_ATTENTION"] = "1"
142
+ else:
143
+ logger.info(f"⚠️ GPU compute capability {device_capability} < 7.5 - flash attention disabled")
144
+ else:
145
+ logger.info("⚠️ No CUDA available - flash attention disabled")
146
+ except Exception as e:
147
+ logger.warning(f"⚠️ Failed to check GPU compatibility: {e} - flash attention disabled")
148
+
149
+ return model_kwargs
150
+
151
+
152
+ def load_model_with_flash_attention(model_path: str, device: str = "auto") -> SentenceTransformer:
153
+ """Load a SentenceTransformer model with flash attention if available."""
154
+ # Convert "auto" device to actual device
155
+ target_device = "cuda" if device == "auto" and torch.cuda.is_available() else device
156
+ if device == "auto" and not torch.cuda.is_available():
157
+ target_device = "cpu"
158
+
159
+ # Configure flash attention via environment variables
160
+ configure_flash_attention()
161
+
162
+ # Load model with standard SentenceTransformer initialization
163
+ logger.info(f"📂 Loading model: {Path(model_path).name}")
164
+
165
+ try:
166
+ # Try loading directly to target device first
167
+ model = SentenceTransformer(model_path, device=target_device, trust_remote_code=True)
168
+ logger.info(f"✅ Model loaded successfully on {target_device}")
169
+ return model
170
+ except (torch.OutOfMemoryError, RuntimeError) as oom_error:
171
+ # Handle both torch.OutOfMemoryError and RuntimeError (CUDA driver errors)
172
+ is_oom = isinstance(oom_error, torch.OutOfMemoryError) or "out of memory" in str(oom_error).lower()
173
+
174
+ if is_oom and target_device != "cpu":
175
+ logger.warning(f"⚠️ OOM loading directly to {target_device}, trying CPU first: {oom_error}")
176
+ try:
177
+ # Clear CUDA cache more aggressively after OOM
178
+ reset_cuda_state()
179
+
180
+ logger.info("🔄 Loading model on CPU first, then trying to move to GPU...")
181
+ model = SentenceTransformer(model_path, device="cpu", trust_remote_code=True)
182
+ logger.info("📦 Model loaded on CPU, attempting GPU transfer...")
183
+
184
+ # Try moving to GPU with additional error handling
185
+ try:
186
+ model = model.to(target_device)
187
+ logger.info(f"✅ Model successfully moved to {target_device}")
188
+ return model
189
+ except (RuntimeError, AssertionError) as gpu_move_error:
190
+ # Handle PyTorch internal errors and CUDA allocator issues
191
+ logger.warning(f"⚠️ GPU transfer failed: {gpu_move_error}")
192
+ if "INTERNAL ASSERT FAILED" in str(gpu_move_error) or "handles_" in str(gpu_move_error):
193
+ logger.warning("🔧 Detected CUDA allocator corruption, resetting and staying on CPU")
194
+ # Try to reset CUDA context
195
+ reset_cuda_state()
196
+ else:
197
+ # Re-raise unexpected GPU transfer errors
198
+ raise
199
+
200
+ logger.info("✅ Model will remain on CPU due to GPU memory issues")
201
+ return model
202
+
203
+ except Exception as fallback_error:
204
+ logger.warning(f"⚠️ CPU fallback failed: {fallback_error}, loading fresh on CPU")
205
+ # Clear any remaining CUDA state
206
+ reset_cuda_state()
207
+
208
+ model = SentenceTransformer(model_path, device="cpu", trust_remote_code=True)
209
+ logger.info("✅ Model loaded on CPU (all GPU attempts failed)")
210
+ return model
211
+ else:
212
+ # Re-raise if not OOM or already on CPU
213
+ raise
214
+ except ValueError as e:
215
+ if "'MaxSim' is not a valid SimilarityFunction" in str(e):
216
+ logger.warning(f"⚠️ Model {Path(model_path).name} uses unsupported MaxSim similarity function")
217
+ logger.info("🔧 Attempting workaround by loading with custom config...")
218
+
219
+ # Try loading with similarity function override
220
+ try:
221
+ # Load model components manually and override similarity function
222
+ import json
223
+ import tempfile
224
+ from pathlib import Path as PathLib
225
+
226
+ # Create temporary directory for modified config
227
+ with tempfile.TemporaryDirectory() as temp_dir:
228
+ temp_path = PathLib(temp_dir) / "temp_model"
229
+
230
+ # Download/copy model files
231
+ if model_path.startswith("http") or ("/" in model_path and not PathLib(model_path).exists()):
232
+ # It's a HuggingFace model ID
233
+ from huggingface_hub import snapshot_download
234
+
235
+ snapshot_download(model_path, local_dir=temp_path, ignore_patterns=["*.bin"])
236
+ else:
237
+ # It's a local path
238
+ import shutil
239
+
240
+ shutil.copytree(model_path, temp_path)
241
+
242
+ # Modify config to use supported similarity function
243
+ config_path = temp_path / "config_sentence_transformers.json"
244
+ if config_path.exists():
245
+ with config_path.open() as f:
246
+ config = json.load(f)
247
+
248
+ # Override similarity function to 'cosine' (supported)
249
+ if "similarity_fn_name" in config:
250
+ logger.info(
251
+ f"🔧 Changing similarity function from '{config['similarity_fn_name']}' to 'cosine'"
252
+ )
253
+ config["similarity_fn_name"] = "cosine"
254
+
255
+ with config_path.open("w") as f:
256
+ json.dump(config, f, indent=2)
257
+
258
+ # Load model with modified config
259
+ model = SentenceTransformer(str(temp_path), device=device, trust_remote_code=True)
260
+ logger.info("✅ Model loaded successfully with similarity function workaround")
261
+ return model
262
+
263
+ except Exception as workaround_error:
264
+ logger.warning(f"⚠️ Similarity function workaround failed: {workaround_error}")
265
+ logger.info("🔧 Attempting direct model component loading...")
266
+
267
+ # Last resort: try loading model components directly
268
+ try:
269
+ from sentence_transformers.models import Pooling, Transformer
270
+
271
+ # Load model components manually
272
+ logger.info("🔄 Loading model components directly...")
273
+
274
+ # Create SentenceTransformer components using model path
275
+ transformer = Transformer(model_path)
276
+ pooling = Pooling(transformer.get_word_embedding_dimension())
277
+
278
+ # Create SentenceTransformer with manual components
279
+ model = SentenceTransformer(modules=[transformer, pooling], device=device)
280
+ logger.info("✅ Model loaded successfully with direct component loading")
281
+ return model
282
+
283
+ except Exception as direct_error:
284
+ logger.warning(f"⚠️ Direct component loading failed: {direct_error}")
285
+ logger.exception(f"❌ All loading methods failed for {Path(model_path).name}")
286
+ raise e from direct_error
287
+ else:
288
+ raise
289
+ except Exception:
290
+ logger.exception(f"❌ Failed to load model {Path(model_path).name}")
291
+ raise
292
+
293
+
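
A usage sketch for the loader defined above (the model id is illustrative, not part of the diff); it is meant as a drop-in replacement that prefers the GPU and falls back to CPU on out-of-memory errors:

# Illustrative only: "BAAI/bge-m3" stands in for any evaluated model path or id.
model = load_model_with_flash_attention("BAAI/bge-m3", device="auto")
vectors = model.encode(
    ["def add(a, b): return a + b"],
    convert_to_tensor=False,
    normalize_embeddings=True,
)
print(vectors.shape)  # (1, embedding_dim)
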
294
  class PerformanceBenchmark:
295
  """Comprehensive performance benchmarking for embedding models."""
296
 
 
308
  start_time = time.time()
309
 
310
  try:
311
+ self.model = load_model_with_flash_attention(self.model_path, self.device)
312
  load_time = time.time() - start_time
313
 
314
  logger.info(f"✅ Model loaded in {load_time:.2f}s on {self.device}")
 
520
  logger.info(f" 📊 Testing on {device.upper()}")
521
 
522
  try:
523
+ model = load_model_with_flash_attention(self.model_path, device)
524
 
525
  # Warmup
526
  _ = model.encode(test_texts[:4], convert_to_tensor=False)
 
600
  """Load the embedding model."""
601
  logger.info(f"Loading model from {self.model_path}")
602
  try:
603
+ self.model = load_model_with_flash_attention(self.model_path)
604
  logger.info(f"Successfully loaded model: {self.model_name}")
605
  except Exception:
606
  logger.exception(f"Failed to load model from {self.model_path}")
607
  raise
608
 
609
  def encode_texts(self, texts: list[str], desc: str = "Encoding") -> np.ndarray:
610
+ """Encode texts into embeddings with batching and memory management."""
611
  if self.model is None:
612
  msg = "Model not loaded"
613
  raise RuntimeError(msg)
614
 
615
  embeddings = []
616
+ # Use smaller batch size to avoid OOM
617
+ effective_batch_size = min(BATCH_SIZE, 5) # Limit to 5 for large models
618
+
619
+ for i in tqdm(range(0, len(texts), effective_batch_size), desc=desc):
620
+ batch = texts[i : i + effective_batch_size]
621
+
622
+ try:
623
+ batch_embeddings = self.model.encode(batch, convert_to_tensor=False, normalize_embeddings=True)
624
+ embeddings.append(batch_embeddings)
625
+
626
+ # Clear CUDA cache periodically to prevent memory buildup
627
+ if torch.cuda.is_available() and i > 0 and i % (effective_batch_size * 4) == 0:
628
+ torch.cuda.empty_cache()
629
+
630
+ except (torch.OutOfMemoryError, RuntimeError) as e:
631
+ # Handle both torch.OutOfMemoryError and RuntimeError (CUDA driver errors)
632
+ is_oom = isinstance(e, torch.OutOfMemoryError) or "out of memory" in str(e).lower()
633
+
634
+ if is_oom:
635
+ logger.warning(
636
+ f"⚠️ OOM during encoding batch {i // effective_batch_size + 1}, trying smaller batch..."
637
+ )
638
+ # Try encoding one at a time
639
+ for single_text in batch:
640
+ try:
641
+ single_embedding = self.model.encode(
642
+ [single_text], convert_to_tensor=False, normalize_embeddings=True
643
+ )
644
+ embeddings.append(single_embedding)
645
+ if torch.cuda.is_available():
646
+ torch.cuda.empty_cache()
647
+ except (torch.OutOfMemoryError, RuntimeError) as single_e:
648
+ if isinstance(single_e, torch.OutOfMemoryError) or "out of memory" in str(single_e).lower():
649
+ logger.exception("❌ Cannot encode even single text, model too large for GPU")
650
+ raise
651
+ raise
652
+ else:
653
+ raise
654
 
655
  return np.vstack(embeddings)
656
 
657
+ def evaluate_language(self, language: str, max_queries: int = 100) -> dict[str, Any]:
658
  """Evaluate on a specific programming language."""
659
  logger.info(f"Evaluating on {language} language (max {max_queries} queries)")
660
 
661
  try:
662
+ # Load only a slice of the test split for the language to avoid loading the full dataset
663
+ logger.info(f"📥 Loading test split for {language}...")
664
  dataset = load_dataset(
665
  codesearchnet_config.dataset_name,
666
  language,
667
+ split=f"test[:{max_queries * 10}]", # Load 10x more than needed to ensure we get enough valid pairs
668
  trust_remote_code=True,
669
  )
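
For reference, the "test[:N]" slice syntax used above is the standard datasets split-slicing API; a standalone sketch (dataset id and language chosen for illustration) of what the call resolves to when max_queries is 100:

from datasets import load_dataset

# Illustrative only: the slice asks the datasets library to materialize just the
# first N rows of the test split rather than the whole split.
max_queries = 100
subset = load_dataset(
    "code-search-net/code_search_net",  # example dataset id (the value used elsewhere in this repository)
    "python",                           # example language
    split=f"test[:{max_queries * 10}]",
    trust_remote_code=True,
)
print(len(subset))  # at most 1000 examples
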
670
 
 
672
  logger.error(f"Unexpected dataset type for {language}: {type(dataset)}")
673
  return {}
674
 
675
+ logger.info(f"📊 Loaded {len(dataset)} examples from {language} test split")
 
 
 
 
676
 
677
+ queries: list[str] = []
678
+ codes: list[str] = []
679
+ query_ids: list[str] = []
680
 
681
+ # Process examples and stop once we have enough valid pairs
682
  for i, example in enumerate(dataset):
683
+ if len(queries) >= max_queries: # Stop once we have enough
684
+ break
685
+
686
  doc_string = example.get("func_documentation_string", "").strip()
687
  code_string = example.get("func_code_string", "").strip()
688
 
 
695
  logger.warning(f"No valid query-code pairs found for {language}")
696
  return {}
697
 
698
+ # Truncate to exactly max_queries if we have more
699
+ if len(queries) > max_queries:
700
+ queries = queries[:max_queries]
701
+ codes = codes[:max_queries]
702
+ query_ids = query_ids[:max_queries]
703
+
704
  logger.info(f"Found {len(queries)} valid query-code pairs for {language}")
705
 
706
+ # Check available memory before encoding
707
+ if torch.cuda.is_available():
708
+ free_memory = torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated()
709
+ free_gb = free_memory / (1024**3)
710
+ logger.info(f"💾 Available GPU memory before encoding: {free_gb:.1f} GB")
711
+ if free_gb < 2.0: # Less than 2GB free
712
+ logger.warning(f"⚠️ Low GPU memory ({free_gb:.1f} GB), using conservative encoding")
713
+ torch.cuda.empty_cache()
714
+
715
  # Encode queries and codes
716
  start_time = time.time()
717
  query_embeddings = self.encode_texts(queries, f"Encoding {language} queries")
 
797
  return 0.0
798
 
799
  def evaluate_all_languages(
800
+ self, max_queries_per_lang: int = 100, languages: list[str] | None = None
801
  ) -> dict[str, Any]:
802
  """Evaluate on all specified languages."""
803
  eval_languages = languages or languages_config.all
 
862
 
863
  def run_comprehensive_evaluation(
864
  self,
865
+ max_queries_per_lang: int = 100,
866
  languages: list[str] | None = None,
867
  skip_benchmark: bool = False,
868
  ) -> dict[str, Any]:
 
1150
 
1151
  def run_evaluation(
1152
  models: list[str],
1153
+ max_queries: int = 100,
1154
  languages: list[str] | None = None,
1155
  use_beam: bool = False,
1156
  skip_benchmark: bool = False,
 
1211
 
1212
  def _run_local_evaluation(
1213
  models: list[str],
1214
+ max_queries: int = 100,
1215
  languages: list[str] | None = None,
1216
  skip_benchmark: bool = False,
1217
  ) -> list[dict[str, Any]]:
 
1242
  return results
1243
 
1244
 
1245
+ @function(**get_evaluation_function_kwargs())
 
 
 
 
 
 
 
1246
  def _beam_evaluate_single_model(
1247
  model_path: str,
1248
+ max_queries: int = 100,
1249
  languages: list[str] | None = None,
1250
  skip_benchmark: bool = False,
1251
  ) -> dict[str, Any]:
1252
  """Beam function to comprehensively evaluate a single model."""
1253
+ # Set CUDA memory settings BEFORE any CUDA operations
1254
+
1255
+ import os
1256
+
1257
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
1258
 
1259
+ model_name = Path(model_path).name
1260
  logger.info(f"🚀 Beam comprehensive evaluation starting for {model_name}")
1261
 
1262
+ # Clear CUDA cache if available
1263
+ try:
1264
+ import torch
1265
+
1266
+ if torch.cuda.is_available():
1267
+ torch.cuda.empty_cache()
1268
+ torch.cuda.reset_peak_memory_stats()
1269
+ logger.info(
1270
+ f"🧹 Cleared CUDA cache. Available memory: {torch.cuda.get_device_properties(0).total_memory // (1024**3)} GB"
1271
+ )
1272
+ except Exception as e:
1273
+ logger.warning(f"⚠️ Could not clear CUDA cache: {e}")
1274
+
1275
  try:
1276
+ logger.info("🔧 Creating ComprehensiveModelEvaluator...")
1277
  evaluator = ComprehensiveModelEvaluator(model_path, model_name)
1278
+ logger.info("✅ ComprehensiveModelEvaluator created successfully")
1279
+
1280
+ logger.info("🚀 Starting comprehensive evaluation...")
1281
  results = evaluator.run_comprehensive_evaluation(max_queries, languages, skip_benchmark)
1282
+ logger.info("✅ Comprehensive evaluation completed")
1283
+
1284
+ # Validate results
1285
+ if not results or "model_name" not in results:
1286
+ logger.error(f"❌ Invalid evaluation results for {model_name}: {results}")
1287
+ return {"error": "Invalid evaluation results", "model_name": model_name}
1288
 
1289
  # Save to Beam volume as single comprehensive file
1290
+ logger.info("💾 Saving results to Beam volume...")
1291
  volume_results_dir = Path(VOLUME_CONFIG.mount_path) / "evaluation_results"
1292
  volume_results_dir.mkdir(parents=True, exist_ok=True)
1293
 
 
1297
  with result_file.open("w") as f:
1298
  json.dump(results, f, indent=2, default=str)
1299
 
1300
+ logger.info(f"💾 Saved Beam comprehensive evaluation results for {model_name} to {result_file}")
1301
+ logger.info(f"🎯 Final results summary: {len(results.get('codesearch_languages', {}))} languages evaluated")
1302
+
1303
  return results
1304
 
1305
+ except (torch.OutOfMemoryError, RuntimeError, AssertionError) as e:
1306
+ # Handle CUDA errors including OOM, driver errors, and PyTorch internal assertion failures
1307
+ is_oom = isinstance(e, torch.OutOfMemoryError) or "out of memory" in str(e).lower()
1308
+ is_cuda_error = is_oom or "cuda" in str(e).lower() or "INTERNAL ASSERT FAILED" in str(e) or "handles_" in str(e)
1309
+
1310
+ if is_cuda_error:
1311
+ error_type = "CUDA OOM" if is_oom else "CUDA Error"
1312
+ logger.exception(f"❌ {error_type} during evaluation of {model_name}")
1313
+
1314
+ # Try to clear memory and reset CUDA state more aggressively
1315
+ with contextlib.suppress(Exception):
1316
+ reset_cuda_state()
1317
+
1318
+ return {
1319
+ "error": f"{error_type}: {e!s}",
1320
+ "model_name": model_name,
1321
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
1322
+ "evaluation_failed": True,
1323
+ "oom": is_oom,
1324
+ "cuda_error": True,
1325
+ }
1326
+ # Re-raise if not a CUDA-related error
1327
+ raise
1328
+ except Exception as e:
1329
  logger.exception(f"❌ Beam comprehensive evaluation failed for {model_name}")
1330
+ # Return error info in a structured way
1331
+ error_result = {
1332
+ "error": str(e),
1333
+ "model_name": model_name,
1334
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
1335
+ "evaluation_failed": True,
1336
+ }
1337
+
1338
+ # Try to save error result to volume
1339
+ try:
1340
+ volume_results_dir = Path(VOLUME_CONFIG.mount_path) / "evaluation_results"
1341
+ volume_results_dir.mkdir(parents=True, exist_ok=True)
1342
+
1343
+ safe_model_name = get_safe_model_name(model_name)
1344
+ error_file = volume_results_dir / f"error_eval_{safe_model_name}.json"
1345
+
1346
+ with error_file.open("w") as f:
1347
+ json.dump(error_result, f, indent=2, default=str)
1348
+
1349
+ logger.info(f"💾 Saved error info to {error_file}")
1350
+ except Exception:
1351
+ logger.exception("❌ Could not save error info")
1352
+
1353
+ return error_result
1354
 
1355
 
1356
  def _run_beam_evaluation(
1357
  models: list[str],
1358
+ max_queries: int = 100,
1359
  languages: list[str] | None = None,
1360
  skip_benchmark: bool = False,
1361
  ) -> list[dict[str, Any]]:
 
1373
  result = _beam_evaluate_single_model.remote(model_path, max_queries, languages, skip_benchmark)
1374
 
1375
  if result:
1376
+ # Check if this is an error result
1377
+ if result.get("evaluation_failed", False):
1378
+ logger.error(f"❌ Beam evaluation failed for {model_name}: {result.get('error', 'Unknown error')}")
1379
+ if result.get("oom", False):
1380
+ logger.error("💥 Out of memory error - model may be too large for available GPU")
1381
+ continue
1382
+
1383
  # Download the comprehensive result file from Beam
1384
  success = download_specific_evaluation_file(
1385
  VOLUME_CONFIG.name,
 
1395
  results.append(result)
1396
  else:
1397
  logger.warning(f"⚠️ Could not download results for {model_name}")
1398
+ else:
1399
+ logger.warning(f"⚠️ No result returned for {model_name}")
1400
 
1401
  except Exception:
1402
  logger.exception(f"❌ Beam comprehensive evaluation failed for {model_name}")
 
1414
  use_beam: bool = typer.Option(default=False, help="Use Beam for evaluation"),
1415
  skip_third_party: bool = typer.Option(default=False, help="Skip third-party models"),
1416
  skip_benchmark: bool = typer.Option(default=False, help="Skip performance benchmarking"),
1417
+ max_queries: int = typer.Option(default=100, help="Maximum queries per language"),
1418
  ) -> None:
1419
  """Main comprehensive evaluation function."""
1420
  logger.info("🚀 Starting comprehensive model evaluation (CodeSearchNet + Performance)")
train_code_classification.py DELETED
@@ -1,365 +0,0 @@
1
- #!/usr/bin/env python
2
- """
3
- Script to train a code classification model using CodeSearchNet dataset with Model2Vec.
4
-
5
- This script performs the following operations:
6
- 1. Downloads the Alibaba-NLP/gte-Qwen2-7B-instruct model
7
- 2. Optionally distills it using Model2Vec to create a smaller, faster static model
8
- 3. Trains a programming language classifier on CodeSearchNet dataset
9
- 4. Evaluates the classifier and saves the trained model
10
-
11
- Based on the official CodeSearchNet dataset: https://github.com/github/CodeSearchNet
12
- """
13
-
14
- import json
15
- import logging
16
- import re
17
- import time
18
- from pathlib import Path
19
- from time import perf_counter
20
- from typing import Any, cast
21
-
22
- from datasets import Dataset, DatasetDict, load_dataset # type: ignore [import]
23
- from model2vec.distill import distill
24
- from model2vec.train import StaticModelForClassification
25
-
26
- # =============================================================================
27
- # CONFIGURATION CONSTANTS
28
- # =============================================================================
29
-
30
- # Model Configuration
31
- MODEL_NAME = "Alibaba-NLP/gte-Qwen2-7B-instruct" # Source model to distill
32
- OUTPUT_DIR = "." # Directory to save the trained model
33
-
34
- # Distillation Configuration
35
- SKIP_DISTILLATION = False # Set to True to skip distillation and use existing model
36
- DISTILLED_MODEL_PATH = "." # Path to existing distilled model (if skipping distillation)
37
- PCA_DIMS = 256 # Dimensions for PCA reduction (smaller = faster but less accurate)
38
-
39
- # Dataset Configuration
40
- DATASET_NAME = "code-search-net/code_search_net" # CodeSearchNet dataset
41
- CLASSIFICATION_TASK = "language" # Task: classify programming language
42
- MAX_SAMPLES_PER_LANGUAGE = 5000 # Limit samples per language for balanced training
43
- MIN_CODE_LENGTH = 50 # Minimum code length in characters
44
- MAX_CODE_LENGTH = 2000 # Maximum code length in characters (for memory efficiency)
45
-
46
- # Text processing constants
47
- MAX_COMMENT_LENGTH = 200 # Maximum length for comment lines before truncation
48
-
49
- # Training Configuration
50
- MAX_EPOCHS = 30 # Maximum number of training epochs
51
- PATIENCE = 5 # Early stopping patience
52
- BATCH_SIZE = 32 # Training batch size
53
- LEARNING_RATE = 1e-3 # Learning rate
54
-
55
- # Saving Configuration
56
- SAVE_PIPELINE = True # Save as scikit-learn compatible pipeline
57
- SAVE_TO_HUB = False # Whether to push the model to HuggingFace Hub
58
- HUB_MODEL_ID = None # Model ID for HuggingFace Hub (if saving to hub)
59
-
60
- # =============================================================================
61
-
62
- # Configure logging
63
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
64
- logger = logging.getLogger(__name__)
65
-
66
-
67
- def clean_code_text(code: str) -> str:
68
- """Clean and normalize code text for better classification."""
69
- if not code:
70
- return ""
71
-
72
- # Remove excessive whitespace while preserving structure
73
- code = re.sub(r"\n\s*\n\s*\n", "\n\n", code) # Remove multiple empty lines
74
- code = re.sub(r" +", " ", code) # Replace multiple spaces with single space
75
-
76
- # Remove very long comments that might bias classification
77
- lines = code.split("\n")
78
- cleaned_lines = []
79
- for original_line in lines:
80
- line = original_line
81
- # Keep comment lines but limit their length
82
- if line.strip().startswith(("#", "//", "/*", "*", "--")) and len(line) > MAX_COMMENT_LENGTH:
83
- line = line[:MAX_COMMENT_LENGTH] + "..."
84
- cleaned_lines.append(line)
85
-
86
- return "\n".join(cleaned_lines)
87
-
88
-
89
- def load_codesearchnet_dataset() -> tuple[Dataset, Dataset, str, str]:
90
- """Load and preprocess the CodeSearchNet dataset for programming language classification."""
91
- logger.info("Loading CodeSearchNet dataset...")
92
-
93
- try:
94
- # Load the dataset with trust_remote_code=True
95
- logger.info("Downloading and loading CodeSearchNet data...")
96
- ds = cast(
97
- "DatasetDict",
98
- load_dataset(
99
- DATASET_NAME,
100
- trust_remote_code=True,
101
- # Load a reasonable sample for training
102
- ),
103
- )
104
-
105
- logger.info(f"Available splits: {list(ds.keys())}")
106
-
107
- # Use train/test splits if available, otherwise split the data
108
- if "train" in ds and "test" in ds:
109
- train_dataset = ds["train"]
110
- test_dataset = ds["test"]
111
- elif "train" in ds:
112
- # Split the train set
113
- split_ds = ds["train"].train_test_split(test_size=0.2, seed=42)
114
- train_dataset = split_ds["train"]
115
- test_dataset = split_ds["test"]
116
- else:
117
- # Use all data and split
118
- all_data = ds[next(iter(ds.keys()))]
119
- split_ds = all_data.train_test_split(test_size=0.2, seed=42)
120
- train_dataset = split_ds["train"]
121
- test_dataset = split_ds["test"]
122
-
123
- logger.info(f"Raw dataset sizes - Train: {len(train_dataset)}, Test: {len(test_dataset)}")
124
-
125
- # Filter and preprocess the data
126
- def filter_and_clean(dataset: Dataset) -> Dataset:
127
- # Filter examples with valid code and language
128
- filtered = dataset.filter(
129
- lambda x: (
130
- x["func_code_string"] is not None
131
- and x["language"] is not None
132
- and len(x["func_code_string"]) >= MIN_CODE_LENGTH
133
- and len(x["func_code_string"]) <= MAX_CODE_LENGTH
134
- and x["language"] in ["python", "java", "javascript", "go", "php", "ruby"]
135
- )
136
- )
137
-
138
- # Balance the dataset by limiting samples per language
139
- if len(filtered) > MAX_SAMPLES_PER_LANGUAGE * 6: # 6 languages
140
- # Group by language and sample
141
- language_samples: dict[str, list[dict[str, Any]]] = {}
142
- for example in filtered:
143
- lang = example["language"]
144
- if lang not in language_samples:
145
- language_samples[lang] = []
146
- if len(language_samples[lang]) < MAX_SAMPLES_PER_LANGUAGE:
147
- language_samples[lang].append(example)
148
-
149
- # Combine all samples
150
- balanced_examples = []
151
- for lang_examples in language_samples.values():
152
- balanced_examples.extend(lang_examples)
153
-
154
- # Convert back to dataset format
155
- if balanced_examples:
156
- filtered = Dataset.from_list(balanced_examples)
157
-
158
- # Clean the code text
159
- def clean_example(example: dict[str, Any]) -> dict[str, Any]:
160
- example["func_code_string"] = clean_code_text(example["func_code_string"])
161
- return example
162
-
163
- return filtered.map(clean_example)
164
-
165
- train_dataset = filter_and_clean(train_dataset)
166
- test_dataset = filter_and_clean(test_dataset)
167
-
168
- logger.info(f"Filtered dataset sizes - Train: {len(train_dataset)}, Test: {len(test_dataset)}")
169
-
170
- # Show language distribution
171
- if len(train_dataset) > 0:
172
- from collections import Counter
173
-
174
- train_lang_dist = Counter(train_dataset["language"])
175
- test_lang_dist = Counter(test_dataset["language"])
176
- logger.info(f"Training language distribution: {dict(train_lang_dist)}")
177
- logger.info(f"Test language distribution: {dict(test_lang_dist)}")
178
-
179
- return train_dataset, test_dataset, "func_code_string", "language"
180
-
181
- except Exception:
182
- logger.exception("Error loading CodeSearchNet dataset")
183
- raise
184
-
185
-
186
- def main() -> None:
187
- """Run the code classification training pipeline."""
188
- # Create output directory if it doesn't exist
189
- output_dir = Path(OUTPUT_DIR)
190
- output_dir.mkdir(parents=True, exist_ok=True)
191
-
192
- logger.info(f"Starting CodeSearchNet code classification pipeline for {MODEL_NAME}")
193
- logger.info(f"Classification task: {CLASSIFICATION_TASK}")
194
- logger.info(f"Trained model will be saved to {output_dir}")
195
-
196
- # Record start time for benchmarking
197
- total_start_time = time.time()
198
-
199
- try:
200
- # Step 1: Get the static model (either distill or load existing)
201
- static_model = None
202
-
203
- if SKIP_DISTILLATION:
204
- if DISTILLED_MODEL_PATH:
205
- logger.info(f"Loading existing distilled model from {DISTILLED_MODEL_PATH}")
206
- # Note: We'll create the classifier from pretrained instead
207
- else:
208
- logger.error("DISTILLED_MODEL_PATH must be specified when SKIP_DISTILLATION is True")
209
- return
210
- else:
211
- logger.info("Starting Model2Vec distillation...")
212
- distill_start_time = time.time()
213
-
214
- static_model = distill(
215
- model_name=MODEL_NAME,
216
- pca_dims=PCA_DIMS,
217
- )
218
-
219
- distill_time = time.time() - distill_start_time
220
- logger.info(f"Distillation completed in {distill_time:.2f} seconds")
221
-
222
- # Step 2: Create the classifier
223
- logger.info("Creating classifier...")
224
-
225
- if static_model is not None:
226
- # From a distilled model
227
- classifier = StaticModelForClassification.from_static_model(model=static_model)
228
- else:
229
- # From a pre-trained model path
230
- classifier = StaticModelForClassification.from_pretrained(model_name=DISTILLED_MODEL_PATH)
231
-
232
- # Step 3: Load the CodeSearchNet dataset
233
- train_dataset, test_dataset, text_column, label_column = load_codesearchnet_dataset()
234
-
235
- if len(train_dataset) == 0 or len(test_dataset) == 0:
236
- logger.error("No valid data found after filtering. Please check dataset configuration.")
237
- return
238
-
239
- logger.info(f"Training dataset size: {len(train_dataset)}")
240
- logger.info(f"Test dataset size: {len(test_dataset)}")
241
-
242
- # Get unique languages for reference
243
- unique_languages = sorted(set(train_dataset[label_column]))
244
- logger.info(f"Programming languages to classify: {unique_languages}")
245
-
246
- # Step 4: Train the classifier
247
- logger.info("Starting training...")
248
- train_start_time = perf_counter()
249
-
250
- classifier = classifier.fit(
251
- train_dataset[text_column],
252
- train_dataset[label_column],
253
- max_epochs=MAX_EPOCHS,
254
- batch_size=BATCH_SIZE,
255
- learning_rate=LEARNING_RATE,
256
- early_stopping_patience=PATIENCE,
257
- )
258
-
259
- train_time = perf_counter() - train_start_time
260
- logger.info(f"Training completed in {int(train_time)} seconds")
261
-
262
- # Step 5: Evaluate the classifier
263
- logger.info("Evaluating classifier...")
264
- eval_start_time = perf_counter()
265
-
266
- classification_report = classifier.evaluate(test_dataset[text_column], test_dataset[label_column])
267
-
268
- eval_time = perf_counter() - eval_start_time
269
- logger.info(f"Evaluation completed in {int(eval_time * 1000)} milliseconds")
270
- logger.info(f"Classification results:\n{classification_report}")
271
-
272
- # Step 6: Test with some examples
273
- logger.info("Testing with sample code snippets...")
274
-
275
- # Test examples for different languages
276
- test_examples = [
277
- 'def hello_world():\n print("Hello, World!")\n return True', # Python
278
- (
279
- "public class HelloWorld {\n"
280
- " public static void main(String[] args) {\n"
281
- ' System.out.println("Hello, World!");\n'
282
- " }\n"
283
- "}"
284
- ), # Java
285
- 'function helloWorld() {\n console.log("Hello, World!");\n return true;\n}', # JavaScript
286
- 'package main\n\nimport "fmt"\n\nfunc main() {\n fmt.Println("Hello, World!")\n}', # Go
287
- '<?php\nfunction hello_world() {\n echo "Hello, World!";\n return true;\n}\n?>', # PHP
288
- 'def hello_world\n puts "Hello, World!"\n true\nend', # Ruby
289
- ]
290
-
291
- predictions = classifier.predict(test_examples)
292
- for i, (code, pred) in enumerate(zip(test_examples, predictions, strict=False)):
293
- logger.info(f"Example {i + 1}: {pred}")
294
- logger.info(f"Code snippet: {code[:100]}...")
295
-
296
- # Step 7: Benchmark inference speed
297
- logger.info("Benchmarking inference speed...")
298
- inference_start_time = perf_counter()
299
- _ = classifier.predict(test_dataset[text_column][:100]) # Test on first 100 samples
300
- inference_time = perf_counter() - inference_start_time
301
- logger.info(f"Inference took {int(inference_time * 1000)} milliseconds for 100 code snippets on CPU")
302
-
303
- # Step 8: Save the model
304
- if SAVE_PIPELINE:
305
- logger.info("Converting to scikit-learn pipeline...")
306
- pipeline = classifier.to_pipeline()
307
-
308
- # Save locally
309
- pipeline_path = output_dir / "pipeline"
310
- pipeline.save_pretrained(str(pipeline_path))
311
- logger.info(f"Pipeline saved to {pipeline_path}")
312
-
313
- # Save additional metadata
314
- metadata = {
315
- "model_name": MODEL_NAME,
316
- "dataset": DATASET_NAME,
317
- "task": "programming_language_classification",
318
- "languages": unique_languages,
319
- "pca_dims": PCA_DIMS,
320
- "train_samples": len(train_dataset),
321
- "test_samples": len(test_dataset),
322
- }
323
-
324
- metadata_path = output_dir / "metadata.json"
325
- with metadata_path.open("w") as f:
326
- json.dump(metadata, f, indent=2)
327
- logger.info("Metadata saved to metadata.json")
328
-
329
- # Push to hub if requested
330
- if SAVE_TO_HUB and HUB_MODEL_ID:
331
- logger.info(f"Pushing pipeline to HuggingFace Hub as {HUB_MODEL_ID}")
332
- pipeline.push_to_hub(HUB_MODEL_ID)
333
- else:
334
- # Save the classifier directly
335
- classifier_path = output_dir / "classifier"
336
- classifier_path.mkdir(exist_ok=True)
337
-
338
- # Note: StaticModelForClassification might not have save_pretrained
339
- # We'll save the underlying static model and create instructions
340
- if static_model is not None:
341
- static_model.save_pretrained(str(classifier_path / "static_model"))
342
-
343
- logger.info(f"Classifier components saved to {classifier_path}")
344
-
345
- # Summary
346
- total_time = time.time() - total_start_time
347
- logger.info("=" * 60)
348
- logger.info("CODE CLASSIFICATION TRAINING COMPLETED SUCCESSFULLY!")
349
- logger.info(f"Total time: {total_time:.2f} seconds")
350
- if not SKIP_DISTILLATION:
351
- logger.info(f"Distillation time: {distill_time:.2f} seconds")
352
- logger.info(f"Training time: {int(train_time)} seconds")
353
- logger.info(f"Dataset: {DATASET_NAME}")
354
- logger.info("Task: Programming Language Classification")
355
- logger.info(f"Languages: {', '.join(unique_languages)}")
356
- logger.info(f"Model saved to: {output_dir}")
357
- logger.info("=" * 60)
358
-
359
- except Exception:
360
- logger.exception("Error during code classification training pipeline")
361
- raise
362
-
363
-
364
- if __name__ == "__main__":
365
- main()