Sarthak committed
Commit 454e47c · 1 Parent(s): ea0b2a0

feat: overhaul distiller package with unified CLI, enhanced evaluation, and modular structure

src/distiller/__init__.py CHANGED
@@ -1,7 +1,67 @@
- """Model2Vec Distillation Pipeline for gte-Qwen2-7B-instruct."""
-
- __version__ = "0.1.0"
-
- from .distill import beam_code_distillation, code_specialized_distillation
-
- __all__ = ["beam_code_distillation", "code_specialized_distillation"]
+ """
+ Distiller package for code-specialized embedding model distillation and evaluation.
+
+ This package provides a complete pipeline for:
+ 1. Distilling code-specialized embedding models using Model2Vec
+ 2. Comprehensive evaluation including CodeSearchNet and performance benchmarks
+ 3. Analysis and reporting of model performance
+
+ Main modules:
+ - distill: Model2Vec distillation with optional advanced training
+ - evaluate: Comprehensive evaluation (CodeSearchNet + performance benchmarks)
+ - analyze: Analysis and reporting tools
+ - config: Centralized configuration management
+ - beam_utils: Beam cloud utilities for distributed processing
+
+ Usage:
+     from distiller import distill, evaluate, analyze
+ """
+
+ from . import analyze, config, distill, evaluate
+ from .analyze import CodeSearchNetAnalyzer
+ from .config import (
+     BEAM_ENV_SETTINGS,
+     DEFAULT_EVALUATION_MODELS,
+     GPU_NAME,
+     IMAGE,
+     codesearchnet_config,
+     directories,
+     distillation_config,
+     get_volume_config,
+     languages_config,
+ )
+ from .distill import (
+     run_beam_distillation,
+     run_local_distillation,
+ )
+ from .evaluate import (
+     CodeSearchNetEvaluator,
+     ComprehensiveModelEvaluator,
+     run_evaluation,
+ )
+
+ __all__ = [
+     # Configuration
+     "BEAM_ENV_SETTINGS",
+     "DEFAULT_EVALUATION_MODELS",
+     "GPU_NAME",
+     "IMAGE",
+     # Main classes
+     "CodeSearchNetAnalyzer",
+     "CodeSearchNetEvaluator",
+     "ComprehensiveModelEvaluator",
+     # Modules
+     "analyze",
+     "codesearchnet_config",
+     "config",
+     "directories",
+     "distill",
+     "distillation_config",
+     "evaluate",
+     "get_volume_config",
+     "languages_config",
+     # Main functions
+     "run_beam_distillation",
+     "run_evaluation",
+     "run_local_distillation",
+ ]
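
A minimal usage sketch of the reorganized package API, based only on the exports declared above. The helper signatures are not shown in this diff, so the argument-free calls and the keyword arguments below are assumptions.

```python
# Hedged sketch: only the names come from __all__ above; call signatures are assumed.
from distiller import CodeSearchNetAnalyzer, run_evaluation, run_local_distillation
from distiller.config import DEFAULT_EVALUATION_MODELS, directories

print(DEFAULT_EVALUATION_MODELS)      # centralized model list from config
results = run_local_distillation()    # distill the configured teacher models locally
run_evaluation()                      # CodeSearchNet retrieval + performance benchmarks

# Analysis consumes the JSON files written by the evaluation step.
analyzer = CodeSearchNetAnalyzer(results_dir=str(directories.evaluation_results))
analyzer.load_results()
```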
src/distiller/__main__.py CHANGED
@@ -1,183 +1,57 @@
  """Main entry point for the distiller package."""

- import argparse
- import sys
-
-
- def main() -> None:
-     """Main entry point for the distiller package."""
-     parser = argparse.ArgumentParser(description="Model2Vec Code-Specialized Distillation Pipeline")
-     subparsers = parser.add_subparsers(dest="command", help="Available commands")
-
-     # Distillation command
-     distill_parser = subparsers.add_parser("distill", help="Run code-specialized model distillation")
-     distill_parser.add_argument("--model", default="Alibaba-NLP/gte-Qwen2-7B-instruct", help="Model to distill")
-     distill_parser.add_argument("--output-dir", default="gte_qwen2_m2v_code", help="Output directory")
-     distill_parser.add_argument("--pca-dims", type=int, default=512, help="PCA dimensions")
-     distill_parser.add_argument("--max-samples", type=int, default=50000, help="Max CodeSearchNet samples")
-     distill_parser.add_argument("--use-beam", action="store_true", help="Use Beam for cloud GPU distillation")
-
-     # Simplified distillation command
-     simple_parser = subparsers.add_parser("distill-simple", help="Run simplified Model2Vec distillation (local)")
-     simple_parser.add_argument(
-         "--teacher", default="sentence-transformers/all-MiniLM-L6-v2", help="Teacher model to distill from"
-     )
-     simple_parser.add_argument("--output-dir", default="gte_qwen2_m2v_code_simplified", help="Output directory")
-     simple_parser.add_argument("--pca-dims", type=int, default=256, help="PCA dimensions")
-
-     # CodeSearchNet evaluation command
-     evaluate_parser = subparsers.add_parser("evaluate", help="Run CodeSearchNet evaluation on all default models")
-     evaluate_parser.add_argument("--use-beam", action="store_true", help="Use Beam for cloud evaluation")
-
-     # CodeSearchNet evaluation command (simplified models only)
-     evaluate_simple_parser = subparsers.add_parser(
-         "evaluate-simple", help="Run CodeSearchNet evaluation on simplified models only"
-     )
-     evaluate_simple_parser.add_argument("--use-beam", action="store_true", help="Use Beam for cloud evaluation")
-
-     # Analysis command
-     analysis_parser = subparsers.add_parser("analyze", help="Generate CodeSearchNet analysis report")
-     analysis_parser.add_argument("--results-dir", default="code_evaluation_results", help="Results directory")
-     analysis_parser.add_argument("--results-file", help="Single results file to analyze")
-     analysis_parser.add_argument("--model-name", default="gte_qwen2_m2v_code", help="Model name for report")
-     analysis_parser.add_argument("--output", default="README.md", help="Output report file")
-     analysis_parser.add_argument("--export-csv", help="Export comparison results to CSV")
-     analysis_parser.add_argument("--use-beam", action="store_true", help="Use Beam for cloud analysis")
-
-     # Sync command
-     sync_parser = subparsers.add_parser("sync", help="Download files from Beam volume to local directory")
-     sync_parser.add_argument("--model-files", action="store_true", help="Download final model files")
-     sync_parser.add_argument(
-         "--analysis-files",
-         action="store_true",
-         help="Download analysis reports and charts",
-     )
-     sync_parser.add_argument("--all", action="store_true", help="Download all generated files")
-     sync_parser.add_argument("--output-dir", default=".", help="Local output directory")
-
-     # Benchmark command
-     benchmark_parser = subparsers.add_parser("benchmark", help="Run performance benchmarking on all default models")
-     benchmark_parser.add_argument("--use-beam", action="store_true", help="Use Beam for cloud benchmarking")
-
-     # Benchmark command (simplified models only)
-     benchmark_simple_parser = subparsers.add_parser(
-         "benchmark-simple", help="Run performance benchmarking on simplified models only"
-     )
-     benchmark_simple_parser.add_argument("--use-beam", action="store_true", help="Use Beam for cloud benchmarking")
-
-     args = parser.parse_args()
-
-     if args.command == "distill":
-         from .distill_simplified import run_local_distillation, beam_distill_all_teachers
-
-         if args.use_beam:
-             # Run on Beam
-             print("Running comprehensive teacher model distillation on Beam...")
-             results = beam_distill_all_teachers()
-         else:
-             # Run locally
-             print("Running comprehensive teacher model distillation locally...")
-             results = run_local_distillation()
-
-         print(f"✅ Distillation complete! Created {results['total_successful']} models")
-         print("📁 Models location: ./code_model2vec/final/")
-         print("\n✅ Created models:")
-         for model_name in results["successful_models"]:
-             model_info = results["all_results"][model_name]
-             print(f" • {model_name} (from {model_info['teacher_model']})")
-
-     elif args.command == "distill-simple":
-         from .distill_simplified import run_local_distillation
-
-         # Run simplified distillation for all teacher models locally
-         print("Running comprehensive teacher model distillation locally...")
-         results = run_local_distillation()
-         print(f"✅ Distillation complete! Created {results['total_successful']} models")
-         print("📁 Models location: ./code_model2vec/final/")
-         print("\n✅ Created models:")
-         for model_name in results["successful_models"]:
-             model_info = results["all_results"][model_name]
-             print(f" • {model_name} (from {model_info['teacher_model']})")
-
-     elif args.command == "evaluate":
-         from .evaluate import main as evaluate_main, run_local_evaluation
-
-         if args.use_beam:
-             # Run on Beam with all default models
-             print("Running comprehensive evaluation on Beam...")
-             evaluate_main()
-         else:
-             # Run locally with all default models
-             print("Running comprehensive evaluation locally...")
-             run_local_evaluation()
-
-     elif args.command == "evaluate-simple":
-         from .evaluate import evaluate_simplified_only, run_local_evaluation_simplified
-
-         if args.use_beam:
-             # Run on Beam with simplified models only
-             print("Running simplified model evaluation on Beam...")
-             evaluate_simplified_only()
-         else:
-             # Run locally with simplified models only
-             print("Running simplified model evaluation locally...")
-             run_local_evaluation_simplified()
-
-     elif args.command == "analyze":
-         from .analyze import main as analyze_main
-
-         # Run locally - Override sys.argv to pass arguments to the analyze script
-         sys.argv = ["analyze.py"]
-         if args.results_dir != "code_evaluation_results":
-             sys.argv.extend(["--results-dir", args.results_dir])
-         if args.results_file:
-             sys.argv.extend(["--results-file", args.results_file])
-         if args.model_name != "gte_qwen2_m2v_code":
-             sys.argv.extend(["--model-name", args.model_name])
-         if args.output != "README.md":
-             sys.argv.extend(["--output", args.output])
-         if args.export_csv:
-             sys.argv.extend(["--export-csv", args.export_csv])
-         analyze_main()
-
-     elif args.command == "sync":
-         from .sync import sync_files
-
-         # Run locally
-         sync_files(
-             model_files=args.model_files,
-             analysis_files=args.analysis_files,
-             all_files=args.all,
-             output_dir=args.output_dir,
-         )
-
-     elif args.command == "benchmark":
-         from .benchmark import main as benchmark_main, run_local_benchmark
-
-         if args.use_beam:
-             # Run on Beam with all default models
-             print("Running comprehensive benchmarking on Beam...")
-             benchmark_main()
-         else:
-             # Run locally with all default models
-             print("Running comprehensive benchmarking locally...")
-             run_local_benchmark()
-
-     elif args.command == "benchmark-simple":
-         from .benchmark import benchmark_simplified_only, run_local_benchmark_simplified
-
-         if args.use_beam:
-             # Run on Beam with simplified models only
-             print("Running simplified model benchmarking on Beam...")
-             benchmark_simplified_only()
-         else:
-             # Run locally with simplified models only
-             print("Running simplified model benchmarking locally...")
-             run_local_benchmark_simplified()
-
-     else:
-         parser.print_help()
-
+ from typing import Annotated
+
+ import typer
+
+ app = typer.Typer(
+     help="Model2Vec Code-Specialized Distillation Pipeline",
+     no_args_is_help=True,
+     context_settings={"help_option_names": ["-h", "--help"]},
+ )
+
+
+ @app.command()
+ def distill(
+     use_beam: Annotated[bool, typer.Option(help="Use Beam for distillation")] = False,
+     train: Annotated[bool, typer.Option(help="Enable advanced training (CodeSearchNet fine-tuning)")] = False,
+     teacher_models: Annotated[list[str] | None, typer.Option(help="Specific teacher models to distill")] = None,
+     pca_dims: Annotated[int | None, typer.Option(help="PCA dimensions (uses config default if not specified)")] = None,
+ ) -> None:
+     """Run unified Model2Vec distillation with optional training."""
+     from .distill import main as distill_main
+
+     # Call the distill main function with arguments
+     distill_main(use_beam, train, teacher_models, pca_dims)
+
+
+ @app.command()
+ def evaluate(
+     use_beam: Annotated[bool, typer.Option(help="Use Beam for evaluation")] = False,
+     skip_third_party: Annotated[bool, typer.Option(help="Skip third-party models")] = False,
+     skip_benchmark: Annotated[bool, typer.Option(help="Skip performance benchmarking")] = False,
+     max_queries: Annotated[int, typer.Option(help="Maximum queries per language")] = 1000,
+ ) -> None:
+     """Run CodeSearchNet evaluation on models."""
+     from .evaluate import main as evaluate_main
+
+     # Call the evaluate main function with arguments
+     evaluate_main(use_beam, skip_third_party, skip_benchmark, max_queries)
+
+
+ @app.command()
+ def analyze(
+     results_dir: Annotated[str | None, typer.Option(help="Results directory")] = None,
+     model_name: Annotated[str, typer.Option(help="Model name for analysis")] = "gte_qwen2_m2v_code (Ours)",
+     output: Annotated[str, typer.Option(help="Output report file")] = "REPORT.md",
+     export_csv: Annotated[str | None, typer.Option(help="Export results to CSV")] = None,
+ ) -> None:
+     """Generate comprehensive analysis reports."""
+     from .analyze import main as analyze_main
+
+     # Call the analyze main function with arguments
+     analyze_main(results_dir or "code_model2vec/evaluation_results", model_name, output, export_csv)


  if __name__ == "__main__":
-     main()
+     app()
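
The Typer app above replaces the old argparse dispatcher. A small sketch of driving it in-process with Typer's bundled test runner, roughly equivalent to running `python -m distiller distill --train` from a shell:

```python
# Sketch only: exercises the new CLI without spawning a subprocess.
from typer.testing import CliRunner

from distiller.__main__ import app

runner = CliRunner()
result = runner.invoke(app, ["distill", "--train", "--pca-dims", "256"])
print(result.exit_code, result.output)

# The same pattern covers the other subcommands, e.g.:
# runner.invoke(app, ["evaluate", "--skip-benchmark", "--max-queries", "500"])
# runner.invoke(app, ["analyze", "--output", "REPORT.md"])
```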
src/distiller/analyze.py CHANGED
@@ -23,7 +23,6 @@ Usage:
23
  distiller analyze --results-dir evaluation_results
24
  """
25
 
26
- import argparse
27
  import json
28
  import logging
29
  import time
@@ -35,6 +34,8 @@ import numpy as np
35
  import pandas as pd
36
  import seaborn as sns
37
 
 
 
38
  # Optional Plotly import with fallback
39
  PLOTLY_AVAILABLE = True
40
  try:
@@ -65,48 +66,140 @@ OUTPUT_DIR = Path("analysis_results")
65
  IMAGES_DIR = Path("analysis_charts")
66
  REPORT_FILE = Path("REPORT.md") # Changed from README.md
67
 
68
- # Local directories for results - updated for new structure
69
- DEFAULT_EVALUATION_DIR = "code_model2vec/evaluation_results"
70
- DEFAULT_BENCHMARK_DIR = "code_model2vec/benchmark_results"
71
 
72
  # CodeSearchNet Languages
73
  CODE_LANGUAGES = ["python", "javascript", "java", "php", "ruby", "go"]
74
 
75
  # Model name mapping from the default models in evaluate.py and benchmark.py
76
  MODEL_NAME_MAPPING = {
77
- # File names to display names
78
- "gte_qwen2_m2v_code": "gte_qwen2_m2v_code (Ours)",
79
- "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
80
- "codebert-base": "microsoft/codebert-base",
81
- "graphcodebert-base": "microsoft/graphcodebert-base",
82
- "CodeBERTa-small-v1": "huggingface/CodeBERTa-small-v1",
83
- "all-mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
84
- "all-MiniLM-L12-v2": "sentence-transformers/all-MiniLM-L12-v2",
85
- "potion-base-8M": "minishlab/potion-base-8M",
86
- "potion-retrieval-32M": "minishlab/potion-retrieval-32M",
87
- "codet5-base": "Salesforce/codet5-base",
 
 
88
  }
89
 
90
- # Reverse mapping for lookups
91
- DISPLAY_NAME_TO_FILE = {v: k for k, v in MODEL_NAME_MAPPING.items()}
92
 
93
  # Peer models for comparison (code-specialized models)
94
  PEER_MODELS = {
95
- "sentence-transformers/all-MiniLM-L6-v2": {"overall_ndcg": 0.25, "type": "General"},
96
- "microsoft/codebert-base": {"overall_ndcg": 0.32, "type": "Code-Specific"},
97
- "microsoft/graphcodebert-base": {"overall_ndcg": 0.35, "type": "Code-Specific"},
98
- "huggingface/CodeBERTa-small-v1": {"overall_ndcg": 0.28, "type": "Code-Specific"},
99
- "sentence-transformers/all-mpnet-base-v2": {"overall_ndcg": 0.27, "type": "General"},
 
 
 
100
  }
101
 
102
  # Model specifications for efficiency analysis
103
  MODEL_SPECS = {
104
- "sentence-transformers/all-MiniLM-L6-v2": {"parameters": 22.7, "size_mb": 90},
105
- "microsoft/codebert-base": {"parameters": 125.0, "size_mb": 500},
106
- "microsoft/graphcodebert-base": {"parameters": 125.0, "size_mb": 500},
107
- "huggingface/CodeBERTa-small-v1": {"parameters": 84.0, "size_mb": 340},
108
- "sentence-transformers/all-mpnet-base-v2": {"parameters": 109.0, "size_mb": 440},
109
- "Alibaba-NLP/gte-Qwen2-7B-instruct": {"parameters": 7000.0, "size_mb": 13000},
 
 
 
110
  }
111
 
112
  # Distilled model specifications
@@ -134,13 +227,12 @@ def setup_directories(base_path: Path | None = None) -> tuple[Path, Path, Path]:
134
  images_dir = base_path / "analysis_results" / "charts"
135
  reports_dir = base_path / "analysis_results" / "reports"
136
  else:
137
- output_dir = OUTPUT_DIR
138
- images_dir = IMAGES_DIR
139
- reports_dir = OUTPUT_DIR / "reports"
140
 
141
- output_dir.mkdir(parents=True, exist_ok=True)
142
  images_dir.mkdir(parents=True, exist_ok=True)
143
- reports_dir.mkdir(parents=True, exist_ok=True)
144
 
145
  return output_dir, images_dir, reports_dir
146
 
@@ -152,17 +244,94 @@ def extract_model_name_from_filename(filename: str) -> str:
152
 
153
  # Check if it's in our mapping
154
  if name in MODEL_NAME_MAPPING:
155
- return MODEL_NAME_MAPPING[name]
156
 
157
  # Try to find partial matches
158
- for file_key, display_name in MODEL_NAME_MAPPING.items():
159
  if file_key in name or name in file_key:
160
- return display_name
161
 
162
  # If no mapping found, return the cleaned name
163
  return name
164
 
165
 
 
 
 
166
  class CodeSearchNetAnalyzer:
167
  """Analyzer for CodeSearchNet evaluation results and performance benchmarks."""
168
 
@@ -182,33 +351,43 @@ class CodeSearchNetAnalyzer:
182
  self.benchmark_df: pd.DataFrame | None = None
183
 
184
  def load_benchmark_results(self) -> None:
185
- """Load benchmark results from local directory."""
186
- logger.info("📊 Loading benchmark results...")
187
 
188
- if not self.benchmark_dir.exists():
189
- logger.warning(f"Benchmark directory not found: {self.benchmark_dir}")
190
  return
191
 
192
- logger.info(f"🔍 Searching for benchmark files in: {self.benchmark_dir}")
193
- benchmark_files = list(self.benchmark_dir.glob("benchmark_*.json"))
194
- logger.info(f"📁 Found {len(benchmark_files)} benchmark files")
195
 
196
- for benchmark_file_path in benchmark_files:
 
 
 
 
 
 
 
 
 
197
  try:
198
- logger.info(f"📖 Loading: {benchmark_file_path.name}")
199
- with benchmark_file_path.open() as f:
200
  data = json.load(f)
 
201
  if data is not None:
202
- # Update model name with proper mapping
203
- original_name = data.get("model_name", "Unknown")
204
- mapped_name = extract_model_name_from_filename(benchmark_file_path.stem)
205
- data["model_name"] = mapped_name
206
- data["original_model_name"] = original_name
207
-
208
- self.benchmark_results.append(data)
209
- logger.info(f"✅ Successfully loaded: {mapped_name}")
 
 
210
  except (json.JSONDecodeError, KeyError) as e:
211
- logger.warning(f"❌ Failed to load {benchmark_file_path}: {e}")
212
 
213
  logger.info(f"📊 Total benchmark results loaded: {len(self.benchmark_results)}")
214
  if self.benchmark_results:
@@ -217,6 +396,34 @@ class CodeSearchNetAnalyzer:
217
 
218
  self._create_benchmark_dataframe()
219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  def _create_benchmark_dataframe(self) -> None:
221
  """Create benchmark comparison DataFrame from results."""
222
  if not self.benchmark_results:
@@ -263,10 +470,10 @@ class CodeSearchNetAnalyzer:
263
  )
264
 
265
  # CPU vs GPU comparison
266
- for device in ["cpu", "cuda"]:
267
- if device in cpu_vs_gpu and "error" not in cpu_vs_gpu[device]:
268
  device_key = f"{device.upper()}_TextsPerSec"
269
- row[device_key] = cpu_vs_gpu[device].get("texts_per_second", 0)
270
 
271
  benchmark_data.append(row)
272
 
@@ -281,23 +488,31 @@ class CodeSearchNetAnalyzer:
281
  return
282
 
283
  logger.info(f"🔍 Searching for evaluation files in: {self.results_dir}")
284
- json_files = list(self.results_dir.glob("codesearchnet_eval_*.json"))
285
- logger.info(f"📁 Found {len(json_files)} evaluation files")
286
 
287
- for json_file in json_files:
 
 
 
 
 
 
 
 
 
288
  try:
289
  logger.info(f"📖 Loading: {json_file.name}")
290
  with json_file.open() as f:
291
  data = json.load(f)
292
  if data is not None:
293
- # Update model name with proper mapping
294
- original_name = data.get("model_name", "Unknown")
295
- mapped_name = extract_model_name_from_filename(json_file.stem)
296
- data["model_name"] = mapped_name
297
- data["original_model_name"] = original_name
298
-
299
- self.results.append(data)
300
- logger.info(f"✅ Successfully loaded: {mapped_name}")
 
301
  except (json.JSONDecodeError, KeyError) as e:
302
  logger.warning(f"❌ Failed to load {json_file}: {e}")
303
 
@@ -311,6 +526,32 @@ class CodeSearchNetAnalyzer:
311
  # Also load benchmark results
312
  self.load_benchmark_results()
313
 
 
 
 
314
  def _create_comparison_dataframe(self) -> None:
315
  """Create comparison DataFrame from results."""
316
  if not self.results:
@@ -453,7 +694,7 @@ class CodeSearchNetAnalyzer:
453
  if cpu_vs_gpu:
454
  print("🖥️ CPU vs GPU:")
455
  for device, metrics in cpu_vs_gpu.items():
456
- if "error" not in metrics:
457
  print(f" {device.upper()}: {metrics.get('texts_per_second', 0):.1f} texts/sec")
458
 
459
  # Memory efficiency
@@ -978,7 +1219,7 @@ class CodeSearchNetAnalyzer:
978
  # Safe conversion to float for pandas values
979
  score_value = pd.to_numeric(current_model_score, errors="coerce")
980
  scores.append(float(score_value) if not pd.isna(score_value) else 0.0)
981
- params.append(float(MODEL_SPECS[model_key].get("parameters", 100)))
982
  is_user_model.append(False)
983
 
984
  if not models:
@@ -1098,7 +1339,7 @@ class CodeSearchNetAnalyzer:
1098
 
1099
  # Create visualizations
1100
  logger.info("Generating visualizations...")
1101
- setup_directories()
1102
 
1103
  self.create_performance_radar_chart(main_model_name, language_scores)
1104
  comparison_chart = self.plot_model_comparison()
@@ -1163,21 +1404,14 @@ This report presents a comprehensive analysis of Model2Vec distillation experime
1163
  overall_metrics = result.get("overall", {})
1164
 
1165
  # Extract teacher model name from model name
1166
- teacher = "Unknown"
1167
- if "all_MiniLM_L6_v2" in model_display:
1168
- teacher = "all-MiniLM-L6-v2"
1169
- elif "codebert_base" in model_display:
1170
- teacher = "codebert-base"
1171
- elif "graphcodebert_base" in model_display:
1172
- teacher = "graphcodebert-base"
1173
- elif "gte_Qwen2_7B_instruct" in model_display:
1174
- teacher = "gte-Qwen2-7B-instruct"
1175
- elif "all_mpnet_base_v2" in model_display:
1176
- teacher = "all-mpnet-base-v2"
1177
 
1178
  status = "🥇 Best" if rank == 1 else "🥈 2nd" if rank == 2 else "🥉 3rd" if rank == 3 else f"#{rank}"
1179
 
1180
- report += f"| {model_display} | {teacher} | {overall_metrics.get('ndcg@10', 0):.4f} | {overall_metrics.get('mrr', 0):.4f} | {overall_metrics.get('recall@5', 0):.4f} | {status} |\n"
 
 
 
1181
 
1182
  report += """
1183
 
@@ -1215,19 +1449,12 @@ This report presents a comprehensive analysis of Model2Vec distillation experime
1215
  report += "### Individual Model Performance by Language\n\n"
1216
  for chart_model_name, chart_path in individual_radar_charts.items():
1217
  # Extract teacher name for cleaner display
1218
- teacher = "Unknown"
1219
- if "all_MiniLM_L6_v2" in chart_model_name:
1220
- teacher = "all-MiniLM-L6-v2"
1221
- elif "codebert_base" in chart_model_name:
1222
- teacher = "codebert-base"
1223
- elif "graphcodebert_base" in chart_model_name:
1224
- teacher = "graphcodebert-base"
1225
- elif "gte_Qwen2_7B_instruct" in chart_model_name:
1226
- teacher = "gte-Qwen2-7B-instruct"
1227
- elif "all_mpnet_base_v2" in chart_model_name:
1228
- teacher = "all-mpnet-base-v2"
1229
-
1230
- report += f"#### {chart_model_name} (Teacher: {teacher})\n\n"
1231
  report += f"![{chart_model_name} Radar Chart]({chart_path})\n\n"
1232
 
1233
  report += f"""
@@ -1324,7 +1551,7 @@ This report presents a comprehensive analysis of Model2Vec distillation experime
1324
 
1325
  if language_scores:
1326
  report += "| Language | Best Model Performance | Average Performance | Language Difficulty |\n"
1327
- report += "|----------|------------------------|--------------------|--------------------||\n"
1328
 
1329
  for lang in sorted(language_scores.keys()):
1330
  # Find best performance for this language across all models
@@ -1358,16 +1585,8 @@ Based on the evaluation results across all simplified distillation models:
1358
  model_name = result["model_name"]
1359
  score = result.get("overall", {}).get("ndcg@10", 0)
1360
 
1361
- if "all_MiniLM_L6_v2" in model_name:
1362
- teacher_performance["all-MiniLM-L6-v2"] = score
1363
- elif "codebert_base" in model_name:
1364
- teacher_performance["codebert-base"] = score
1365
- elif "graphcodebert_base" in model_name:
1366
- teacher_performance["graphcodebert-base"] = score
1367
- elif "gte_Qwen2_7B_instruct" in model_name:
1368
- teacher_performance["gte-Qwen2-7B-instruct"] = score
1369
- elif "all_mpnet_base_v2" in model_name:
1370
- teacher_performance["all-mpnet-base-v2"] = score
1371
 
1372
  if teacher_performance:
1373
  best_teacher = max(teacher_performance.items(), key=lambda x: x[1])
@@ -1397,11 +1616,20 @@ Based on the evaluation results across all simplified distillation models:
1397
  - **Evaluation**: Retrieval of correct code for each documentation query
1398
 
1399
  ### Teacher Models Tested
1400
- - sentence-transformers/all-MiniLM-L6-v2 (proven baseline)
1401
- - microsoft/codebert-base (code-specialized)
1402
- - microsoft/graphcodebert-base (graph-aware code model)
1403
- - Alibaba-NLP/gte-Qwen2-7B-instruct (large instruction model)
1404
- - sentence-transformers/all-mpnet-base-v2 (general purpose)
 
 
 
 
 
 
 
 
 
1405
 
1406
  ### Distillation Method
1407
  - **Technique**: Model2Vec static embedding generation
@@ -1424,31 +1652,27 @@ Based on the evaluation results across all simplified distillation models:
1424
  logger.info(f"Results exported to {output_file}")
1425
 
1426
 
1427
- def main() -> None:
 
 
 
 
 
1428
  """Main analysis function."""
1429
- parser = argparse.ArgumentParser(description="Analyze CodeSearchNet evaluation results and performance benchmarks")
1430
- parser.add_argument("--results-dir", default=DEFAULT_EVALUATION_DIR, help="Evaluation results directory")
1431
- parser.add_argument("--benchmark-dir", default=DEFAULT_BENCHMARK_DIR, help="Benchmark results directory")
1432
- parser.add_argument("--model-name", default="gte_qwen2_m2v_code (Ours)", help="Model name for report")
1433
- parser.add_argument("--output", default="REPORT.md", help="Output report file")
1434
- parser.add_argument("--export-csv", help="Export comparison results to CSV")
1435
-
1436
- args = parser.parse_args()
1437
-
1438
- logger.info("Starting CodeSearchNet Analysis with Benchmark Integration")
1439
  logger.info("=" * 60)
1440
 
1441
  # Setup output directories
1442
  output_dir, images_dir, reports_dir = setup_directories()
1443
 
1444
- # Initialize analyzer with local directories
1445
  analyzer = CodeSearchNetAnalyzer(
1446
- results_dir=args.results_dir,
1447
- benchmark_dir=args.benchmark_dir,
1448
  images_dir=images_dir,
1449
  )
1450
 
1451
- # Load results (this will also load benchmark results)
1452
  analyzer.load_results()
1453
 
1454
  if not analyzer.results:
@@ -1463,33 +1687,32 @@ def main() -> None:
1463
  if analyzer.benchmark_results:
1464
  analyzer.analyze_benchmark_performance()
1465
  else:
1466
- logger.warning("No benchmark results found. Run benchmark.py first for complete analysis.")
1467
 
1468
  # Generate comprehensive report with benchmark integration
1469
- logger.info("Generating comprehensive report with benchmark data...")
1470
- report = analyzer.generate_comprehensive_report(args.model_name)
1471
 
1472
  # Save report
1473
- report_path = Path(args.output)
1474
  with report_path.open("w") as f:
1475
  f.write(report)
1476
 
1477
  # Export CSV if requested
1478
- if args.export_csv:
1479
- analyzer.export_results(args.export_csv)
1480
 
1481
  # Export benchmark CSV if available
1482
  if analyzer.benchmark_df is not None and not analyzer.benchmark_df.empty:
1483
- benchmark_csv = report_path.parent / f"{args.model_name}_benchmark_comparison.csv"
1484
  analyzer.benchmark_df.to_csv(benchmark_csv, index=False)
1485
  logger.info(f"📊 Benchmark comparison saved to: {benchmark_csv}")
1486
 
1487
- logger.info("✅ CodeSearchNet analysis with benchmarks complete!")
1488
  logger.info(f"📊 Report saved to: {report_path}")
1489
  logger.info(f"🖼️ Charts saved to: {images_dir}")
 
1490
 
1491
 
1492
  if __name__ == "__main__":
1493
- import argparse
1494
-
1495
  main()
 
23
  distiller analyze --results-dir evaluation_results
24
  """
25
 
 
26
  import json
27
  import logging
28
  import time
 
34
  import pandas as pd
35
  import seaborn as sns
36
 
37
+ from .config import directories
38
+
39
  # Optional Plotly import with fallback
40
  PLOTLY_AVAILABLE = True
41
  try:
 
66
  IMAGES_DIR = Path("analysis_charts")
67
  REPORT_FILE = Path("REPORT.md") # Changed from README.md
68
 
69
+ # Local directories for results - using standardized directories from config
70
+ DEFAULT_EVALUATION_DIR = directories.evaluation_results
71
+ DEFAULT_BENCHMARK_DIR = directories.benchmark_results
72
 
73
  # CodeSearchNet Languages
74
  CODE_LANGUAGES = ["python", "javascript", "java", "php", "ruby", "go"]
75
 
76
  # Model name mapping from the default models in evaluate.py and benchmark.py
77
  MODEL_NAME_MAPPING = {
78
+ # File names to display names and HuggingFace links
79
+ "all-MiniLM-L6-v2": {
80
+ "name": "sentence-transformers/all-MiniLM-L6-v2",
81
+ "link": "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
82
+ },
83
+ "all-mpnet-base-v2": {
84
+ "name": "sentence-transformers/all-mpnet-base-v2",
85
+ "link": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
86
+ },
87
+ "paraphrase-MiniLM-L6-v2": {
88
+ "name": "sentence-transformers/paraphrase-MiniLM-L6-v2",
89
+ "link": "https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2",
90
+ },
91
+ "codebert-base": {"name": "microsoft/codebert-base", "link": "https://huggingface.co/microsoft/codebert-base"},
92
+ "graphcodebert-base": {
93
+ "name": "microsoft/graphcodebert-base",
94
+ "link": "https://huggingface.co/microsoft/graphcodebert-base",
95
+ },
96
+ "CodeBERTa-small-v1": {
97
+ "name": "huggingface/CodeBERTa-small-v1",
98
+ "link": "https://huggingface.co/huggingface/CodeBERTa-small-v1",
99
+ },
100
+ "all-MiniLM-L12-v2": {
101
+ "name": "sentence-transformers/all-MiniLM-L12-v2",
102
+ "link": "https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2",
103
+ },
104
+ "potion-base-8M": {"name": "minishlab/potion-base-8M", "link": "https://huggingface.co/minishlab/potion-base-8M"},
105
+ "potion-retrieval-32M": {
106
+ "name": "minishlab/potion-retrieval-32M",
107
+ "link": "https://huggingface.co/minishlab/potion-retrieval-32M",
108
+ },
109
+ "codet5-base": {"name": "Salesforce/codet5-base", "link": "https://huggingface.co/Salesforce/codet5-base"},
110
+ "gte-Qwen2-1.5B-instruct": {
111
+ "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
112
+ "link": "https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct",
113
+ },
114
+ "bge-m3": {"name": "BAAI/bge-m3", "link": "https://huggingface.co/BAAI/bge-m3"},
115
+ "jina-embeddings-v3": {
116
+ "name": "jinaai/jina-embeddings-v3",
117
+ "link": "https://huggingface.co/jinaai/jina-embeddings-v3",
118
+ },
119
+ "nomic-embed-text-v2-moe": {
120
+ "name": "nomic-ai/nomic-embed-text-v2-moe",
121
+ "link": "https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe",
122
+ },
123
+ "Qodo-Embed-1-1.5B": {"name": "Qodo/Qodo-Embed-1-1.5B", "link": "https://huggingface.co/Qodo/Qodo-Embed-1-1.5B"},
124
+ "Reason-ModernColBERT": {
125
+ "name": "lightonai/Reason-ModernColBERT",
126
+ "link": "https://huggingface.co/lightonai/Reason-ModernColBERT",
127
+ },
128
+ "Linq-Embed-Mistral": {
129
+ "name": "Linq-AI-Research/Linq-Embed-Mistral",
130
+ "link": "https://huggingface.co/Linq-AI-Research/Linq-Embed-Mistral",
131
+ },
132
+ "bge-code-v1": {"name": "BAAI/bge-code-v1", "link": "https://huggingface.co/BAAI/bge-code-v1"},
133
+ "SFR-Embedding-Code-2B_R": {
134
+ "name": "Salesforce/SFR-Embedding-Code-2B_R",
135
+ "link": "https://huggingface.co/Salesforce/SFR-Embedding-Code-2B_R",
136
+ },
137
  }
138
 
139
+ # Reverse mapping for lookups - using just the names
140
+ DISPLAY_NAME_TO_FILE = {v["name"]: k for k, v in MODEL_NAME_MAPPING.items()}
141
 
142
  # Peer models for comparison (code-specialized models)
143
  PEER_MODELS = {
144
+ "sentence-transformers/all-MiniLM-L6-v2": {
145
+ "overall_ndcg": 0.25,
146
+ "type": "General",
147
+ "link": "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
148
+ },
149
+ "microsoft/codebert-base": {
150
+ "overall_ndcg": 0.32,
151
+ "type": "Code-Specific",
152
+ "link": "https://huggingface.co/microsoft/codebert-base",
153
+ },
154
+ "microsoft/graphcodebert-base": {
155
+ "overall_ndcg": 0.35,
156
+ "type": "Code-Specific",
157
+ "link": "https://huggingface.co/microsoft/graphcodebert-base",
158
+ },
159
+ "huggingface/CodeBERTa-small-v1": {
160
+ "overall_ndcg": 0.28,
161
+ "type": "Code-Specific",
162
+ "link": "https://huggingface.co/huggingface/CodeBERTa-small-v1",
163
+ },
164
+ "sentence-transformers/all-mpnet-base-v2": {
165
+ "overall_ndcg": 0.27,
166
+ "type": "General",
167
+ "link": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
168
+ },
169
  }
170
 
171
  # Model specifications for efficiency analysis
172
  MODEL_SPECS = {
173
+ "sentence-transformers/all-MiniLM-L6-v2": {
174
+ "parameters": 22.7,
175
+ "size_mb": 90,
176
+ "link": "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
177
+ },
178
+ "microsoft/codebert-base": {
179
+ "parameters": 125.0,
180
+ "size_mb": 500,
181
+ "link": "https://huggingface.co/microsoft/codebert-base",
182
+ },
183
+ "microsoft/graphcodebert-base": {
184
+ "parameters": 125.0,
185
+ "size_mb": 500,
186
+ "link": "https://huggingface.co/microsoft/graphcodebert-base",
187
+ },
188
+ "huggingface/CodeBERTa-small-v1": {
189
+ "parameters": 84.0,
190
+ "size_mb": 340,
191
+ "link": "https://huggingface.co/huggingface/CodeBERTa-small-v1",
192
+ },
193
+ "sentence-transformers/all-mpnet-base-v2": {
194
+ "parameters": 109.0,
195
+ "size_mb": 440,
196
+ "link": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
197
+ },
198
+ "Alibaba-NLP/gte-Qwen2-1.5B-instruct": {
199
+ "parameters": 1500.0,
200
+ "size_mb": 3000,
201
+ "link": "https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct",
202
+ },
203
  }
204
 
205
  # Distilled model specifications
 
227
  images_dir = base_path / "analysis_results" / "charts"
228
  reports_dir = base_path / "analysis_results" / "reports"
229
  else:
230
+ output_dir = Path() # Use current directory
231
+ images_dir = IMAGES_DIR # Use analysis_charts
232
+ reports_dir = Path() # Use current directory for reports
233
 
234
+ # Only create directories that we actually use
235
  images_dir.mkdir(parents=True, exist_ok=True)
 
236
 
237
  return output_dir, images_dir, reports_dir
238
 
 
244
 
245
  # Check if it's in our mapping
246
  if name in MODEL_NAME_MAPPING:
247
+ return MODEL_NAME_MAPPING[name]["name"]
248
 
249
  # Try to find partial matches
250
+ for file_key, model_info in MODEL_NAME_MAPPING.items():
251
  if file_key in name or name in file_key:
252
+ return model_info["name"]
253
 
254
  # If no mapping found, return the cleaned name
255
  return name
256
 
257
 
258
+ def get_model_link(model_name: str) -> str:
259
+ """Get HuggingFace link for a model."""
260
+ # First try direct lookup by file key
261
+ for model_info in MODEL_NAME_MAPPING.values():
262
+ if model_info["name"] == model_name:
263
+ return model_info["link"]
264
+
265
+ # Try partial matches
266
+ for model_info in MODEL_NAME_MAPPING.values():
267
+ if model_name.lower() in model_info["name"].lower() or model_info["name"].lower() in model_name.lower():
268
+ return model_info["link"]
269
+
270
+ # If no mapping found, construct link from model name
271
+ if "/" in model_name:
272
+ return f"https://huggingface.co/{model_name}"
273
+ return ""
274
+
275
+
276
+ def format_model_with_link(model_name: str) -> str:
277
+ """Format model name with markdown link."""
278
+ link = get_model_link(model_name)
279
+ if link:
280
+ return f"[{model_name}]({link})"
281
+ return model_name
282
+
283
+
284
+ def get_teacher_model_info(model_display_name: str) -> tuple[str, str]:
285
+ """Extract teacher model name and link from distilled model display name."""
286
+ # Mapping from model display patterns to teacher models
287
+ teacher_mapping = {
288
+ "all_MiniLM_L6_v2": (
289
+ "sentence-transformers/all-MiniLM-L6-v2",
290
+ "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
291
+ ),
292
+ "all_mpnet_base_v2": (
293
+ "sentence-transformers/all-mpnet-base-v2",
294
+ "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
295
+ ),
296
+ "paraphrase_MiniLM_L6_v2": (
297
+ "sentence-transformers/paraphrase-MiniLM-L6-v2",
298
+ "https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2",
299
+ ),
300
+ "codebert_base": ("microsoft/codebert-base", "https://huggingface.co/microsoft/codebert-base"),
301
+ "graphcodebert_base": ("microsoft/graphcodebert-base", "https://huggingface.co/microsoft/graphcodebert-base"),
302
+ "gte_Qwen2_1.5B_instruct": (
303
+ "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
304
+ "https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct",
305
+ ),
306
+ "bge_m3": ("BAAI/bge-m3", "https://huggingface.co/BAAI/bge-m3"),
307
+ "jina_embeddings_v3": ("jinaai/jina-embeddings-v3", "https://huggingface.co/jinaai/jina-embeddings-v3"),
308
+ "nomic_embed_text_v2_moe": (
309
+ "nomic-ai/nomic-embed-text-v2-moe",
310
+ "https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe",
311
+ ),
312
+ "Qodo_Embed_1_1.5B": ("Qodo/Qodo-Embed-1-1.5B", "https://huggingface.co/Qodo/Qodo-Embed-1-1.5B"),
313
+ "Reason_ModernColBERT": (
314
+ "lightonai/Reason-ModernColBERT",
315
+ "https://huggingface.co/lightonai/Reason-ModernColBERT",
316
+ ),
317
+ "Linq_Embed_Mistral": (
318
+ "Linq-AI-Research/Linq-Embed-Mistral",
319
+ "https://huggingface.co/Linq-AI-Research/Linq-Embed-Mistral",
320
+ ),
321
+ "bge_code_v1": ("BAAI/bge-code-v1", "https://huggingface.co/BAAI/bge-code-v1"),
322
+ "SFR_Embedding_Code_2B_R": (
323
+ "Salesforce/SFR-Embedding-Code-2B_R",
324
+ "https://huggingface.co/Salesforce/SFR-Embedding-Code-2B_R",
325
+ ),
326
+ }
327
+
328
+ for pattern, (teacher_name, teacher_link) in teacher_mapping.items():
329
+ if pattern in model_display_name:
330
+ return teacher_name, teacher_link
331
+
332
+ return "Unknown", ""
333
+
334
+
335
  class CodeSearchNetAnalyzer:
336
  """Analyzer for CodeSearchNet evaluation results and performance benchmarks."""
337
 
 
351
  self.benchmark_df: pd.DataFrame | None = None
352
 
353
  def load_benchmark_results(self) -> None:
354
+ """Load benchmark results from comprehensive evaluation files."""
355
+ logger.info("📊 Loading benchmark results from comprehensive evaluations...")
356
 
357
+ if not self.results_dir.exists():
358
+ logger.warning(f"Evaluation directory not found: {self.results_dir}")
359
  return
360
 
361
+ logger.info(f"🔍 Searching for comprehensive evaluation files in: {self.results_dir}")
 
 
362
 
363
+ # Look for both new comprehensive format and legacy formats
364
+ comprehensive_files = list(self.results_dir.glob("comprehensive_eval_*.json"))
365
+ legacy_files = list(self.results_dir.glob("codesearchnet_eval_*.json"))
366
+
367
+ all_files = comprehensive_files + legacy_files
368
+ logger.info(
369
+ f"📁 Found {len(all_files)} evaluation files ({len(comprehensive_files)} comprehensive, {len(legacy_files)} legacy)"
370
+ )
371
+
372
+ for eval_file_path in all_files:
373
  try:
374
+ logger.info(f"📖 Loading: {eval_file_path.name}")
375
+ with eval_file_path.open() as f:
376
  data = json.load(f)
377
+
378
  if data is not None:
379
+ if not isinstance(data, dict):
380
+ logger.warning(f"⚠️ Skipping {eval_file_path.name} (not a dict)")
381
+ continue
382
+
383
+ # Extract benchmark data if available
384
+ benchmark_data = self._extract_benchmark_data(data, eval_file_path)
385
+ if benchmark_data:
386
+ self.benchmark_results.append(benchmark_data)
387
+ logger.info(f"✅ Successfully loaded benchmark data: {benchmark_data['model_name']}")
388
+
389
  except (json.JSONDecodeError, KeyError) as e:
390
+ logger.warning(f"❌ Failed to load {eval_file_path}: {e}")
391
 
392
  logger.info(f"📊 Total benchmark results loaded: {len(self.benchmark_results)}")
393
  if self.benchmark_results:
 
396
 
397
  self._create_benchmark_dataframe()
398
 
399
+ def _extract_benchmark_data(self, data: dict, file_path: Path) -> dict[str, Any] | None:
400
+ """Extract benchmark data from comprehensive evaluation results."""
401
+ # Check if this evaluation contains benchmark data
402
+ if data.get("benchmark_skipped", False):
403
+ return None
404
+
405
+ # Check for benchmark fields
406
+ if not any(key in data for key in ["size_metrics", "speed_benchmarks", "memory_benchmarks", "cpu_vs_gpu"]):
407
+ return None
408
+
409
+ # Extract model name
410
+ original_name = data.get("model_name") or "Unknown"
411
+ mapped_name = extract_model_name_from_filename(
412
+ file_path.stem.replace("comprehensive_eval_", "").replace("codesearchnet_eval_", "")
413
+ )
414
+
415
+ # Create benchmark result structure
416
+ result: dict[str, Any] = {
417
+ "model_name": mapped_name,
418
+ "original_model_name": original_name,
419
+ "size_metrics": data.get("size_metrics", {}),
420
+ "speed_benchmarks": data.get("speed_benchmarks", {}),
421
+ "memory_benchmarks": data.get("memory_benchmarks", {}),
422
+ "cpu_vs_gpu": data.get("cpu_vs_gpu", {}),
423
+ }
424
+
425
+ return result
426
+
427
  def _create_benchmark_dataframe(self) -> None:
428
  """Create benchmark comparison DataFrame from results."""
429
  if not self.benchmark_results:
 
470
  )
471
 
472
  # CPU vs GPU comparison
473
+ for device, metrics in cpu_vs_gpu.items():
474
+ if isinstance(metrics, dict) and "error" not in metrics:
475
  device_key = f"{device.upper()}_TextsPerSec"
476
+ row[device_key] = metrics.get("texts_per_second", 0)
477
 
478
  benchmark_data.append(row)
479
 
 
488
  return
489
 
490
  logger.info(f"🔍 Searching for evaluation files in: {self.results_dir}")
 
 
491
 
492
+ # Look for both new comprehensive format and legacy formats
493
+ comprehensive_files = list(self.results_dir.glob("comprehensive_eval_*.json"))
494
+ legacy_files = list(self.results_dir.glob("codesearchnet_eval_*.json"))
495
+
496
+ all_files = comprehensive_files + legacy_files
497
+ logger.info(
498
+ f"📁 Found {len(all_files)} evaluation files ({len(comprehensive_files)} comprehensive, {len(legacy_files)} legacy)"
499
+ )
500
+
501
+ for json_file in all_files:
502
  try:
503
  logger.info(f"📖 Loading: {json_file.name}")
504
  with json_file.open() as f:
505
  data = json.load(f)
506
  if data is not None:
507
+ if not isinstance(data, dict):
508
+ logger.warning(f"⚠️ Skipping {json_file.name} (not a dict)")
509
+ continue
510
+
511
+ # Normalize data format for analysis
512
+ normalized_data = self._normalize_evaluation_data(data, json_file)
513
+ self.results.append(normalized_data)
514
+ logger.info(f"✅ Successfully loaded: {normalized_data['model_name']}")
515
+
516
  except (json.JSONDecodeError, KeyError) as e:
517
  logger.warning(f"❌ Failed to load {json_file}: {e}")
518
 
 
526
  # Also load benchmark results
527
  self.load_benchmark_results()
528
 
529
+ def _normalize_evaluation_data(self, data: dict, file_path: Path) -> dict[str, Any]:
530
+ """Normalize evaluation data to consistent format for analysis."""
531
+ # Extract model name
532
+ original_name = data.get("model_name", "Unknown")
533
+ file_stem = file_path.stem.replace("comprehensive_eval_", "").replace("codesearchnet_eval_", "")
534
+ mapped_name = extract_model_name_from_filename(file_stem)
535
+
536
+ # Handle comprehensive format (new)
537
+ if "codesearch_overall" in data and "codesearch_languages" in data:
538
+ result = {
539
+ "model_name": mapped_name,
540
+ "original_model_name": original_name,
541
+ "overall": data.get("codesearch_overall", {}),
542
+ "languages": data.get("codesearch_languages", {}),
543
+ }
544
+ # Handle legacy format (old codesearchnet_eval files)
545
+ else:
546
+ result = {
547
+ "model_name": mapped_name,
548
+ "original_model_name": original_name,
549
+ "overall": data.get("overall", {}),
550
+ "languages": data.get("languages", {}),
551
+ }
552
+
553
+ return result
554
+
555
  def _create_comparison_dataframe(self) -> None:
556
  """Create comparison DataFrame from results."""
557
  if not self.results:
 
694
  if cpu_vs_gpu:
695
  print("🖥️ CPU vs GPU:")
696
  for device, metrics in cpu_vs_gpu.items():
697
+ if isinstance(metrics, dict) and "error" not in metrics:
698
  print(f" {device.upper()}: {metrics.get('texts_per_second', 0):.1f} texts/sec")
699
 
700
  # Memory efficiency
 
1219
  # Safe conversion to float for pandas values
1220
  score_value = pd.to_numeric(current_model_score, errors="coerce")
1221
  scores.append(float(score_value) if not pd.isna(score_value) else 0.0)
1222
+ params.append(float(MODEL_SPECS[model_key].get("parameters", 100.0)))
1223
  is_user_model.append(False)
1224
 
1225
  if not models:
 
1339
 
1340
  # Create visualizations
1341
  logger.info("Generating visualizations...")
1342
+ output_dir, images_dir, reports_dir = setup_directories()
1343
 
1344
  self.create_performance_radar_chart(main_model_name, language_scores)
1345
  comparison_chart = self.plot_model_comparison()
 
1404
  overall_metrics = result.get("overall", {})
1405
 
1406
  # Extract teacher model name from model name
1407
+ teacher_name, teacher_link = get_teacher_model_info(model_display)
 
 
 
 
 
 
 
 
 
 
1408
 
1409
  status = "🥇 Best" if rank == 1 else "🥈 2nd" if rank == 2 else "🥉 3rd" if rank == 3 else f"#{rank}"
1410
 
1411
+ # Use linked teacher name if available
1412
+ teacher_display = f"[{teacher_name}]({teacher_link})" if teacher_link else teacher_name
1413
+
1414
+ report += f"| {model_display} | {teacher_display} | {overall_metrics.get('ndcg@10', 0):.4f} | {overall_metrics.get('mrr', 0):.4f} | {overall_metrics.get('recall@5', 0):.4f} | {status} |\n"
1415
 
1416
  report += """
1417
 
 
1449
  report += "### Individual Model Performance by Language\n\n"
1450
  for chart_model_name, chart_path in individual_radar_charts.items():
1451
  # Extract teacher name for cleaner display
1452
+ teacher_name, teacher_link = get_teacher_model_info(chart_model_name)
1453
+
1454
+ # Use linked teacher name if available
1455
+ teacher_display = f"[{teacher_name}]({teacher_link})" if teacher_link else teacher_name
1456
+
1457
+ report += f"#### {chart_model_name} (Teacher: {teacher_display})\n\n"
 
 
 
 
 
 
 
1458
  report += f"![{chart_model_name} Radar Chart]({chart_path})\n\n"
1459
 
1460
  report += f"""
 
1551
 
1552
  if language_scores:
1553
  report += "| Language | Best Model Performance | Average Performance | Language Difficulty |\n"
1554
+ report += "|----------|------------------------|--------------------|--------------------|\n"
1555
 
1556
  for lang in sorted(language_scores.keys()):
1557
  # Find best performance for this language across all models
 
1585
  model_name = result["model_name"]
1586
  score = result.get("overall", {}).get("ndcg@10", 0)
1587
 
1588
+ teacher_name, teacher_link = get_teacher_model_info(model_name)
1589
+ teacher_performance[teacher_name] = score
 
 
 
 
 
 
 
 
1590
 
1591
  if teacher_performance:
1592
  best_teacher = max(teacher_performance.items(), key=lambda x: x[1])
 
1616
  - **Evaluation**: Retrieval of correct code for each documentation query
1617
 
1618
  ### Teacher Models Tested
1619
+ - [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) (proven baseline)
1620
+ - [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) (general purpose)
1621
+ - [sentence-transformers/paraphrase-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2) (paraphrase model)
1622
+ - [microsoft/codebert-base](https://huggingface.co/microsoft/codebert-base) (code-specialized)
1623
+ - [microsoft/graphcodebert-base](https://huggingface.co/microsoft/graphcodebert-base) (graph-aware code model)
1624
+ - [Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct) (instruction model)
1625
+ - [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) (multilingual model)
1626
+ - [jinaai/jina-embeddings-v3](https://huggingface.co/jinaai/jina-embeddings-v3) (modern embedding model)
1627
+ - [nomic-ai/nomic-embed-text-v2-moe](https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe) (mixture of experts)
1628
+ - [Qodo/Qodo-Embed-1-1.5B](https://huggingface.co/Qodo/Qodo-Embed-1-1.5B) (code-specialized)
1629
+ - [lightonai/Reason-ModernColBERT](https://huggingface.co/lightonai/Reason-ModernColBERT) (ColBERT architecture)
1630
+ - [Linq-AI-Research/Linq-Embed-Mistral](https://huggingface.co/Linq-AI-Research/Linq-Embed-Mistral) (Mistral-based)
1631
+ - [BAAI/bge-code-v1](https://huggingface.co/BAAI/bge-code-v1) (code-specialized BGE)
1632
+ - [Salesforce/SFR-Embedding-Code-2B_R](https://huggingface.co/Salesforce/SFR-Embedding-Code-2B_R) (large code model)
1633
 
1634
  ### Distillation Method
1635
  - **Technique**: Model2Vec static embedding generation
 
1652
  logger.info(f"Results exported to {output_file}")
1653
 
1654
 
1655
+ def main(
1656
+ results_dir: str = DEFAULT_EVALUATION_DIR,
1657
+ model_name: str = "code_model2vec_distilled_models",
1658
+ output: str = "REPORT.md",
1659
+ export_csv: str | None = None,
1660
+ ) -> None:
1661
  """Main analysis function."""
1662
+ logger.info("Starting CodeSearchNet Analysis with Integrated Benchmarks")
 
 
 
 
 
 
 
 
 
1663
  logger.info("=" * 60)
1664
 
1665
  # Setup output directories
1666
  output_dir, images_dir, reports_dir = setup_directories()
1667
 
1668
+ # Initialize analyzer with results directory (benchmarks are integrated)
1669
  analyzer = CodeSearchNetAnalyzer(
1670
+ results_dir=results_dir,
1671
+ benchmark_dir=None, # No longer needed - benchmarks are in comprehensive files
1672
  images_dir=images_dir,
1673
  )
1674
 
1675
+ # Load results (this will also load benchmark data from comprehensive files)
1676
  analyzer.load_results()
1677
 
1678
  if not analyzer.results:
 
1687
  if analyzer.benchmark_results:
1688
  analyzer.analyze_benchmark_performance()
1689
  else:
1690
+ logger.warning("No benchmark results found. Models may have been evaluated with --skip-benchmark flag.")
1691
 
1692
  # Generate comprehensive report with benchmark integration
1693
+ logger.info("Generating comprehensive report with integrated benchmark data...")
1694
+ report = analyzer.generate_comprehensive_report(model_name)
1695
 
1696
  # Save report
1697
+ report_path = Path(output)
1698
  with report_path.open("w") as f:
1699
  f.write(report)
1700
 
1701
  # Export CSV if requested
1702
+ if export_csv:
1703
+ analyzer.export_results(export_csv)
1704
 
1705
  # Export benchmark CSV if available
1706
  if analyzer.benchmark_df is not None and not analyzer.benchmark_df.empty:
1707
+ benchmark_csv = report_path.parent / f"{model_name}_benchmark_comparison.csv"
1708
  analyzer.benchmark_df.to_csv(benchmark_csv, index=False)
1709
  logger.info(f"📊 Benchmark comparison saved to: {benchmark_csv}")
1710
 
1711
+ logger.info("✅ CodeSearchNet analysis with integrated benchmarks complete!")
1712
  logger.info(f"📊 Report saved to: {report_path}")
1713
  logger.info(f"🖼️ Charts saved to: {images_dir}")
1714
+ logger.info(f"💾 Source: Comprehensive evaluation files in {results_dir}")
1715
 
1716
 
1717
  if __name__ == "__main__":
 
 
1718
  main()
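
The report generator now resolves teacher names and Hugging Face links through the helpers added above. A short sketch of their expected behaviour; the distilled-model display name below is a hypothetical example:

```python
# Sketch: format_model_with_link and get_teacher_model_info as defined in analyze.py above.
from distiller.analyze import format_model_with_link, get_teacher_model_info

print(format_model_with_link("microsoft/codebert-base"))
# -> [microsoft/codebert-base](https://huggingface.co/microsoft/codebert-base)

# "code_model2vec_all_MiniLM_L6_v2" is an assumed distilled-model display name.
teacher, link = get_teacher_model_info("code_model2vec_all_MiniLM_L6_v2")
print(teacher)  # sentence-transformers/all-MiniLM-L6-v2
print(link)     # https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
```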
src/distiller/beam_utils.py CHANGED
@@ -16,6 +16,7 @@ Features:
16
  import json
17
  import logging
18
  import shutil
 
19
  import time
20
  from pathlib import Path
21
  from typing import Any
@@ -204,7 +205,7 @@ class BeamVolumeManager:
204
 
205
 
206
  class BeamCheckpointManager:
207
- """Manager for checkpoint operations on Beam volumes."""
208
 
209
  def __init__(self, volume_manager: BeamVolumeManager, checkpoint_prefix: str = "checkpoints") -> None:
210
  """
@@ -216,14 +217,21 @@ class BeamCheckpointManager:
216
  """
217
  self.volume = volume_manager
218
  self.checkpoint_prefix = checkpoint_prefix
219
- self.checkpoint_dir = self.volume.mount_path / checkpoint_prefix
220
- self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
 
221
 
222
  def save_checkpoint(self, stage: str, data: dict[str, Any], step: int = 0) -> bool:
223
- """Save checkpoint to volume."""
224
  try:
 
225
  checkpoint_filename = f"{self.checkpoint_prefix}_{stage}_step_{step}.json"
226
- checkpoint_path = self.checkpoint_dir / checkpoint_filename
227
 
228
  with checkpoint_path.open("w") as f:
229
  json.dump(data, f, indent=2, default=str)
@@ -236,10 +244,11 @@ class BeamCheckpointManager:
236
  return False
237
 
238
  def load_checkpoint(self, stage: str, step: int = 0) -> dict[str, Any] | None:
239
- """Load checkpoint from volume."""
240
  try:
 
241
  checkpoint_filename = f"{self.checkpoint_prefix}_{stage}_step_{step}.json"
242
- checkpoint_path = self.checkpoint_dir / checkpoint_filename
243
 
244
  if checkpoint_path.exists():
245
  with checkpoint_path.open("r") as f:
@@ -257,11 +266,13 @@ class BeamCheckpointManager:
257
  def get_latest_checkpoint(self, stage: str) -> tuple[int, dict[str, Any]] | None:
258
  """Get the latest checkpoint for a stage."""
259
  try:
 
 
260
  # Find checkpoint files for this stage
261
  pattern = f"{self.checkpoint_prefix}_{stage}_step_*.json"
262
  stage_checkpoints: list[tuple[int, Path]] = []
263
 
264
- for checkpoint_file in self.checkpoint_dir.glob(pattern):
265
  try:
266
  # Extract step number from filename
267
  step_str = checkpoint_file.stem.replace(f"{self.checkpoint_prefix}_{stage}_step_", "")
@@ -290,11 +301,13 @@ class BeamCheckpointManager:
290
  def cleanup_old_checkpoints(self, stage: str, keep_latest: int = 3) -> list[str]:
291
  """Clean up old checkpoints, keeping only the latest N."""
292
  try:
 
 
293
  # Find checkpoint files for this stage
294
  pattern = f"{self.checkpoint_prefix}_{stage}_step_*.json"
295
  stage_checkpoints: list[tuple[int, Path]] = []
296
 
297
- for checkpoint_file in self.checkpoint_dir.glob(pattern):
298
  try:
299
  step_str = checkpoint_file.stem.replace(f"{self.checkpoint_prefix}_{stage}_step_", "")
300
  step = int(step_str)
@@ -329,29 +342,55 @@ class BeamCheckpointManager:
329
  """List all checkpoints, optionally filtered by stage."""
330
  try:
331
  checkpoints: list[dict[str, Any]] = []
332
- pattern = f"{self.checkpoint_prefix}_*.json"
333
 
334
- for checkpoint_file in self.checkpoint_dir.glob(pattern):
335
- # Parse checkpoint info
336
- name_parts = checkpoint_file.stem.split("_")
337
- if len(name_parts) >= 4:
338
- checkpoint_stage = name_parts[1]
339
- try:
340
- step = int(name_parts[3])
341
- except ValueError:
342
- step = 0
 
 
 
343
 
344
- if stage is None or checkpoint_stage == stage:
345
  stat = checkpoint_file.stat()
346
  checkpoints.append(
347
  {
348
- "stage": checkpoint_stage,
349
  "step": step,
350
  "filename": checkpoint_file.name,
351
  "size": f"{stat.st_size / 1024:.1f}KB",
352
  "modified": time.ctime(stat.st_mtime),
353
  }
354
  )
 
 
 
 
 
355
 
356
  return sorted(checkpoints, key=lambda x: (x["stage"], x["step"]))
357
 
@@ -747,6 +786,389 @@ def example_distillation_workflow() -> None:
747
  logger.info(f"Workspace info: {info}")
748
 
749
 
 
 
 
 
750
  if __name__ == "__main__":
751
  # Example usage
752
  logging.basicConfig(level=logging.INFO)
 
16
  import json
17
  import logging
18
  import shutil
19
+ import subprocess
20
  import time
21
  from pathlib import Path
22
  from typing import Any
 
205
 
206
 
207
  class BeamCheckpointManager:
208
+ """Manager for checkpoint operations on Beam volumes with stage-based organization."""
209
 
210
  def __init__(self, volume_manager: BeamVolumeManager, checkpoint_prefix: str = "checkpoints") -> None:
211
  """
 
217
  """
218
  self.volume = volume_manager
219
  self.checkpoint_prefix = checkpoint_prefix
220
+ self.checkpoint_base_dir = self.volume.mount_path / checkpoint_prefix
221
+ self.checkpoint_base_dir.mkdir(parents=True, exist_ok=True)
222
+
223
+ def _get_stage_dir(self, stage: str) -> Path:
224
+ """Return the stage-specific checkpoint directory, creating it if it does not exist."""
225
+ stage_dir = self.checkpoint_base_dir / stage
226
+ stage_dir.mkdir(parents=True, exist_ok=True)
227
+ return stage_dir
228
 
229
  def save_checkpoint(self, stage: str, data: dict[str, Any], step: int = 0) -> bool:
230
+ """Save a checkpoint to the volume in its stage-specific directory."""
231
  try:
232
+ stage_dir = self._get_stage_dir(stage)
233
  checkpoint_filename = f"{self.checkpoint_prefix}_{stage}_step_{step}.json"
234
+ checkpoint_path = stage_dir / checkpoint_filename
235
 
236
  with checkpoint_path.open("w") as f:
237
  json.dump(data, f, indent=2, default=str)
 
244
  return False
245
 
246
  def load_checkpoint(self, stage: str, step: int = 0) -> dict[str, Any] | None:
247
+ """Load a checkpoint from its stage-specific directory on the volume."""
248
  try:
249
+ stage_dir = self._get_stage_dir(stage)
250
  checkpoint_filename = f"{self.checkpoint_prefix}_{stage}_step_{step}.json"
251
+ checkpoint_path = stage_dir / checkpoint_filename
252
 
253
  if checkpoint_path.exists():
254
  with checkpoint_path.open("r") as f:
 
266
  def get_latest_checkpoint(self, stage: str) -> tuple[int, dict[str, Any]] | None:
267
  """Get the latest checkpoint for a stage."""
268
  try:
269
+ stage_dir = self._get_stage_dir(stage)
270
+
271
  # Find checkpoint files for this stage
272
  pattern = f"{self.checkpoint_prefix}_{stage}_step_*.json"
273
  stage_checkpoints: list[tuple[int, Path]] = []
274
 
275
+ for checkpoint_file in stage_dir.glob(pattern):
276
  try:
277
  # Extract step number from filename
278
  step_str = checkpoint_file.stem.replace(f"{self.checkpoint_prefix}_{stage}_step_", "")
 
301
  def cleanup_old_checkpoints(self, stage: str, keep_latest: int = 3) -> list[str]:
302
  """Clean up old checkpoints, keeping only the latest N."""
303
  try:
304
+ stage_dir = self._get_stage_dir(stage)
305
+
306
  # Find checkpoint files for this stage
307
  pattern = f"{self.checkpoint_prefix}_{stage}_step_*.json"
308
  stage_checkpoints: list[tuple[int, Path]] = []
309
 
310
+ for checkpoint_file in stage_dir.glob(pattern):
311
  try:
312
  step_str = checkpoint_file.stem.replace(f"{self.checkpoint_prefix}_{stage}_step_", "")
313
  step = int(step_str)
 
342
  """List all checkpoints, optionally filtered by stage."""
343
  try:
344
  checkpoints: list[dict[str, Any]] = []
 
345
 
346
+ if stage:
347
+ # List checkpoints for specific stage
348
+ stage_dir = self._get_stage_dir(stage)
349
+ pattern = f"{self.checkpoint_prefix}_{stage}_*.json"
350
+
351
+ for checkpoint_file in stage_dir.glob(pattern):
352
+ name_parts = checkpoint_file.stem.split("_")
353
+ if len(name_parts) >= 4:
354
+ try:
355
+ step = int(name_parts[3])
356
+ except ValueError:
357
+ step = 0
358
 
 
359
  stat = checkpoint_file.stat()
360
  checkpoints.append(
361
  {
362
+ "stage": stage,
363
  "step": step,
364
  "filename": checkpoint_file.name,
365
  "size": f"{stat.st_size / 1024:.1f}KB",
366
  "modified": time.ctime(stat.st_mtime),
367
  }
368
  )
369
+ else:
370
+ # List checkpoints for all stages
371
+ for stage_dir in self.checkpoint_base_dir.iterdir():
372
+ if stage_dir.is_dir():
373
+ stage_name = stage_dir.name
374
+ pattern = f"{self.checkpoint_prefix}_{stage_name}_*.json"
375
+
376
+ for checkpoint_file in stage_dir.glob(pattern):
377
+ name_parts = checkpoint_file.stem.split("_")
378
+ if len(name_parts) >= 4:
379
+ try:
380
+ step = int(name_parts[3])
381
+ except ValueError:
382
+ step = 0
383
+
384
+ stat = checkpoint_file.stat()
385
+ checkpoints.append(
386
+ {
387
+ "stage": stage_name,
388
+ "step": step,
389
+ "filename": checkpoint_file.name,
390
+ "size": f"{stat.st_size / 1024:.1f}KB",
391
+ "modified": time.ctime(stat.st_mtime),
392
+ }
393
+ )
394
 
395
  return sorted(checkpoints, key=lambda x: (x["stage"], x["step"]))
396
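The net effect of this change is a per-stage directory layout on the volume. A minimal sketch of the resulting path convention, assuming the default prefix and the mount path used elsewhere in this repository (illustrative only; it simply mirrors the _get_stage_dir/save_checkpoint logic above):

# Layout: <mount_path>/<prefix>/<stage>/<prefix>_<stage>_step_<step>.json
from pathlib import Path

mount_path = Path("./gte_qwen2_m2v_code")  # assumed volume mount path
checkpoint_prefix = "checkpoints"          # default prefix

def checkpoint_path(stage: str, step: int = 0) -> Path:
    # Create the stage directory on demand, then build the checkpoint filename.
    stage_dir = mount_path / checkpoint_prefix / stage
    stage_dir.mkdir(parents=True, exist_ok=True)
    return stage_dir / f"{checkpoint_prefix}_{stage}_step_{step}.json"

print(checkpoint_path("distillation", step=3))
# gte_qwen2_m2v_code/checkpoints/distillation/checkpoints_distillation_step_3.json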
 
 
786
  logger.info(f"Workspace info: {info}")
787
 
788
 
789
+ def download_evaluation_results_from_beam(
790
+ volume_name: str,
791
+ remote_results_dir: str = "evaluation_results",
792
+ local_results_dir: str = "code_model2vec/evaluation_results",
793
+ ) -> bool:
794
+ """
795
+ Download evaluation result files from Beam volume to local directory using beam cp.
796
+
797
+ Args:
798
+ volume_name: Name of the Beam volume
799
+ remote_results_dir: Directory path in the Beam volume containing results
800
+ local_results_dir: Local directory to download results to
801
+
802
+ Returns:
803
+ True if download successful, False otherwise
804
+ """
805
+ try:
806
+ local_path = Path(local_results_dir)
807
+ local_path.mkdir(parents=True, exist_ok=True)
808
+
809
+ # Use beam cp to download individual JSON files
810
+ remote_path = f"{volume_name}:{remote_results_dir}"
811
+
812
+ # First, list files in the remote directory
813
+ list_cmd = ["beam", "cp", "-r", "--list-only", remote_path]
814
+ try:
815
+ result = subprocess.run(list_cmd, capture_output=True, text=True, check=True) # noqa: S603
816
+ remote_files = [line.strip() for line in result.stdout.split("\n") if line.strip().endswith(".json")]
817
+ except subprocess.CalledProcessError:
818
+ logger.warning(f"Could not list files in {remote_path}")
819
+ remote_files = []
820
+
821
+ # Download each JSON file individually
822
+ downloaded_files = []
823
+ for file_name in remote_files:
824
+ if file_name.endswith(".json"):
825
+ remote_file_path = f"{volume_name}:{remote_results_dir}/{file_name}"
826
+ local_file_path = local_path / file_name
827
+
828
+ try:
829
+ download_cmd = ["beam", "cp", remote_file_path, str(local_file_path)]
830
+ subprocess.run(download_cmd, check=True, capture_output=True) # noqa: S603
831
+ downloaded_files.append(file_name)
832
+ logger.info(f"📥 Downloaded: {file_name}")
833
+
834
+ # Delete the file from Beam volume after successful download
835
+ delete_cmd = ["beam", "rm", remote_file_path]
836
+ try:
837
+ subprocess.run(delete_cmd, check=True, capture_output=True) # noqa: S603
838
+ logger.info(f"🗑️ Deleted from volume: {file_name}")
839
+ except subprocess.CalledProcessError as e:
840
+ logger.warning(f"⚠️ Could not delete {file_name} from volume: {e}")
841
+
842
+ except subprocess.CalledProcessError as e:
843
+ logger.warning(f"⚠️ Failed to download {file_name}: {e}")
844
+
845
+ if downloaded_files:
846
+ logger.info(f"✅ Downloaded {len(downloaded_files)} evaluation result files")
847
+ return True
848
+ logger.info("ℹ️ No new evaluation files to download")
849
+ return True
850
+
851
+ except Exception:
852
+ logger.exception("❌ Error downloading evaluation results from Beam")
853
+ return False
854
+
855
+
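A minimal usage sketch for the helper above; the volume name is an assumption and the directory arguments are just the defaults (note that the helper deletes each file from the volume after a successful download):

from distiller.beam_utils import download_evaluation_results_from_beam  # assumed import path

ok = download_evaluation_results_from_beam(
    volume_name="gte_qwen2_m2v_code",  # assumed volume name
    remote_results_dir="evaluation_results",
    local_results_dir="code_model2vec/evaluation_results",
)
if not ok:
    print("Download failed; check that the beam CLI is installed and authenticated.")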
856
+ def download_specific_evaluation_file(
857
+ volume_name: str,
858
+ model_name: str,
859
+ remote_results_dir: str = "evaluation_results",
860
+ local_results_dir: str = "code_model2vec/evaluation_results",
861
+ file_prefix: str = "codesearchnet_eval",
862
+ ) -> bool:
863
+ """
864
+ Download a specific evaluation or benchmark result file from Beam volume.
865
+
866
+ Args:
867
+ volume_name: Name of the Beam volume
868
+ model_name: Name of the model whose results to download
869
+ remote_results_dir: Directory path in the Beam volume containing results
870
+ local_results_dir: Local directory to download results to
871
+ file_prefix: Prefix for the file (e.g., 'codesearchnet_eval', 'benchmark')
872
+
873
+ Returns:
874
+ True if download successful, False otherwise
875
+ """
876
+ try:
877
+ local_path = Path(local_results_dir)
878
+ local_path.mkdir(parents=True, exist_ok=True)
879
+
880
+ # Generate filename following the pattern
881
+ safe_model_name = model_name.replace("/", "_")
882
+ filename = f"{file_prefix}_{safe_model_name}.json"
883
+
884
+ remote_file_path = f"{volume_name}:{remote_results_dir}/{filename}"
885
+ local_file_path = local_path / filename
886
+
887
+ # Download the specific file
888
+ download_cmd = ["beam", "cp", remote_file_path, str(local_file_path)]
889
+ subprocess.run(download_cmd, check=True, capture_output=True) # noqa: S603
890
+
891
+ logger.info(f"📥 Downloaded {file_prefix} results for {model_name}")
892
+
893
+ # Delete the file from Beam volume after successful download
894
+ delete_cmd = ["beam", "rm", remote_file_path]
895
+ try:
896
+ subprocess.run(delete_cmd, check=True, capture_output=True) # noqa: S603
897
+ logger.info(f"🗑️ Deleted {file_prefix} results for {model_name} from volume")
898
+ except subprocess.CalledProcessError as e:
899
+ logger.warning(f"⚠️ Could not delete {filename} from volume: {e}")
900
+
901
+ return True
902
+
903
+ except subprocess.CalledProcessError:
904
+ logger.warning(f"⚠️ No {file_prefix} results found for {model_name} on Beam")
905
+ return False
906
+ except Exception:
907
+ logger.exception(f"❌ Error downloading {file_prefix} results for {model_name}")
908
+ return False
909
+
910
+
911
+ def download_model_from_beam(
912
+ volume_name: str,
913
+ model_name: str,
914
+ local_dir: str,
915
+ ) -> bool:
916
+ """
917
+ Download a model from Beam volume to local directory.
918
+
919
+ Args:
920
+ volume_name: Name of the Beam volume
921
+ model_name: Name of the model to download
922
+ local_dir: Local directory to download model to
923
+
924
+ Returns:
925
+ True if download successful, False otherwise
926
+ """
927
+ try:
928
+ local_path = Path(local_dir)
929
+ local_path.mkdir(parents=True, exist_ok=True)
930
+
931
+ # Use beam cp to download the model directory
932
+ remote_path = f"{volume_name}:models/{model_name}"
933
+ local_model_path = local_path / model_name
934
+
935
+ download_cmd = ["beam", "cp", "-r", remote_path, str(local_model_path)]
936
+ subprocess.run(download_cmd, check=True, capture_output=True) # noqa: S603
937
+
938
+ logger.info(f"📥 Downloaded model {model_name} from Beam to {local_dir}")
939
+ return True
940
+
941
+ except subprocess.CalledProcessError as e:
942
+ logger.warning(f"⚠️ Failed to download model {model_name} from Beam: {e}")
943
+ return False
944
+ except Exception:
945
+ logger.exception(f"❌ Error downloading model {model_name} from Beam")
946
+ return False
947
+
948
+
949
+ def upload_model_to_beam(
950
+ volume_name: str,
951
+ model_name: str,
952
+ local_dir: str,
953
+ ) -> bool:
954
+ """
955
+ Upload a model from local directory to Beam volume.
956
+
957
+ Args:
958
+ volume_name: Name of the Beam volume
959
+ model_name: Name for the model on Beam
960
+ local_dir: Local directory containing the model
961
+
962
+ Returns:
963
+ True if upload successful, False otherwise
964
+ """
965
+ try:
966
+ local_path = Path(local_dir)
967
+ if not local_path.exists():
968
+ logger.error(f"❌ Local model directory does not exist: {local_dir}")
969
+ return False
970
+
971
+ # Use beam cp to upload the model directory
972
+ remote_path = f"{volume_name}:models/{model_name}"
973
+
974
+ upload_cmd = ["beam", "cp", "-r", str(local_path), remote_path]
975
+ subprocess.run(upload_cmd, check=True, capture_output=True) # noqa: S603
976
+
977
+ logger.info(f"📤 Uploaded model {model_name} to Beam from {local_dir}")
978
+ return True
979
+
980
+ except subprocess.CalledProcessError as e:
981
+ logger.warning(f"⚠️ Failed to upload model {model_name} to Beam: {e}")
982
+ return False
983
+ except Exception:
984
+ logger.exception(f"❌ Error uploading model {model_name} to Beam")
985
+ return False
986
+
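Together with download_model_from_beam above, this gives a simple push/pull workflow for model directories stored under models/<name> on the volume. A hedged round-trip sketch (volume name, model name, and local paths are illustrative):

from distiller.beam_utils import download_model_from_beam, upload_model_to_beam  # assumed import path

VOLUME = "gte_qwen2_m2v_code"  # assumed volume name
# Push a locally distilled model to the volume...
upload_model_to_beam(VOLUME, "code_model2vec_example", "code_model2vec/final/code_model2vec_example")
# ...and pull it back elsewhere (e.g., on another machine or in a later job).
download_model_from_beam(VOLUME, "code_model2vec_example", "downloaded_models")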
987
+
988
+ def download_checkpoints_from_beam(
989
+ volume_name: str,
990
+ stage: str | None = None,
991
+ remote_checkpoints_dir: str = "checkpoints",
992
+ local_checkpoints_dir: str = "code_model2vec/checkpoints",
993
+ ) -> bool:
994
+ """
995
+ Download checkpoint files from Beam volume to local directory.
996
+
997
+ Args:
998
+ volume_name: Name of the Beam volume
999
+ stage: Specific stage to download (e.g., 'distillation', 'training'), or None for all
1000
+ remote_checkpoints_dir: Directory path in the Beam volume containing checkpoints
1001
+ local_checkpoints_dir: Local directory to download checkpoints to
1002
+
1003
+ Returns:
1004
+ True if download successful, False otherwise
1005
+ """
1006
+ try:
1007
+ local_path = Path(local_checkpoints_dir)
1008
+ local_path.mkdir(parents=True, exist_ok=True)
1009
+
1010
+ # Pre-create the local stage directory when a specific stage is requested

1011
+ if stage:
1012
+ local_stage_dir = local_path / stage
1013
+ local_stage_dir.mkdir(parents=True, exist_ok=True)
1014
+ else:
1015
+ pass
1016
+
1017
+ # Use beam cp to download checkpoint files
1018
+ remote_path = f"{volume_name}:{remote_checkpoints_dir}"
1019
+
1020
+ # First, try to list files
1021
+ list_cmd = ["beam", "cp", "-r", "--list-only", remote_path]
1022
+ try:
1023
+ result = subprocess.run(list_cmd, capture_output=True, text=True, check=True) # noqa: S603
1024
+ remote_files = [
1025
+ line.strip()
1026
+ for line in result.stdout.split("\n")
1027
+ if line.strip().endswith(".json") and "checkpoints_" in line.strip()
1028
+ ]
1029
+ except subprocess.CalledProcessError:
1030
+ logger.warning(f"Could not list checkpoint files in {remote_path}")
1031
+ remote_files = []
1032
+
1033
+ # Filter by stage if specified
1034
+ if stage:
1035
+ remote_files = [f for f in remote_files if f"checkpoints_{stage}_" in f]
1036
+
1037
+ # Download each checkpoint file
1038
+ downloaded_files = []
1039
+ for file_name in remote_files:
1040
+ remote_file_path = f"{volume_name}:{remote_checkpoints_dir}/{file_name}"
1041
+
1042
+ # Determine local subdirectory based on checkpoint stage
1043
+ file_stage = file_name.split("_")[1] if "_" in file_name else "unknown"
1044
+ local_stage_dir = local_path / file_stage
1045
+ local_stage_dir.mkdir(parents=True, exist_ok=True)
1046
+ local_file_path = local_stage_dir / file_name
1047
+
1048
+ try:
1049
+ download_cmd = ["beam", "cp", remote_file_path, str(local_file_path)]
1050
+ subprocess.run(download_cmd, check=True, capture_output=True) # noqa: S603
1051
+ downloaded_files.append(file_name)
1052
+ logger.info(f"📥 Downloaded checkpoint: {file_name}")
1053
+
1054
+ except subprocess.CalledProcessError as e:
1055
+ logger.warning(f"⚠️ Failed to download checkpoint {file_name}: {e}")
1056
+
1057
+ if downloaded_files:
1058
+ logger.info(f"✅ Downloaded {len(downloaded_files)} checkpoint files")
1059
+ return True
1060
+ logger.info("ℹ️ No new checkpoint files to download")
1061
+ return True
1062
+
1063
+ except Exception:
1064
+ logger.exception("❌ Error downloading checkpoints from Beam")
1065
+ return False
1066
+
1067
+
1068
+ def upload_checkpoints_to_beam(
1069
+ volume_name: str,
1070
+ stage: str | None = None,
1071
+ local_checkpoints_dir: str = "code_model2vec/checkpoints",
1072
+ remote_checkpoints_dir: str = "checkpoints",
1073
+ ) -> bool:
1074
+ """
1075
+ Upload checkpoint files from local directory to Beam volume.
1076
+
1077
+ Args:
1078
+ volume_name: Name of the Beam volume
1079
+ stage: Specific stage to upload (e.g., 'distillation', 'training'), or None for all
1080
+ local_checkpoints_dir: Local directory containing checkpoints
1081
+ remote_checkpoints_dir: Directory path in the Beam volume to store checkpoints
1082
+
1083
+ Returns:
1084
+ True if upload successful, False otherwise
1085
+ """
1086
+ try:
1087
+ local_path = Path(local_checkpoints_dir)
1088
+ if not local_path.exists():
1089
+ logger.warning(f"⚠️ Local checkpoints directory does not exist: {local_checkpoints_dir}")
1090
+ return True # Not an error - no checkpoints to upload
1091
+
1092
+ # Find checkpoint files to upload
1093
+ if stage:
1094
+ # Look in the stage subdirectory
1095
+ stage_dir = local_path / stage
1096
+ checkpoint_files = list(stage_dir.glob(f"checkpoints_{stage}_*.json")) if stage_dir.exists() else []
1097
+ else:
1098
+ # Look for all checkpoint files in all subdirectories
1099
+ checkpoint_files = []
1100
+ for subdir in local_path.iterdir():
1101
+ if subdir.is_dir():
1102
+ checkpoint_files.extend(subdir.glob("checkpoints_*.json"))
1103
+
1104
+ if not checkpoint_files:
1105
+ logger.info(f"ℹ️ No checkpoint files found to upload for stage: {stage or 'all'}")
1106
+ return True
1107
+
1108
+ # Upload each checkpoint file
1109
+ uploaded_files = []
1110
+ for checkpoint_file in checkpoint_files:
1111
+ remote_file_path = f"{volume_name}:{remote_checkpoints_dir}/{checkpoint_file.name}"
1112
+
1113
+ try:
1114
+ upload_cmd = ["beam", "cp", str(checkpoint_file), remote_file_path]
1115
+ subprocess.run(upload_cmd, check=True, capture_output=True) # noqa: S603
1116
+ uploaded_files.append(checkpoint_file.name)
1117
+ logger.info(f"📤 Uploaded checkpoint: {checkpoint_file.name}")
1118
+
1119
+ except subprocess.CalledProcessError as e:
1120
+ logger.warning(f"⚠️ Failed to upload checkpoint {checkpoint_file.name}: {e}")
1121
+
1122
+ if uploaded_files:
1123
+ logger.info(f"✅ Uploaded {len(uploaded_files)} checkpoint files")
1124
+ return True
1125
+ return False
1126
+
1127
+ except Exception:
1128
+ logger.exception("❌ Error uploading checkpoints to Beam")
1129
+ return False
1130
+
1131
+
1132
+ def sync_checkpoints_from_beam(
1133
+ volume_name: str,
1134
+ stage: str,
1135
+ local_checkpoints_dir: str = "code_model2vec/checkpoints",
1136
+ ) -> bool:
1137
+ """
1138
+ Sync specific stage checkpoints from Beam to local directory.
1139
+
1140
+ Args:
1141
+ volume_name: Name of the Beam volume
1142
+ stage: Stage to sync (e.g., 'distillation', 'training')
1143
+ local_checkpoints_dir: Local directory for checkpoints
1144
+
1145
+ Returns:
1146
+ True if sync successful, False otherwise
1147
+ """
1148
+ logger.info(f"🔄 Syncing {stage} checkpoints from Beam...")
1149
+ return download_checkpoints_from_beam(volume_name, stage, "checkpoints", local_checkpoints_dir)
1150
+
1151
+
1152
+ def sync_checkpoints_to_beam(
1153
+ volume_name: str,
1154
+ stage: str,
1155
+ local_checkpoints_dir: str = "code_model2vec/checkpoints",
1156
+ ) -> bool:
1157
+ """
1158
+ Sync specific stage checkpoints from local directory to Beam.
1159
+
1160
+ Args:
1161
+ volume_name: Name of the Beam volume
1162
+ stage: Stage to sync (e.g., 'distillation', 'training')
1163
+ local_checkpoints_dir: Local directory containing checkpoints
1164
+
1165
+ Returns:
1166
+ True if sync successful, False otherwise
1167
+ """
1168
+ logger.info(f"🔄 Syncing {stage} checkpoints to Beam...")
1169
+ return upload_checkpoints_to_beam(volume_name, stage, local_checkpoints_dir, "checkpoints")
1170
+
1171
+
1172
  if __name__ == "__main__":
1173
  # Example usage
1174
  logging.basicConfig(level=logging.INFO)
src/distiller/benchmark.py DELETED
@@ -1,1181 +0,0 @@
1
- """
2
- Operational Performance Benchmarking for Embedding Models.
3
-
4
- This module benchmarks embedding models on operational metrics like:
5
- - Inference speed (latency and throughput)
6
- - Memory efficiency (RAM and GPU usage)
7
- - Model size and storage requirements
8
- - Scalability with batch size
9
- - CPU vs GPU performance
10
- """
11
-
12
- import gc
13
- import json
14
- import logging
15
- import os
16
- import time
17
- from pathlib import Path
18
- from typing import Any
19
-
20
- import pandas as pd
21
- import psutil
22
- import torch
23
- from beam import GpuType, Image, Volume, function
24
- from sentence_transformers import SentenceTransformer
25
-
26
- from .beam_utils import (
27
- BeamCheckpointManager,
28
- BeamEvaluationManager,
29
- create_beam_utilities,
30
- )
31
-
32
- logger = logging.getLogger(__name__)
33
-
34
- # =============================================================================
35
- # BEAM CONFIGURATION
36
- # =============================================================================
37
-
38
- GPU_NAME = GpuType.A100_40
39
- VOLUME_NAME = "gte_qwen2_m2v_code" # Same volume as distill.py and evaluate.py
40
- VOLUME_PATH = "./gte_qwen2_m2v_code" # Same mount path as distill.py and evaluate.py
41
- BENCHMARK_RESULTS_DIR = "benchmark_results" # Subdirectory within volume
42
- BENCHMARK_CACHE_DIR = "benchmark_cache" # Cache for models
43
-
44
- IMAGE = Image(python_version="python3.12").add_python_packages(
45
- [
46
- "torch>=2.7.0",
47
- "transformers>=4.40.0",
48
- "datasets>=3.2.0",
49
- "sentence-transformers>=4.1.0",
50
- "model2vec[train]>=0.5.0",
51
- "numpy>=1.26.4",
52
- "scikit-learn>=1.6.1",
53
- "pandas>=2.0.0",
54
- "tqdm>=4.65.0",
55
- "psutil>=5.9.0",
56
- ]
57
- )
58
-
59
- # =============================================================================
60
- # CONFIGURATION
61
- # =============================================================================
62
-
63
- DEFAULT_OUTPUT_DIR = "benchmark_results" # Local fallback directory
64
-
65
- # Default models to benchmark (can be overridden via command line)
66
- DEFAULT_BENCHMARK_MODELS = [
67
- # Your distilled model (local files in Beam volume root)
68
- "gte_qwen2_m2v_code", # This will be resolved to VOLUME_PATH in Beam
69
- # Established Code Models
70
- "sentence-transformers/all-MiniLM-L6-v2",
71
- "microsoft/codebert-base",
72
- "microsoft/graphcodebert-base",
73
- "huggingface/CodeBERTa-small-v1",
74
- "sentence-transformers/all-mpnet-base-v2",
75
- "sentence-transformers/all-MiniLM-L12-v2",
76
- # Model2Vec & Efficiency Models (Direct Competitors)
77
- "minishlab/potion-base-8M",
78
- "minishlab/potion-retrieval-32M",
79
- # Small Transformer-Based Code Models
80
- "Salesforce/codet5-base",
81
- ]
82
-
83
- # =============================================================================
84
- # CHECKPOINT CONFIGURATION
85
- # =============================================================================
86
-
87
- # Prevent conflicts with other modules by using unique prefixes
88
- BENCHMARK_CHECKPOINT_PREFIX = "benchmark_checkpoints"
89
- MODEL_CACHE_PREFIX = "model_cache"
90
-
91
- # Sample texts for benchmarking (various lengths)
92
- BENCHMARK_TEXTS = {
93
- "short": [
94
- "def add(a, b): return a + b",
95
- "function multiply(x, y) { return x * y; }",
96
- "class Calculator { public int subtract(int a, int b) { return a - b; } }",
97
- ]
98
- * 100, # 300 short texts
99
- "medium": [
100
- "def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)",
101
- "function quickSort(arr) {\n if (arr.length <= 1) return arr;\n const pivot = arr[arr.length - 1];\n const left = [], right = [];\n for (let i = 0; i < arr.length - 1; i++) {\n if (arr[i] < pivot) left.push(arr[i]);\n else right.push(arr[i]);\n }\n return [...quickSort(left), pivot, ...quickSort(right)];\n}",
102
- ]
103
- * 50, # 100 medium texts
104
- "long": [
105
- """
106
- def complex_algorithm(data, config):
107
- '''
108
- Complex data processing algorithm with multiple steps.
109
-
110
- Args:
111
- data: Input data structure
112
- config: Configuration parameters
113
-
114
- Returns:
115
- Processed results
116
- '''
117
- results = []
118
-
119
- # Step 1: Data validation
120
- if not isinstance(data, (list, tuple)):
121
- raise ValueError("Data must be list or tuple")
122
-
123
- # Step 2: Preprocessing
124
- processed_data = []
125
- for item in data:
126
- if config.get('normalize', False):
127
- item = normalize_item(item)
128
- if config.get('filter', False):
129
- if not filter_item(item, config['filter_criteria']):
130
- continue
131
- processed_data.append(item)
132
-
133
- # Step 3: Main processing
134
- for item in processed_data:
135
- result = process_item(item, config)
136
- if result is not None:
137
- results.append(result)
138
-
139
- # Step 4: Post-processing
140
- if config.get('sort', False):
141
- results.sort(key=lambda x: x.get('score', 0), reverse=True)
142
-
143
- return results
144
- """.strip(),
145
- ]
146
- * 20, # 20 long texts
147
- }
148
-
149
-
150
- class PerformanceBenchmark:
151
- """Comprehensive performance benchmarking for embedding models."""
152
-
153
- def __init__(
154
- self,
155
- model_path: str,
156
- model_name: str | None = None,
157
- checkpoint_manager: BeamCheckpointManager | None = None,
158
- eval_manager: BeamEvaluationManager | None = None,
159
- ) -> None:
160
- """Initialize benchmarker with model and optional Beam utilities."""
161
- self.model_path = model_path
162
- self.model_name = model_name or Path(model_path).name
163
- self.model: SentenceTransformer | None = None
164
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
165
- self.results: dict[str, Any] = {}
166
- self.checkpoint_manager = checkpoint_manager
167
- self.eval_manager = eval_manager
168
-
169
- def load_model(self) -> None:
170
- """Load the embedding model."""
171
- logger.info(f"Loading model from {self.model_path}")
172
- start_time = time.time()
173
-
174
- try:
175
- self.model = SentenceTransformer(self.model_path, device=self.device, trust_remote_code=True)
176
- load_time = time.time() - start_time
177
-
178
- logger.info(f"✅ Model loaded in {load_time:.2f}s on {self.device}")
179
- self.results["model_load_time"] = load_time
180
-
181
- except Exception:
182
- logger.exception("❌ Failed to load model")
183
- raise
184
-
185
- def measure_model_size(self) -> dict[str, float]:
186
- """Measure model size metrics."""
187
- logger.info("📏 Measuring model size...")
188
-
189
- size_metrics = {}
190
-
191
- # Disk size - handle both local paths and HuggingFace models
192
- try:
193
- if Path(self.model_path).is_dir():
194
- # Local directory - calculate size of model files only
195
- model_extensions = {".safetensors", ".bin", ".json", ".txt", ".tokenizer"}
196
- total_size = 0
197
- model_dir = Path(self.model_path)
198
-
199
- for file_path in model_dir.rglob("*"):
200
- if file_path.is_file() and (
201
- file_path.suffix.lower() in model_extensions
202
- or file_path.name.lower() in {"config.json", "tokenizer.json", "modules.json", "README.md"}
203
- ):
204
- total_size += file_path.stat().st_size
205
-
206
- size_metrics["disk_size_mb"] = total_size / (1024 * 1024)
207
- elif Path(self.model_path).is_file():
208
- # Single file
209
- total_size = Path(self.model_path).stat().st_size
210
- size_metrics["disk_size_mb"] = total_size / (1024 * 1024)
211
- else:
212
- # HuggingFace model - estimate from cache if available
213
- from transformers import AutoConfig
214
-
215
- try:
216
- config = AutoConfig.from_pretrained(self.model_path)
217
- # Estimate size based on parameters (rough approximation)
218
- if hasattr(config, "hidden_size") and hasattr(config, "num_hidden_layers"):
219
- # Rough estimation for transformer models
220
- estimated_params = config.hidden_size * config.num_hidden_layers * 1000 # Very rough
221
- size_metrics["disk_size_mb"] = estimated_params * 4 / (1024 * 1024) # 4 bytes per float32
222
- else:
223
- size_metrics["disk_size_mb"] = 0 # Unknown
224
- except Exception:
225
- logger.warning(f"Could not determine disk size for HuggingFace model: {self.model_path}")
226
- size_metrics["disk_size_mb"] = 0 # Unknown
227
- except Exception as e:
228
- logger.warning(f"Could not determine disk size: {e}")
229
- size_metrics["disk_size_mb"] = 0
230
-
231
- # Model parameters (if accessible)
232
- try:
233
- if self.model is not None and hasattr(self.model, "modules"):
234
- total_params = sum(p.numel() for p in self.model.parameters())
235
- size_metrics["parameters_millions"] = total_params / 1_000_000
236
-
237
- # Try to get embedding dimension from model config
238
- try:
239
- # Use the public modules() method instead of private _modules
240
- modules = list(self.model.modules())
241
- if len(modules) > 1: # modules[0] is usually the entire model, modules[1] is first submodule
242
- first_module = modules[1]
243
- if hasattr(first_module, "auto_model") and hasattr(first_module.auto_model, "config"):
244
- config = first_module.auto_model.config
245
- if hasattr(config, "hidden_size"):
246
- size_metrics["embedding_dim"] = config.hidden_size
247
- elif hasattr(config, "model_dim"):
248
- size_metrics["embedding_dim"] = config.model_dim
249
- except Exception as e:
250
- logger.debug(
251
- f"Could not extract embedding dimension from model config: {e}"
252
- ) # Silently continue if this method fails
253
-
254
- # For Model2Vec static models
255
- elif self.model is not None and hasattr(self.model, "embedding"):
256
- # Handle both tensor and numpy array embeddings
257
- embedding = self.model.embedding
258
- if hasattr(embedding, "shape"):
259
- vocab_size, embedding_dim = embedding.shape # type: ignore[misc]
260
- total_params = vocab_size * embedding_dim
261
- size_metrics["parameters_millions"] = total_params / 1_000_000
262
- size_metrics["vocab_size"] = vocab_size
263
- size_metrics["embedding_dim"] = embedding_dim
264
- else:
265
- logger.warning("Could not determine embedding shape for Model2Vec model")
266
-
267
- # Alternative method: get embedding dimension from a test encoding
268
- if "embedding_dim" not in size_metrics and self.model is not None:
269
- try:
270
- test_embedding = self.model.encode(["test"], convert_to_tensor=False)
271
- if hasattr(test_embedding, "shape") and len(test_embedding.shape) >= 2:
272
- size_metrics["embedding_dim"] = test_embedding.shape[1]
273
- elif (
274
- isinstance(test_embedding, (list, tuple))
275
- and len(test_embedding) > 0
276
- and hasattr(test_embedding[0], "__len__")
277
- ):
278
- size_metrics["embedding_dim"] = len(test_embedding[0])
279
- except Exception as e:
280
- logger.warning(f"Could not determine embedding dimension: {e}")
281
-
282
- except Exception as e:
283
- logger.warning(f"Could not determine parameter count: {e}")
284
-
285
- # Memory footprint
286
- if self.device == "cuda" and torch.cuda.is_available():
287
- torch.cuda.empty_cache()
288
- size_metrics["gpu_memory_mb"] = torch.cuda.memory_allocated() / (1024 * 1024)
289
-
290
- # RAM usage (approximate)
291
- process = psutil.Process(os.getpid())
292
- size_metrics["ram_usage_mb"] = process.memory_info().rss / (1024 * 1024)
293
-
294
- self.results["size_metrics"] = size_metrics
295
- return size_metrics
296
-
297
- def benchmark_inference_speed(self, batch_sizes: list[int] | None = None) -> dict[str, Any]:
298
- """Benchmark inference speed across different batch sizes."""
299
- if batch_sizes is None:
300
- batch_sizes = [1, 8, 16, 32, 64, 128]
301
- logger.info("⚡ Benchmarking inference speed...")
302
-
303
- if self.model is None:
304
- self.load_model()
305
-
306
- if self.model is None:
307
- msg = "Failed to load model"
308
- raise RuntimeError(msg)
309
-
310
- speed_results = {}
311
- text_lengths = ["short", "medium", "long"]
312
-
313
- for text_length in text_lengths:
314
- logger.info(f" 📝 Testing {text_length} texts...")
315
- texts = BENCHMARK_TEXTS[text_length]
316
-
317
- length_results = {}
318
-
319
- for batch_size in batch_sizes:
320
- if batch_size > len(texts):
321
- continue
322
-
323
- logger.info(f" 🔄 Batch size: {batch_size}")
324
-
325
- # Prepare batch
326
- batch_texts = texts[:batch_size]
327
-
328
- # Warmup
329
- if self.device == "cuda":
330
- torch.cuda.synchronize()
331
- _ = self.model.encode(batch_texts[: min(2, batch_size)], convert_to_tensor=False)
332
-
333
- # Clear cache
334
- if self.device == "cuda":
335
- torch.cuda.empty_cache()
336
- torch.cuda.synchronize()
337
-
338
- # Measure inference time
339
- start_time = time.perf_counter()
340
-
341
- embeddings = self.model.encode(batch_texts, convert_to_tensor=False, show_progress_bar=False)
342
-
343
- if self.device == "cuda":
344
- torch.cuda.synchronize()
345
-
346
- end_time = time.perf_counter()
347
-
348
- # Calculate metrics
349
- total_time = end_time - start_time
350
- time_per_text = total_time / batch_size
351
- texts_per_second = batch_size / total_time
352
-
353
- # Estimate tokens (rough approximation)
354
- avg_tokens = sum(len(text.split()) for text in batch_texts) / batch_size
355
- total_tokens = avg_tokens * batch_size
356
- tokens_per_second = total_tokens / total_time
357
-
358
- length_results[f"batch_{batch_size}"] = {
359
- "total_time_ms": total_time * 1000,
360
- "time_per_text_ms": time_per_text * 1000,
361
- "texts_per_second": texts_per_second,
362
- "tokens_per_second": tokens_per_second,
363
- "avg_tokens_per_text": avg_tokens,
364
- "embedding_shape": embeddings.shape
365
- if hasattr(embeddings, "shape")
366
- else f"({len(embeddings)}, {len(embeddings[0]) if embeddings else 0})",
367
- }
368
-
369
- speed_results[text_length] = length_results
370
-
371
- self.results["speed_benchmarks"] = speed_results
372
- return speed_results
373
-
374
- def benchmark_memory_scaling(self, batch_sizes: list[int] | None = None) -> dict[str, Any]:
375
- """Benchmark memory usage across batch sizes."""
376
- if batch_sizes is None:
377
- batch_sizes = [1, 8, 16, 32, 64, 128, 256]
378
- logger.info("💾 Benchmarking memory scaling...")
379
-
380
- if self.model is None:
381
- self.load_model()
382
-
383
- if self.model is None:
384
- msg = "Failed to load model"
385
- raise RuntimeError(msg)
386
-
387
- memory_results: dict[str, Any] = {}
388
- texts = BENCHMARK_TEXTS["medium"]
389
-
390
- baseline_memory = 0
391
- if self.device == "cuda":
392
- torch.cuda.empty_cache()
393
- baseline_memory = torch.cuda.memory_allocated()
394
-
395
- for batch_size in batch_sizes:
396
- if batch_size > len(texts):
397
- continue
398
-
399
- logger.info(f" 📊 Testing batch size: {batch_size}")
400
-
401
- # Clear cache
402
- if self.device == "cuda":
403
- torch.cuda.empty_cache()
404
- gc.collect()
405
-
406
- batch_texts = texts[:batch_size]
407
-
408
- # Measure memory before
409
- if self.device == "cuda":
410
- torch.cuda.memory_allocated()
411
-
412
- # Run inference
413
- try:
414
- embeddings = self.model.encode(
415
- batch_texts,
416
- convert_to_tensor=self.device == "cuda",
417
- show_progress_bar=False,
418
- )
419
-
420
- # Measure memory after
421
- memory_after = 0
422
- if self.device == "cuda":
423
- memory_after = torch.cuda.max_memory_allocated()
424
- torch.cuda.reset_peak_memory_stats()
425
-
426
- memory_used_mb = (memory_after - baseline_memory) / (1024 * 1024)
427
- memory_per_text_mb = memory_used_mb / batch_size if batch_size > 0 else 0
428
-
429
- memory_results[f"batch_{batch_size}"] = {
430
- "memory_used_mb": memory_used_mb,
431
- "memory_per_text_mb": memory_per_text_mb,
432
- "baseline_memory_mb": baseline_memory / (1024 * 1024),
433
- "peak_memory_mb": memory_after / (1024 * 1024),
434
- }
435
-
436
- # Clean up
437
- del embeddings
438
-
439
- except torch.cuda.OutOfMemoryError:
440
- logger.warning(f"❌ OOM at batch size {batch_size}")
441
- memory_results[f"batch_{batch_size}"] = {"oom": True}
442
- break
443
- except Exception as e:
444
- logger.warning(f"❌ Error at batch size {batch_size}: {e}")
445
- memory_results[f"batch_{batch_size}"] = {"error": str(e)}
446
-
447
- self.results["memory_benchmarks"] = memory_results
448
- return memory_results
449
-
450
- def benchmark_cpu_vs_gpu(self) -> dict[str, Any]:
451
- """Compare CPU vs GPU performance."""
452
- logger.info("🖥️ Benchmarking CPU vs GPU performance...")
453
-
454
- comparison_results = {}
455
- test_texts = BENCHMARK_TEXTS["medium"][:32] # Fixed batch size
456
-
457
- devices = ["cpu"]
458
- if torch.cuda.is_available():
459
- devices.append("cuda")
460
-
461
- for device in devices:
462
- logger.info(f" 🔄 Testing on {device}")
463
-
464
- # Load model on device
465
- try:
466
- model = SentenceTransformer(self.model_path, device=device)
467
-
468
- # Warmup
469
- _ = model.encode(test_texts[:2], convert_to_tensor=False)
470
-
471
- # Benchmark
472
- start_time = time.perf_counter()
473
- embeddings = model.encode(test_texts, convert_to_tensor=False, show_progress_bar=False)
474
- end_time = time.perf_counter()
475
-
476
- total_time = end_time - start_time
477
-
478
- comparison_results[device] = {
479
- "total_time_ms": total_time * 1000,
480
- "texts_per_second": len(test_texts) / total_time,
481
- "time_per_text_ms": (total_time / len(test_texts)) * 1000,
482
- "embedding_shape": embeddings.shape
483
- if hasattr(embeddings, "shape")
484
- else f"({len(embeddings)}, {len(embeddings[0]) if embeddings else 0})",
485
- }
486
-
487
- del model
488
- if device == "cuda":
489
- torch.cuda.empty_cache()
490
-
491
- except Exception as e:
492
- logger.warning(f"❌ Failed on {device}: {e}")
493
- comparison_results[device] = {"error": str(e)}
494
-
495
- self.results["cpu_vs_gpu"] = comparison_results
496
- return comparison_results
497
-
498
- def run_comprehensive_benchmark(self) -> dict[str, Any]:
499
- """Run all benchmarks and return comprehensive results."""
500
- logger.info(f"🚀 Starting comprehensive benchmark for {self.model_name}")
501
-
502
- # Load model
503
- self.load_model()
504
-
505
- # Run all benchmarks
506
- self.measure_model_size()
507
- self.benchmark_inference_speed()
508
- self.benchmark_memory_scaling()
509
- self.benchmark_cpu_vs_gpu()
510
-
511
- # Add metadata
512
- self.results["model_name"] = self.model_name
513
- self.results["model_path"] = self.model_path
514
- self.results["device"] = self.device
515
- self.results["torch_version"] = torch.__version__
516
- self.results["cuda_available"] = torch.cuda.is_available()
517
-
518
- if torch.cuda.is_available():
519
- self.results["gpu_name"] = torch.cuda.get_device_name(0)
520
- self.results["gpu_memory_gb"] = torch.cuda.get_device_properties(0).total_memory / (1024**3)
521
-
522
- # System info
523
- self.results["cpu_count"] = psutil.cpu_count()
524
- self.results["ram_gb"] = psutil.virtual_memory().total / (1024**3)
525
-
526
- logger.info("✅ Comprehensive benchmark completed!")
527
- return self.results
528
-
529
- def save_results(self, output_file: str) -> None:
530
- """Save benchmark results to JSON file."""
531
- output_path = Path(output_file)
532
- output_path.parent.mkdir(parents=True, exist_ok=True)
533
-
534
- with output_path.open("w") as f:
535
- json.dump(self.results, f, indent=2, default=str)
536
-
537
- logger.info(f"📄 Results saved to {output_path}")
538
-
539
- def print_summary(self) -> None:
540
- """Print a summary of benchmark results."""
541
- if not self.results:
542
- logger.warning("No results to summarize")
543
- return
544
-
545
- print(f"\n{'=' * 60}")
546
- print(f"Performance Benchmark Summary: {self.model_name}")
547
- print(f"{'=' * 60}")
548
-
549
- # Model size
550
- if "size_metrics" in self.results:
551
- size = self.results["size_metrics"]
552
- print("\n📏 Model Size:")
553
- print(f" Disk Size: {size.get('disk_size_mb', 0):.1f} MB")
554
- if "parameters_millions" in size:
555
- print(f" Parameters: {size['parameters_millions']:.1f}M")
556
- if "embedding_dim" in size:
557
- print(f" Embedding Dim: {size['embedding_dim']}")
558
-
559
- # Speed summary
560
- if "speed_benchmarks" in self.results:
561
- speed = self.results["speed_benchmarks"]
562
- print("\n⚡ Speed (medium texts, batch 32):")
563
- if "medium" in speed and "batch_32" in speed["medium"]:
564
- batch_32 = speed["medium"]["batch_32"]
565
- print(f" Throughput: {batch_32['texts_per_second']:.1f} texts/sec")
566
- print(f" Latency: {batch_32['time_per_text_ms']:.1f} ms/text")
567
- print(f" Token Speed: {batch_32['tokens_per_second']:.0f} tokens/sec")
568
-
569
- # CPU vs GPU
570
- if "cpu_vs_gpu" in self.results:
571
- comparison = self.results["cpu_vs_gpu"]
572
- print("\n🖥️ CPU vs GPU:")
573
- for device, metrics in comparison.items():
574
- if "error" not in metrics:
575
- print(f" {device.upper()}: {metrics['texts_per_second']:.1f} texts/sec")
576
-
577
- print()
578
-
579
-
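(For historical reference, the class removed here was driven roughly as follows; the model id and output path are illustrative.)

bench = PerformanceBenchmark("minishlab/potion-base-8M", "potion-base-8M")
bench.run_comprehensive_benchmark()  # size, speed, memory scaling, CPU vs GPU
bench.print_summary()
bench.save_results("benchmark_results/benchmark_potion-base-8M.json")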
580
- def run_benchmark(
581
- model_path: str | list[str],
582
- model_name: str | None = None,
583
- output: str = "benchmark_results.json",
584
- quick: bool = False,
585
- compare_models: list[str] | None = None,
586
- ) -> None:
587
- """Run benchmark for one or multiple models with comparison."""
588
- # Handle both single model and multiple models
589
- models_to_benchmark = [model_path] if isinstance(model_path, str) else model_path
590
-
591
- if compare_models:
592
- models_to_benchmark.extend(compare_models)
593
-
594
- all_results = []
595
-
596
- for i, model in enumerate(models_to_benchmark):
597
- current_model_name = model_name if i == 0 else Path(model).name
598
-
599
- print(f"\n{'=' * 60}")
600
- print(f"Benchmarking Model {i + 1}/{len(models_to_benchmark)}: {current_model_name}")
601
- print(f"{'=' * 60}")
602
-
603
- try:
604
- benchmarker = PerformanceBenchmark(model, current_model_name)
605
-
606
- if quick:
607
- # Quick benchmark
608
- benchmarker.load_model()
609
- benchmarker.measure_model_size()
610
- benchmarker.benchmark_inference_speed([1, 16, 32])
611
- else:
612
- # Comprehensive benchmark
613
- benchmarker.run_comprehensive_benchmark()
614
-
615
- all_results.append(benchmarker.results)
616
- benchmarker.print_summary()
617
-
618
- except Exception:
619
- logger.exception(f"❌ Failed to benchmark {current_model_name}")
620
- continue
621
-
622
- # Save individual results
623
- output_dir = Path(output).parent if Path(output).suffix else Path(output)
624
- output_dir.mkdir(parents=True, exist_ok=True)
625
-
626
- for results in all_results:
627
- model_name_safe = "".join(c for c in results["model_name"] if c.isalnum() or c in ("-", "_", "."))
628
- output_path = output_dir / f"benchmark_{model_name_safe}.json"
629
-
630
- with output_path.open("w") as f:
631
- json.dump(results, f, indent=2, default=str)
632
-
633
- logger.info(f"📄 Results saved to {output_path}")
634
-
635
- # Create comparison if multiple models
636
- if len(all_results) > 1:
637
- create_benchmark_comparison(all_results, str(output_dir / "benchmark_comparison.json"))
638
-
639
- print(f"\n✅ Benchmark complete! Results saved to {output_dir}")
640
-
641
-
642
- def create_benchmark_comparison(all_results: list[dict[str, Any]], output_path: str) -> None:
643
- """Create a comparison report for multiple benchmark results."""
644
- print(f"\n{'=' * 80}")
645
- print("Performance Benchmark Comparison")
646
- print(f"{'=' * 80}")
647
-
648
- comparison_data = []
649
-
650
- for results in all_results:
651
- model_name = results.get("model_name", "Unknown")
652
- size_metrics = results.get("size_metrics", {})
653
- speed_benchmarks = results.get("speed_benchmarks", {})
654
- cpu_vs_gpu = results.get("cpu_vs_gpu", {})
655
-
656
- # Extract key metrics
657
- row = {
658
- "Model": model_name,
659
- "Disk Size (MB)": size_metrics.get("disk_size_mb", 0),
660
- "Parameters (M)": size_metrics.get("parameters_millions", 0),
661
- "Embedding Dim": size_metrics.get("embedding_dim", 0),
662
- }
663
-
664
- # Speed metrics (medium texts, batch 32)
665
- if "medium" in speed_benchmarks and "batch_32" in speed_benchmarks["medium"]:
666
- batch_32 = speed_benchmarks["medium"]["batch_32"]
667
- row.update(
668
- {
669
- "Throughput (texts/sec)": batch_32.get("texts_per_second", 0),
670
- "Latency (ms/text)": batch_32.get("time_per_text_ms", 0),
671
- "Token Speed (tokens/sec)": batch_32.get("tokens_per_second", 0),
672
- }
673
- )
674
-
675
- # CPU vs GPU comparison
676
- for device in ["cpu", "cuda"]:
677
- if device in cpu_vs_gpu and "error" not in cpu_vs_gpu[device]:
678
- row[f"{device.upper()} Speed (texts/sec)"] = cpu_vs_gpu[device].get("texts_per_second", 0)
679
-
680
- comparison_data.append(row)
681
-
682
- # Create DataFrame and save
683
- df = pd.DataFrame(comparison_data)
684
-
685
- # Sort by throughput (descending)
686
- if "Throughput (texts/sec)" in df.columns:
687
- df = df.sort_values("Throughput (texts/sec)", ascending=False)
688
-
689
- # Print comparison table
690
- print(df.to_string(index=False, float_format="%.2f"))
691
-
692
- # Save comparison results
693
- comparison_summary = {
694
- "comparison_table": df.to_dict(orient="records"),
695
- "summary": {
696
- "fastest_model": df.iloc[0]["Model"] if len(df) > 0 else None,
697
- "smallest_model": df.loc[df["Disk Size (MB)"].idxmin()]["Model"] if len(df) > 0 else None,
698
- "most_efficient": df.loc[df["Throughput (texts/sec)"].idxmax()]["Model"]
699
- if "Throughput (texts/sec)" in df.columns and len(df) > 0
700
- else None,
701
- },
702
- "timestamp": time.time(),
703
- }
704
-
705
- with Path(output_path).open("w") as f:
706
- json.dump(comparison_summary, f, indent=2, default=str)
707
-
708
- print(f"\n📊 Comparison saved to {output_path}")
709
-
710
-
711
- def save_benchmark_results(
712
- results: dict[str, Any],
713
- output_dir: str,
714
- model_name: str,
715
- volume_results_dir: Path | None = None,
716
- ) -> None:
717
- """Save benchmark results to JSON file with Beam volume support."""
718
- # Save to Beam volume if available
719
- if volume_results_dir:
720
- volume_output_path = volume_results_dir / f"benchmark_{model_name}.json"
721
- try:
722
- with volume_output_path.open("w") as f:
723
- json.dump(results, f, indent=2, default=str)
724
- logger.info(f"💾 Results saved to Beam volume: {volume_output_path}")
725
- except Exception as e:
726
- logger.warning(f"⚠️ Failed to save to Beam volume: {e}")
727
-
728
- # Always save local backup
729
- output_path = Path(output_dir)
730
- output_path.mkdir(parents=True, exist_ok=True)
731
-
732
- # Clean model name for filename
733
- safe_name = "".join(c for c in model_name if c.isalnum() or c in ("-", "_", "."))
734
- filename = f"benchmark_{safe_name}.json"
735
- filepath = output_path / filename
736
-
737
- with filepath.open("w") as f:
738
- json.dump(results, f, indent=2, default=str)
739
-
740
- logger.info(f"📄 Local backup saved to {filepath}")
741
-
742
-
743
- def beam_benchmark_models(
744
- models: list[str],
745
- quick: bool = False,
746
- output_dir: str = DEFAULT_OUTPUT_DIR,
747
- volume_name: str = VOLUME_NAME,
748
- mount_path: str = VOLUME_PATH,
749
- ) -> list[dict[str, Any]]:
750
- """Main benchmarking function for Beam execution with checkpoint support."""
751
- logger.info("🚀 Starting Beam-powered performance benchmarking")
752
- logger.info(f"📊 Benchmarking {len(models)} models")
753
-
754
- # Initialize Beam utilities
755
- volume_mgr, checkpoint_mgr, model_mgr, eval_mgr = create_beam_utilities(volume_name, mount_path)
756
-
757
- # Create benchmark results directory in volume
758
- results_dir = Path(mount_path) / BENCHMARK_RESULTS_DIR
759
- results_dir.mkdir(parents=True, exist_ok=True)
760
-
761
- logger.info(f"📁 Using Beam volume: {volume_name} at {mount_path}")
762
- logger.info(f"💾 Benchmark results directory: {results_dir}")
763
-
764
- all_results = []
765
- skipped_models = []
766
-
767
- for model_path in models:
768
- model_name = Path(model_path).name if model_path != str(Path(mount_path)) else "gte_qwen2_m2v_code"
769
-
770
- # Check if this model has already been benchmarked (except for trained model)
771
- is_trained_model = model_path == str(Path(mount_path)) or model_name == "gte_qwen2_m2v_code"
772
-
773
- if not is_trained_model:
774
- # Check for existing benchmark results
775
- existing_result_file = results_dir / f"benchmark_{model_name}.json"
776
- if existing_result_file.exists():
777
- logger.info(f"✅ Model {model_name} already benchmarked - loading existing results")
778
- try:
779
- with existing_result_file.open("r") as f:
780
- existing_results = json.load(f)
781
- all_results.append(existing_results)
782
- skipped_models.append(model_name)
783
- continue
784
- except Exception as e:
785
- logger.warning(f"⚠️ Failed to load existing results for {model_name}: {e}")
786
- # Continue with benchmarking if loading fails
787
-
788
- logger.info(f"\n{'=' * 60}")
789
- logger.info(f"🔍 Benchmarking model: {model_name}")
790
- logger.info(f"📂 Path: {model_path}")
791
- if is_trained_model:
792
- logger.info("🎯 Trained model - always re-benchmark")
793
- logger.info(f"{'=' * 60}")
794
-
795
- try:
796
- # Distinguish between local paths and HuggingFace model names
797
- is_huggingface_model = (
798
- "/" in model_path and not model_path.startswith("/") and not Path(model_path).exists()
799
- )
800
-
801
- if is_huggingface_model:
802
- # This is a HuggingFace model name - pass directly to benchmarker
803
- logger.info(f"📥 Loading HuggingFace model: {model_path}")
804
- benchmarker = PerformanceBenchmark(
805
- model_path,
806
- model_name,
807
- checkpoint_manager=checkpoint_mgr,
808
- eval_manager=eval_mgr,
809
- )
810
- else:
811
- # This is a local path - check if it exists in Beam volume
812
- actual_model_path = model_path # Default to original path
813
- if not Path(model_path).exists() and not model_path.startswith("/"):
814
- # Try to load from Beam volume
815
- local_model_path = Path(mount_path) / model_name
816
- logger.info(f"🔍 Trying to load {model_name} from Beam volume: {local_model_path}")
817
- if local_model_path.exists():
818
- actual_model_path = str(local_model_path)
819
- logger.info(f"✅ Found model in Beam volume: {actual_model_path}")
820
- else:
821
- # Try in root of volume (for your trained model)
822
- root_model_path = Path(mount_path)
823
- if (root_model_path / "config.json").exists():
824
- actual_model_path = str(root_model_path)
825
- logger.info(f"✅ Found model in Beam volume root: {actual_model_path}")
826
- else:
827
- logger.warning(f"⚠️ Model not found locally or in Beam volume: {model_name}")
828
- continue
829
-
830
- benchmarker = PerformanceBenchmark(
831
- actual_model_path,
832
- model_name,
833
- checkpoint_manager=checkpoint_mgr,
834
- eval_manager=eval_mgr,
835
- )
836
-
837
- # Run benchmarking
838
- if quick:
839
- # Quick benchmark
840
- benchmarker.load_model()
841
- benchmarker.measure_model_size()
842
- benchmarker.benchmark_inference_speed([1, 16, 32])
843
- else:
844
- # Comprehensive benchmark
845
- benchmarker.run_comprehensive_benchmark()
846
-
847
- # Save results with Beam support
848
- save_benchmark_results(benchmarker.results, output_dir, model_name, results_dir)
849
-
850
- # Print summary
851
- benchmarker.print_summary()
852
-
853
- all_results.append(benchmarker.results)
854
-
855
- except Exception:
856
- logger.exception(f"❌ Failed to benchmark {model_name}")
857
- continue
858
-
859
- # Create comparison report in Beam volume
860
- if len(all_results) > 1:
861
- comparison_dir = results_dir / "comparisons"
862
- comparison_dir.mkdir(parents=True, exist_ok=True)
863
- create_benchmark_comparison(all_results, str(comparison_dir / "benchmark_comparison.json"))
864
- logger.info(f"📊 Comparison report saved to Beam volume: {comparison_dir}")
865
-
866
- # Log summary of what was done
867
- newly_benchmarked = len(all_results) - len(skipped_models)
868
- logger.info("\n✅ Beam benchmarking complete!")
869
- logger.info(f"📊 Newly benchmarked: {newly_benchmarked} models")
870
- logger.info(f"⏭️ Skipped (already done): {len(skipped_models)} models")
871
- logger.info(f"📁 Total results: {len(all_results)} models")
872
- logger.info(f"💾 Results available in Beam volume: {volume_name}")
873
-
874
- if skipped_models:
875
- logger.info(f"⏭️ Skipped models: {', '.join(skipped_models)}")
876
-
877
- return all_results
878
-
879
-
880
- @function(
881
- gpu=GPU_NAME,
882
- volumes=[Volume(name=VOLUME_NAME, mount_path=VOLUME_PATH)],
883
- image=IMAGE,
884
- secrets=["HF_ACCESS_TOKEN"],
885
- env={
886
- "TOKENIZERS_PARALLELISM": "false",
887
- "CUDA_LAUNCH_BLOCKING": "0",
888
- },
889
- timeout=3600 * 4, # 4 hours for benchmarking all models
890
- )
891
- def main() -> None:
892
- """Main benchmarking function - runs all default models on Beam."""
893
- logger.info("🚀 Starting comprehensive performance benchmarking on Beam")
894
-
895
- # Use default models but replace the local model path with Beam volume path
896
- models = DEFAULT_BENCHMARK_MODELS.copy()
897
-
898
- # Replace "gte_qwen2_m2v_code" with actual Beam volume path
899
- for i, model in enumerate(models):
900
- if model == "gte_qwen2_m2v_code":
901
- models[i] = str(Path(VOLUME_PATH)) # Use the Beam volume root
902
- logger.info(f"🎯 Using trained model from Beam volume: {models[i]}")
903
-
904
- # Discover simplified distillation models
905
- logger.info("🔍 Discovering simplified distillation models...")
906
- discovered_models = discover_simplified_models(".")
907
-
908
- # Add discovered models
909
- if discovered_models:
910
- logger.info(f"✅ Found {len(discovered_models)} simplified models:")
911
- for model_path in discovered_models:
912
- models.append(model_path)
913
- logger.info(f" 📁 {model_path}")
914
- else:
915
- logger.warning("⚠️ No simplified distillation models found")
916
-
917
- logger.info(f"📊 Benchmarking {len(models)} models:")
918
- for i, model in enumerate(models, 1):
919
- logger.info(f" {i}. {model}")
920
-
921
- logger.info("\n💡 Checkpoint Info:")
922
- logger.info(" - Already benchmarked models will be skipped")
923
- logger.info(" - Your trained model will always be re-benchmarked")
924
- logger.info(" - Results are saved persistently to Beam volume")
925
-
926
- # Run comprehensive benchmark using Beam utilities
927
- results = beam_benchmark_models(
928
- models=models,
929
- quick=True, # Use quick benchmark for efficiency
930
- output_dir=str(Path(VOLUME_PATH) / BENCHMARK_RESULTS_DIR),
931
- volume_name=VOLUME_NAME,
932
- mount_path=VOLUME_PATH,
933
- )
934
-
935
- # Print final summary
936
- print("\n🎯 Benchmarking Summary:")
937
- print(f"📊 Total models processed: {len(results)}")
938
- print(f"💾 Results saved to Beam volume: {VOLUME_NAME}")
939
- print(f"📁 Directory: {BENCHMARK_RESULTS_DIR}")
940
- print("\n🔍 To view analysis:")
941
- print(" beam run src.distiller.analyze:beam_analysis")
942
- print("\n📈 To run benchmarks again:")
943
- print(" distiller benchmark (will skip already completed models)")
944
-
945
-
946
- def discover_simplified_models(base_path: str = ".") -> list[str]:
947
- """
948
- Discover all simplified distillation models in the correct directory.
949
-
950
- Looks for directories matching the pattern: ./code_model2vec/final/code_model2vec_*
951
- """
952
- discovered_models: list[str] = []
953
-
954
- # Look in the correct location where distill_simplified.py saves models
955
- models_dir = Path(base_path) / "code_model2vec" / "final"
956
-
957
- if not models_dir.exists():
958
- logger.warning(f"Models directory not found: {models_dir}")
959
- return discovered_models
960
-
961
- # Look for simplified model directories with the updated pattern
962
- pattern = "code_model2vec_*"
963
- for model_dir in models_dir.glob(pattern):
964
- if model_dir.is_dir() and (model_dir / "config.json").exists():
965
- discovered_models.append(str(model_dir))
966
- logger.info(f"🔍 Discovered simplified model: {model_dir}")
967
-
968
- # Sort alphabetically for consistent ordering
969
- discovered_models.sort()
970
-
971
- return discovered_models
972
-
973
-
974
- @function(
975
- gpu=GPU_NAME,
976
- volumes=[Volume(name=VOLUME_NAME, mount_path=VOLUME_PATH)],
977
- image=IMAGE,
978
- secrets=["HF_ACCESS_TOKEN"],
979
- env={
980
- "TOKENIZERS_PARALLELISM": "false",
981
- "CUDA_LAUNCH_BLOCKING": "0",
982
- },
983
- timeout=3600 * 3, # 3 hours for simplified models only
984
- )
985
- def benchmark_simplified_only() -> None:
986
- """Benchmark only simplified distillation models, skipping 3rd party models."""
987
- logger.info("🚀 Starting simplified distillation models benchmarking on Beam")
988
- logger.info("⏭️ Skipping 3rd party models - benchmarking only simplified distillation models")
989
-
990
- # Discover simplified distillation models
991
- logger.info("🔍 Discovering simplified distillation models...")
992
- discovered_models = discover_simplified_models(".")
993
-
994
- if not discovered_models:
995
- logger.error("❌ No simplified distillation models found! Run distill-simple first.")
996
- return
997
-
998
- logger.info(f"✅ Found {len(discovered_models)} simplified models:")
999
- for model_path in discovered_models:
1000
- logger.info(f" 📁 {model_path}")
1001
-
1002
- logger.info("\n💡 Checkpoint Info:")
1003
- logger.info(" - Already benchmarked models will be skipped")
1004
- logger.info(" - Results are saved persistently to Beam volume")
1005
-
1006
- # Run comprehensive benchmark using Beam utilities
1007
- results = beam_benchmark_models(
1008
- models=discovered_models,
1009
- quick=True, # Use quick benchmark for efficiency
1010
- output_dir=str(Path(VOLUME_PATH) / BENCHMARK_RESULTS_DIR),
1011
- volume_name=VOLUME_NAME,
1012
- mount_path=VOLUME_PATH,
1013
- )
1014
-
1015
- # Print final summary
1016
- print("\n🎯 Simplified Benchmarking Summary:")
1017
- print(f"📊 Total simplified models processed: {len(results)}")
1018
- print(f"💾 Results saved to Beam volume: {VOLUME_NAME}")
1019
- print(f"📁 Directory: {BENCHMARK_RESULTS_DIR}")
1020
- print("⏭️ 3rd party models were skipped")
1021
- print("\n🔍 To view analysis:")
1022
- print(" distiller analyze")
1023
- print("\n📈 To run full benchmarks (including 3rd party):")
1024
- print(" distiller benchmark")
1025
-
1026
-
1027
- def run_local_benchmark(
1028
- models: list[str] | None = None,
1029
- quick: bool = False,
1030
- output_dir: str = DEFAULT_OUTPUT_DIR,
1031
- ) -> list[dict[str, Any]]:
1032
- """Main benchmarking function for local execution without Beam utilities."""
1033
- logger.info("🖥️ Running performance benchmarking locally")
1034
-
1035
- if models is None:
1036
- models = DEFAULT_BENCHMARK_MODELS.copy()
1037
-
1038
- # Replace "gte_qwen2_m2v_code" with a reasonable local path
1039
- for i, model in enumerate(models):
1040
- if model == "gte_qwen2_m2v_code":
1041
- # Look for local trained model
1042
- local_model_paths = [
1043
- "./gte_qwen2_m2v_code",
1044
- "./models/gte_qwen2_m2v_code",
1045
- "./output/gte_qwen2_m2v_code",
1046
- ]
1047
- found = False
1048
- for local_path in local_model_paths:
1049
- if Path(local_path).exists():
1050
- models[i] = local_path
1051
- logger.info(f"🎯 Found local trained model: {local_path}")
1052
- found = True
1053
- break
1054
- if not found:
1055
- logger.warning("⚠️ Local trained model not found, skipping")
1056
- models.pop(i)
1057
-
1058
- # Discover simplified distillation models
1059
- logger.info("🔍 Discovering simplified distillation models...")
1060
- discovered_models = discover_simplified_models(".")
1061
-
1062
- # Add discovered models
1063
- if discovered_models:
1064
- logger.info(f"✅ Found {len(discovered_models)} simplified models:")
1065
- for model_path in discovered_models:
1066
- models.append(model_path)
1067
- logger.info(f" 📁 {model_path}")
1068
- else:
1069
- logger.warning("⚠️ No simplified distillation models found")
1070
-
1071
- logger.info(f"📊 Benchmarking {len(models)} models")
1072
- logger.info(f"📁 Using local output directory: {output_dir}")
1073
-
1074
- # Create local output directory
1075
- output_path = Path(output_dir)
1076
- output_path.mkdir(parents=True, exist_ok=True)
1077
-
1078
- all_results = []
1079
- skipped_models = []
1080
-
1081
- for model_path in models:
1082
- model_name = Path(model_path).name
1083
-
1084
- # Check for existing benchmark results locally
1085
- safe_name = "".join(c for c in model_name if c.isalnum() or c in ("-", "_", "."))
1086
- result_file = output_path / f"benchmark_{safe_name}.json"
1087
-
1088
- if result_file.exists():
1089
- logger.info(f"✅ Model {model_name} already benchmarked - loading existing results")
1090
- try:
1091
- with result_file.open("r") as f:
1092
- existing_results = json.load(f)
1093
- all_results.append(existing_results)
1094
- skipped_models.append(model_name)
1095
- continue
1096
- except Exception as e:
1097
- logger.warning(f"⚠️ Failed to load existing results for {model_name}: {e}")
1098
-
1099
- logger.info(f"\n{'=' * 60}")
1100
- logger.info(f"🔍 Benchmarking model: {model_name}")
1101
- logger.info(f"📂 Path: {model_path}")
1102
- logger.info(f"{'=' * 60}")
1103
-
1104
- try:
1105
- # Create benchmarker without Beam utilities
1106
- benchmarker = PerformanceBenchmark(
1107
- model_path,
1108
- model_name,
1109
- checkpoint_manager=None, # No checkpointing for local benchmarking
1110
- eval_manager=None,
1111
- )
1112
-
1113
- # Run benchmarking
1114
- if quick:
1115
- # Quick benchmark
1116
- benchmarker.load_model()
1117
- benchmarker.measure_model_size()
1118
- benchmarker.benchmark_inference_speed([1, 16, 32])
1119
- else:
1120
- # Comprehensive benchmark
1121
- benchmarker.run_comprehensive_benchmark()
1122
-
1123
- # Save results locally only
1124
- save_benchmark_results(benchmarker.results, output_dir, model_name, volume_results_dir=None)
1125
-
1126
- # Print summary
1127
- benchmarker.print_summary()
1128
-
1129
- all_results.append(benchmarker.results)
1130
-
1131
- except Exception:
1132
- logger.exception(f"❌ Failed to benchmark {model_name}")
1133
- continue
1134
-
1135
- # Create comparison report locally
1136
- if len(all_results) > 1:
1137
- create_benchmark_comparison(all_results, str(output_path / "benchmark_comparison.json"))
1138
- logger.info(f"📊 Comparison report saved locally: {output_dir}")
1139
-
1140
- # Log summary
1141
- newly_benchmarked = len(all_results) - len(skipped_models)
1142
- logger.info("\n✅ Local benchmarking complete!")
1143
- logger.info(f"📊 Newly benchmarked: {newly_benchmarked} models")
1144
- logger.info(f"⏭️ Skipped (already done): {len(skipped_models)} models")
1145
- logger.info(f"📁 Total results: {len(all_results)} models")
1146
- logger.info(f"💾 Results available locally: {output_dir}")
1147
-
1148
- if skipped_models:
1149
- logger.info(f"⏭️ Skipped models: {', '.join(skipped_models)}")
1150
-
1151
- return all_results
1152
-
1153
-
1154
- def run_local_benchmark_simplified(
1155
- quick: bool = False,
1156
- output_dir: str = DEFAULT_OUTPUT_DIR,
1157
- ) -> list[dict[str, Any]]:
1158
- """Local benchmarking function for simplified models only."""
1159
- logger.info("🖥️ Running simplified model benchmarking locally")
1160
-
1161
- # Discover simplified distillation models only
1162
- logger.info("🔍 Discovering simplified distillation models...")
1163
- discovered_models = discover_simplified_models(".")
1164
-
1165
- if not discovered_models:
1166
- logger.error("❌ No simplified distillation models found! Run 'distiller distill-simple' first.")
1167
- return []
1168
-
1169
- logger.info(f"✅ Found {len(discovered_models)} simplified models:")
1170
- for model_path in discovered_models:
1171
- logger.info(f" 📁 {model_path}")
1172
-
1173
- return run_local_benchmark(
1174
- models=discovered_models,
1175
- quick=quick,
1176
- output_dir=output_dir,
1177
- )
1178
-
1179
-
1180
- if __name__ == "__main__":
1181
- main()
 
 
src/distiller/config.py ADDED
@@ -0,0 +1,339 @@
 
 
1
+ """
2
+ Shared configuration for the distiller package.
3
+
4
+ This module centralizes all configuration constants, default values, and common
5
+ settings used across distillation, evaluation, and benchmarking modules.
6
+ """
7
+
8
+ import logging
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from beam import GpuType, Image
13
+ from pydantic import BaseModel
14
+
15
+ # =============================================================================
16
+ # LOGGING CONFIGURATION
17
+ # =============================================================================
18
+
19
+
20
+ def setup_logging(level: int = logging.INFO) -> None:
21
+ """Set up consistent logging across the package."""
22
+ log_dir = Path("logs")
23
+ log_dir.mkdir(parents=True, exist_ok=True)
24
+ log_path = log_dir / "distiller.log"
25
+ logging.basicConfig(
26
+ level=level,
27
+ format="%(asctime)s - %(levelname)s - %(message)s",
28
+ handlers=[logging.StreamHandler(), logging.FileHandler(log_path, mode="a")],
29
+ )
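
Usage sketch (assuming the package is importable as distiller): call setup_logging once at startup, then use standard loggers; records go to the console and to logs/distiller.log.

    import logging

    from distiller.config import setup_logging

    # Configure console + file handlers once, before any other distiller call.
    setup_logging()  # or setup_logging(logging.DEBUG) for verbose runs
    logging.getLogger(__name__).info("distiller logging initialised")
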
30
+
31
+
32
+ # =============================================================================
33
+ # BEAM CLOUD CONFIGURATION
34
+ # =============================================================================
35
+
36
+ # Beam execution settings
37
+ GPU_NAME = GpuType.A100_40
38
+
39
+
40
+ # Volume configurations for different workflows
41
+ class VolumeConfig(BaseModel):
42
+ """Volume configuration container."""
43
+
44
+ name: str
45
+ mount_path: str
46
+ description: str = ""
47
+
48
+
49
+ # Define volume configurations - code_model2vec is the primary volume for all workflows
50
+ VOLUMES: dict[str, VolumeConfig] = {
51
+ "primary": VolumeConfig(
52
+ name="code_model2vec",
53
+ mount_path="./code_model2vec",
54
+ description="Primary volume for all distillation models, evaluations, benchmarks, and checkpoints",
55
+ ),
56
+ # Legacy volume name mapping for backwards compatibility
57
+ "simplified": VolumeConfig(
58
+ name="code_model2vec",
59
+ mount_path="./code_model2vec",
60
+ description="Primary volume for all distillation models, evaluations, benchmarks, and checkpoints",
61
+ ),
62
+ }
63
+
64
+ # Default volume name for all workflows
65
+ DEFAULT_VOLUME = "primary"
66
+
67
+ # Beam environment settings
68
+ BEAM_ENV_SETTINGS: dict[str, str] = {
69
+ "TOKENIZERS_PARALLELISM": "false",
70
+ "CUDA_LAUNCH_BLOCKING": "0",
71
+ "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True,max_split_size_mb:512",
72
+ "TORCH_CUDNN_V8_API_ENABLED": "1",
73
+ }
74
+
75
+ # Common Python packages for Beam images
76
+ COMMON_PACKAGES: list[str] = [
77
+ "torch>=2.7.0",
78
+ "transformers>=4.40.0",
79
+ "datasets>=3.2.0",
80
+ "sentence-transformers>=4.1.0",
81
+ "model2vec[train]>=0.5.0",
82
+ "numpy>=1.26.4",
83
+ "scikit-learn>=1.6.1",
84
+ "pandas>=2.0.0",
85
+ "tqdm>=4.65.0",
86
+ "plotly>=5.0.0",
87
+ "matplotlib>=3.7.0",
88
+ "seaborn>=0.12.0",
89
+ ]
90
+
91
+ # Create common Beam image
92
+ IMAGE = Image(python_version="python3.12").add_python_packages(COMMON_PACKAGES)
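
These constants are consumed by the Beam-decorated entry points elsewhere in the package; a hedged sketch of that wiring (the task body is illustrative, the decorator arguments mirror the pattern used in this repository):

    from beam import Volume, function

    from distiller.config import BEAM_ENV_SETTINGS, GPU_NAME, IMAGE, get_volume_config

    volume = get_volume_config()

    @function(
        gpu=GPU_NAME,
        image=IMAGE,
        volumes=[Volume(name=volume.name, mount_path=volume.mount_path)],
        env=BEAM_ENV_SETTINGS,
        timeout=3600,
    )
    def remote_task() -> str:
        # Runs on Beam with the shared GPU, image, environment, and volume settings.
        return "ok"
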
93
+
94
+ # =============================================================================
95
+ # MODEL CONFIGURATION
96
+ # =============================================================================
97
+
98
+ # Teacher model configurations
99
+ TEACHER_MODELS: list[str] = [
100
+ "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
101
+ "BAAI/bge-m3",
102
+ "jinaai/jina-embeddings-v3",
103
+ "lightonai/Reason-ModernColBERT",
104
+ "Linq-AI-Research/Linq-Embed-Mistral",
105
+ "microsoft/codebert-base",
106
+ "microsoft/graphcodebert-base",
107
+ "nomic-ai/nomic-embed-text-v2-moe",
108
+ "Qodo/Qodo-Embed-1-1.5B",
109
+ "sentence-transformers/all-MiniLM-L6-v2",
110
+ "sentence-transformers/all-mpnet-base-v2",
111
+ "sentence-transformers/paraphrase-MiniLM-L6-v2",
112
+ "nomic-ai/nomic-embed-code",
113
+ "nomic-ai/CodeRankEmbed",
114
+ ]
115
+
116
+ # Default evaluation models for comparison
117
+ DEFAULT_EVALUATION_MODELS: list[str] = [
118
+ "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
119
+ "BAAI/bge-m3",
120
+ "huggingface/CodeBERTa-small-v1",
121
+ "jinaai/jina-embeddings-v3",
122
+ "lightonai/Reason-ModernColBERT",
123
+ "Linq-AI-Research/Linq-Embed-Mistral",
124
+ "microsoft/codebert-base",
125
+ "microsoft/graphcodebert-base",
126
+ "minishlab/potion-base-8M",
127
+ "minishlab/potion-retrieval-32M",
128
+ "nomic-ai/nomic-embed-text-v2-moe",
129
+ "Qodo/Qodo-Embed-1-1.5B",
130
+ "Salesforce/codet5-base",
131
+ "sentence-transformers/all-MiniLM-L12-v2",
132
+ "sentence-transformers/all-MiniLM-L6-v2",
133
+ "sentence-transformers/all-mpnet-base-v2",
134
+ "sentence-transformers/paraphrase-MiniLM-L6-v2",
135
+ "nvidia/NV-Embed-v2",
136
+ "nomic-ai/nomic-embed-code",
137
+ "nomic-ai/CodeRankEmbed",
138
+ ]
139
+
140
+
141
+ # Model2Vec distillation parameters
142
+ class DistillationConfig(BaseModel):
143
+ """Configuration for Model2Vec distillation parameters."""
144
+
145
+ # Teacher models for distillation
146
+ code_teacher_models: list[str] = TEACHER_MODELS
147
+
148
+ # Basic distillation parameters
149
+ optimal_pca_dims: int = 256
150
+ sif_coefficient: float = 1e-3
151
+ apply_zipf: bool = True
152
+
153
+ # Training parameters (used when --train flag is enabled)
154
+ training_epochs: int = 2
155
+ learning_rate: float = 1e-4
156
+ batch_size: int = 32
157
+ max_training_samples: int = 50000
158
+ teacher_model_config: dict[str, Any] = {}
159
+
160
+
161
+ distillation_config = DistillationConfig()
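
Because these settings are plain Pydantic models, callers can read the shared instance or build a one-off variant without mutating global state; a brief sketch (override values illustrative):

    from distiller.config import DistillationConfig, distillation_config

    # Read the shared defaults used by the distillation pipeline.
    print(distillation_config.optimal_pca_dims)      # 256
    print(distillation_config.max_training_samples)  # 50000

    # Derive an experiment-specific configuration instead of editing the global instance.
    experiment_config = DistillationConfig(optimal_pca_dims=512, training_epochs=4)
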
162
+
163
+
164
+ # =============================================================================
165
+ # DATASET CONFIGURATION
166
+ # =============================================================================
167
+
168
+
169
+ # Languages used for dataset loading and CodeSearchNet evaluation
170
+ class LanguagesConfig(BaseModel):
171
+ """Configuration for languages used in evaluation."""
172
+
173
+ all: list[str] = [
174
+ "python",
175
+ "java",
176
+ "javascript",
177
+ "php",
178
+ "ruby",
179
+ "go",
180
+ ]
181
+
182
+
183
+ languages_config = LanguagesConfig()
184
+
185
+
186
+ # CodeSearchNet evaluation settings (evaluation_languages defaults to languages_config.all)
187
+ class CodeSearchNetConfig(BaseModel):
188
+ """Configuration for CodeSearchNet evaluation settings."""
189
+
190
+ dataset_name: str = "code_search_net"
191
+ evaluation_languages: list[str] = languages_config.all
192
+ max_queries_per_language: int = 1000
193
+ similarity_threshold: float = 0.7
194
+ evaluation_metrics: list[str] = ["ndcg@1", "ndcg@5", "ndcg@10", "mrr", "recall@1", "recall@5", "recall@10"]
195
+
196
+
197
+ codesearchnet_config = CodeSearchNetConfig()
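
A short sketch of how an evaluation loop might consume these settings (the print is a stand-in for the real evaluation call):

    from distiller.config import codesearchnet_config

    for language in codesearchnet_config.evaluation_languages:
        # One retrieval evaluation per language, capped at max_queries_per_language queries.
        print(
            f"{codesearchnet_config.dataset_name} [{language}]: "
            f"up to {codesearchnet_config.max_queries_per_language} queries, "
            f"metrics {codesearchnet_config.evaluation_metrics}"
        )
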
198
+
199
+ # Training dataset configurations
200
+ TRAINING_DATASETS: dict[str, str] = {
201
+ "codesearchnet": "sentence-transformers/codesearchnet",
202
+ "code_search_net": "code_search_net",
203
+ }
204
+
205
+ # =============================================================================
206
+ # OUTPUT DIRECTORY CONFIGURATION
207
+ # =============================================================================
208
+
209
+
210
+ # Standardized directory structure within code_model2vec
211
+ class StandardDirectories(BaseModel):
212
+ """Standardized directory structure for code_model2vec workspace."""
213
+
214
+ # Root directory
215
+ root: str = "code_model2vec"
216
+
217
+ # Model directories
218
+ base: str = "code_model2vec/base" # Basic distilled models
219
+ final: str = "code_model2vec/final" # Final trained models
220
+ models: str = "code_model2vec/models" # Legacy/alternative models location
221
+
222
+ # Results directories
223
+ evaluation_results: str = "code_model2vec/evaluation_results"
224
+ benchmark_results: str = "code_model2vec/benchmark_results"
225
+ analysis_results: str = "code_model2vec/analysis_results"
226
+
227
+ # Working directories
228
+ checkpoints: str = "code_model2vec/checkpoints"
229
+ cache: str = "code_model2vec/cache"
230
+ temp: str = "code_model2vec/temp"
231
+
232
+
233
+ # Create global instance
234
+ directories = StandardDirectories()
235
+
236
+
237
+ # Legacy OutputDirs for backwards compatibility
238
+ class OutputDirs(BaseModel):
239
+ """Base output directory structure for storing models, checkpoints, and results."""
240
+
241
+ base: str = "base"
242
+ models: str = "final"
243
+ checkpoints: str = "checkpoints"
244
+ evaluation_results: str = "evaluation_results"
245
+ benchmark_results: str = "benchmark_results"
246
+ analysis_results: str = "analysis_results"
247
+ cache: str = "cache"
248
+
249
+
250
+ output_dirs = OutputDirs()
251
+
252
+
253
+ # File naming patterns
254
+ class FilenamePatterns(BaseModel):
255
+ """File naming patterns for evaluation, benchmark, checkpoint, and model files."""
256
+
257
+ evaluation: str = "codesearchnet_eval_{model_name}.json"
258
+ benchmark: str = "benchmark_{model_name}.json"
259
+ checkpoint: str = "checkpoints_{stage}_step_{step}.json"
260
+ model: str = "{teacher_model}_{dims}d"
261
+
262
+
263
+ filename_patterns = FilenamePatterns()
264
+
265
+ # =============================================================================
266
+ # ANALYSIS AND VISUALIZATION
267
+ # =============================================================================
268
+
269
+
270
+ # Chart configuration
271
+ class ChartConfig(BaseModel):
272
+ """Chart configuration for analysis and visualization."""
273
+
274
+ figsize: tuple[int, int] = (12, 8)
275
+ dpi: int = 300
276
+ style: str = "whitegrid"
277
+ color_palette: str = "Set2"
278
+ save_formats: list[str] = ["png", "pdf"]
279
+
280
+
281
+ chart_config = ChartConfig()
282
+
283
+
284
+ # Performance thresholds for analysis
285
+ class PerformanceThresholds(BaseModel):
286
+ """Performance thresholds for analysis results."""
287
+
288
+ excellent: float = 0.7
289
+ good: float = 0.5
290
+ fair: float = 0.3
291
+ poor: float = 0.1
292
+
293
+
294
+ performance_thresholds = PerformanceThresholds()
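
For illustration, a hypothetical rate_score helper shows how an aggregate metric such as mean NDCG@10 maps onto these buckets:

    from distiller.config import performance_thresholds

    def rate_score(score: float) -> str:
        """Hypothetical helper: bucket an aggregate metric into a qualitative label."""
        if score >= performance_thresholds.excellent:
            return "excellent"
        if score >= performance_thresholds.good:
            return "good"
        if score >= performance_thresholds.fair:
            return "fair"
        if score >= performance_thresholds.poor:
            return "poor"
        return "very poor"

    print(rate_score(0.62))  # -> "good"
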
295
+
296
+ # =============================================================================
297
+ # HELPER FUNCTIONS
298
+ # =============================================================================
299
+
300
+
301
+ def get_volume_config() -> VolumeConfig:
302
+ """Get volume configuration for any workflow - always returns the primary code_model2vec volume."""
303
+ return VOLUMES["primary"]
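
A small sketch confirming that every workflow, including the legacy "simplified" alias, resolves to the same primary volume:

    from distiller.config import VOLUMES, get_volume_config

    volume = get_volume_config()
    print(volume.name, volume.mount_path)             # code_model2vec ./code_model2vec
    print(VOLUMES["simplified"].name == volume.name)  # True: the alias maps to the primary volume
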
304
+
305
+
306
+ def get_output_path(base_path: str | Path, output_type: str) -> Path:
307
+ """Get standardized output path for different types of outputs."""
308
+ base = Path(base_path)
309
+ if hasattr(output_dirs, output_type):
310
+ return base / getattr(output_dirs, output_type)
311
+ return base / output_type
312
+
313
+
314
+ def get_standard_directory(dir_type: str) -> str:
315
+ """Get standardized directory path for any directory type."""
316
+ if hasattr(directories, dir_type):
317
+ return getattr(directories, dir_type)
318
+ # Default to relative path within code_model2vec
319
+ return f"code_model2vec/{dir_type}"
320
+
321
+
322
+ def ensure_checkpoint_directory(stage: str) -> str:
323
+ """Ensure checkpoint directory exists for a specific stage and return the path."""
324
+ checkpoint_dir = f"{directories.checkpoints}/{stage}"
325
+ Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
326
+ return checkpoint_dir
327
+
328
+
329
+ def format_filename(pattern_key: str, **kwargs: Any) -> str:
330
+ """Format filename using predefined patterns."""
331
+ if hasattr(filename_patterns, pattern_key):
332
+ return getattr(filename_patterns, pattern_key).format(**kwargs)
333
+ msg = f"Unknown filename pattern: {pattern_key}"
334
+ raise ValueError(msg)
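
A quick usage sketch for the filename helper (the model name is illustrative):

    from distiller.config import format_filename

    print(format_filename("evaluation", model_name="potion-base-8M"))
    # -> codesearchnet_eval_potion-base-8M.json
    print(format_filename("checkpoint", stage="dataset", step=0))
    # -> checkpoints_dataset_step_0.json
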
335
+
336
+
337
+ def get_safe_model_name(model_name: str) -> str:
338
+ """Convert model name to filesystem-safe name."""
339
+ return "".join(c for c in model_name if c.isalnum() or c in ("-", "_", ".")).replace("/", "_")
src/distiller/distill.py CHANGED
@@ -1,31 +1,35 @@
1
  """
2
- Code-Specialized Model2Vec Distillation Script with Checkpoint Support.
3
 
4
- This script implements a focused approach for creating code-specialized embeddings
5
- using Model2Vec distillation with one additional training round on code-specific tasks.
6
 
7
  Features:
8
- - Incremental checkpoint saving
9
- - Resume from previous progress
10
- - Persistent storage of embeddings and models
11
- - Robust error handling and recovery
12
- - Smart checkpoint validation for parameter compatibility
13
-
14
- Approach:
15
- 1. Basic Model2Vec distillation with optimized parameters
16
- 2. Single code specialization round using sentence-transformers/codesearchnet dataset
 
 
 
 
17
  """
18
 
19
  import json
20
  import logging
21
- import os
22
  import time
23
  from pathlib import Path
24
- from typing import Any
25
 
26
  import numpy as np
27
  import torch
28
- from beam import GpuType, Image, Volume, function
 
29
  from datasets import load_dataset
30
  from model2vec.distill import distill
31
  from model2vec.train.base import FinetunableStaticModel, TextDataset
@@ -35,239 +39,474 @@ from torch import nn, optim
35
 
36
  from .beam_utils import (
37
  BeamCheckpointManager,
38
- BeamModelManager,
39
  create_beam_utilities,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  )
41
 
42
  # =============================================================================
43
- # CODE-FOCUSED CONFIGURATION
44
  # =============================================================================
45
 
46
- # Model Configuration
47
- MODEL_NAME = "Alibaba-NLP/gte-Qwen2-7B-instruct"
48
- OUTPUT_DIR = "gte_qwen2_m2v_code"
49
- CHECKPOINT_DIR = "gte_qwen2_m2v_code/checkpoints"
50
-
51
- # Code-optimized parameters
52
- PCA_DIMS = 512 # Higher dims for code complexity
53
- TRAINING_EPOCHS = 2
54
- LEARNING_RATE = 1e-4
55
- BATCH_SIZE = 32
56
- REGULARIZATION_WEIGHT = 0.01
57
-
58
- # CodeSearchNet dataset configuration
59
- CODESEARCHNET_DATASET = "sentence-transformers/codesearchnet"
60
- MAX_TRAINING_SAMPLES = 50000 # Limit for manageable training time
61
-
62
- # Checkpoint configuration
63
- CHECKPOINT_INTERVAL = 1000 # Save every N samples
64
- EMBEDDINGS_BATCH_SIZE = 100 # Save embeddings in smaller batches
65
-
66
- # OPTIMIZED TEACHER MODEL CONFIGURATION FOR 40GB VRAM
67
- TEACHER_MODEL_CONFIG: dict[str, Any] = {
68
- "batch_size": 12, # Slightly reduced due to float32 memory usage
69
- "precision": "float32", # Use float32 for quality preservation
70
- "max_seq_length": 8192, # Reduce from 32k default for better performance
71
- "device_map": "auto", # Automatic device placement
72
- "torch_dtype": torch.float32, # Use float32 for quality preservation
73
- "trust_remote_code": True,
74
- "use_flash_attention": True, # Try to enable flash attention if available
75
- "attn_implementation": "flash_attention_2", # Use flash attention 2 if available
76
- }
77
 
78
  # =============================================================================
79
- # BEAM CONFIGURATION
80
  # =============================================================================
81
 
82
- GPU_NAME = GpuType.A100_40
83
- VOLUME_NAME = "gte_qwen2_m2v_code"
84
- VOLUME_PATH = "./gte_qwen2_m2v_code"
85
- IMAGE = Image(python_version="python3.12").add_python_packages(
86
- [
87
- "torch>=2.7.0", # Install torch first
88
- "transformers>=4.40.0", # Latest transformers with flash attention support
89
- "accelerate>=1.7.0",
90
- "datasets>=3.2.0",
91
- "model2vec[train]>=0.5.0",
92
- "numpy>=1.26.4",
93
- "scikit-learn>=1.6.1",
94
- "sentence-transformers>=4.1.0",
95
- ]
96
- )
97
 
98
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
99
- logger = logging.getLogger(__name__)
 
 
 
100
 
 
 
 
 
 
101
 
102
- def get_current_config_hash() -> str:
 
 
 
 
 
 
 
103
  """Generate a hash of current configuration parameters for checkpoint validation."""
104
  import hashlib
105
 
106
  config_params = {
107
- "model_name": MODEL_NAME,
108
- "pca_dims": PCA_DIMS,
109
- "precision": TEACHER_MODEL_CONFIG["precision"],
110
- "torch_dtype": str(TEACHER_MODEL_CONFIG["torch_dtype"]),
111
- "max_samples": MAX_TRAINING_SAMPLES,
112
- "codesearchnet_dataset": CODESEARCHNET_DATASET,
113
  }
114
 
 
 
 
 
 
 
 
 
 
115
  config_str = str(sorted(config_params.items()))
116
  return hashlib.md5(config_str.encode()).hexdigest()[:12] # noqa: S324
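
This hash ties saved checkpoints to the parameters that produced them; a minimal sketch of the gating idea (the helper name and checkpoint layout are illustrative, not the module's actual API):

    from typing import Any

    def checkpoint_is_reusable(checkpoint_data: dict[str, Any], current_hash: str) -> bool:
        """Reuse a checkpoint only when it was written under the configuration now in effect."""
        return checkpoint_data.get("config_hash") == current_hash
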
117
 
118
 
119
- def validate_checkpoint_compatibility(checkpoint_data: dict[str, Any]) -> bool:
120
- """
121
- Validate if checkpoint is compatible with current configuration.
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
- Args:
124
- checkpoint_data: Checkpoint data dictionary
 
125
 
126
- Returns:
127
- True if compatible, False otherwise
128
- """
129
- current_hash = get_current_config_hash()
130
- checkpoint_hash = checkpoint_data.get("config_hash", "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- if checkpoint_hash != current_hash:
133
- logger.warning(f"Configuration mismatch: current={current_hash}, checkpoint={checkpoint_hash}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  return False
135
 
136
- # Additional validation checks
137
- checkpoint_config = checkpoint_data.get("config", {})
138
 
139
- # Check critical parameters
140
- if checkpoint_config.get("pca_dims") != PCA_DIMS:
141
- logger.warning(f"PCA dimensions mismatch: current={PCA_DIMS}, checkpoint={checkpoint_config.get('pca_dims')}")
 
 
 
 
142
  return False
143
 
144
- if checkpoint_config.get("precision") != TEACHER_MODEL_CONFIG["precision"]:
145
- logger.warning(
146
- f"Precision mismatch: current={TEACHER_MODEL_CONFIG['precision']}, checkpoint={checkpoint_config.get('precision')}"
147
- )
 
 
 
 
 
 
 
148
  return False
149
 
150
- if checkpoint_config.get("max_samples") != MAX_TRAINING_SAMPLES:
151
- logger.warning(
152
- f"Max samples mismatch: current={MAX_TRAINING_SAMPLES}, checkpoint={checkpoint_config.get('max_samples')}"
153
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  return False
155
 
156
- logger.info("✅ Checkpoint configuration is compatible")
157
- return True
 
 
 
 
 
 
158
 
159
 
160
- def create_checkpoint_data(stage: str, data: dict[str, Any], step: int = 0) -> dict[str, Any]:
 
 
 
 
 
161
  """
162
- Create checkpoint data with configuration metadata.
163
 
164
  Args:
165
- stage: Checkpoint stage name
166
- data: Core checkpoint data
167
- step: Step number
 
168
 
169
  Returns:
170
- Enhanced checkpoint data with configuration
171
  """
172
- return {
173
- "config_hash": get_current_config_hash(),
174
- "config": {
175
- "model_name": MODEL_NAME,
176
- "pca_dims": PCA_DIMS,
177
- "precision": TEACHER_MODEL_CONFIG["precision"],
178
- "torch_dtype": str(TEACHER_MODEL_CONFIG["torch_dtype"]),
179
- "max_samples": MAX_TRAINING_SAMPLES,
180
- "codesearchnet_dataset": CODESEARCHNET_DATASET,
181
- },
182
- "stage": stage,
183
- "step": step,
184
- "timestamp": time.time(),
185
- "data": data,
186
- }
187
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
- def load_codesearchnet_dataset_with_resume(
190
- max_samples: int = MAX_TRAINING_SAMPLES,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  checkpoint_manager: BeamCheckpointManager | None = None,
192
  ) -> list[str]:
193
- """Load and format the sentence-transformers/codesearchnet dataset with resume capability."""
194
- logger.info(f"Loading CodeSearchNet dataset from {CODESEARCHNET_DATASET}")
 
 
 
195
  logger.info(f"Limiting to {max_samples} samples for training efficiency")
 
 
 
 
 
196
 
197
- # Check for existing dataset checkpoint with validation
198
  if checkpoint_manager:
199
  checkpoint_data = checkpoint_manager.load_checkpoint("dataset", 0)
200
  if checkpoint_data:
201
- if validate_checkpoint_compatibility(checkpoint_data):
202
- texts = checkpoint_data.get("data", {}).get("texts", [])
203
- if len(texts) >= max_samples:
204
- logger.info(f"✅ Resumed dataset loading: {len(texts)} texts from checkpoint")
205
- return texts[:max_samples]
206
- logger.info(f"📋 Partial dataset found: {len(texts)} texts, continuing from there")
207
- start_from = len(texts)
208
- else:
209
- logger.warning("🔄 Incompatible dataset checkpoint found, starting fresh")
210
- # Clean up incompatible checkpoint
211
- checkpoint_manager.cleanup_old_checkpoints("dataset", keep_latest=0)
212
- texts = []
213
- start_from = 0
214
- else:
215
- texts = []
216
- start_from = 0
217
- else:
218
- texts = []
219
- start_from = 0
220
 
221
  try:
222
- # Load the dataset
223
- dataset = load_dataset(CODESEARCHNET_DATASET, split="train", streaming=True)
 
 
224
 
225
- # Skip to where we left off
226
- dataset_iter = iter(dataset)
227
- for _ in range(start_from):
228
- try:
229
- next(dataset_iter)
230
- except StopIteration:
231
- break
232
 
233
- for i, example in enumerate(dataset_iter, start=start_from):
234
- if len(texts) >= max_samples:
 
 
 
 
235
  break
236
 
237
- comment = example.get("comment", "").strip()
238
- code = example.get("code", "").strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
- if comment and code and len(comment) > 10 and len(code) > 50:
241
- # Format as comment-code pair for training
242
- text = f"Comment: {comment}\nCode:\n{code}"
243
 
244
- # Ensure reasonable length
245
- if len(text) <= 2048: # Reasonable limit for embedding models
246
- texts.append(text)
247
 
248
- # Save checkpoint periodically
249
- if checkpoint_manager and (i + 1) % CHECKPOINT_INTERVAL == 0:
250
- checkpoint_data = create_checkpoint_data("dataset", {"texts": texts}, 0)
251
- checkpoint_manager.save_checkpoint("dataset", checkpoint_data, 0)
252
- logger.info(f"💾 Saved dataset checkpoint: {len(texts)} texts collected")
253
 
254
- if (i + 1) % 10000 == 0:
255
- logger.info(f"Processed {i + 1} examples, collected {len(texts)} valid pairs")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
  # Final checkpoint save
258
  if checkpoint_manager:
259
- checkpoint_data = create_checkpoint_data("dataset", {"texts": texts}, 0)
 
 
 
 
 
 
260
  checkpoint_manager.save_checkpoint("dataset", checkpoint_data, 0)
261
 
262
- logger.info(f"Successfully loaded {len(texts)} code-comment pairs from CodeSearchNet")
263
- return texts
264
 
265
  except Exception:
266
  logger.exception("Error loading CodeSearchNet dataset")
267
  return texts # Return what we have so far
268
 
269
 
270
- def generate_teacher_embeddings_with_checkpoints(
271
  teacher_model: SentenceTransformer,
272
  texts: list[str],
273
  checkpoint_manager: BeamCheckpointManager | None = None,
@@ -275,13 +514,11 @@ def generate_teacher_embeddings_with_checkpoints(
275
  """Generate teacher embeddings for code training with checkpoint support."""
276
  logger.info(f"Generating teacher embeddings for {len(texts)} texts...")
277
 
278
- # Check for existing embeddings checkpoint using torch.save format
279
- final_embeddings = None
280
-
281
  if checkpoint_manager:
282
- # Try to load complete embeddings tensor directly
283
- embeddings_path = Path(VOLUME_PATH) / "embeddings_cache.pt"
284
- config_path = Path(VOLUME_PATH) / "embeddings_config.json"
285
 
286
  if embeddings_path.exists() and config_path.exists():
287
  try:
@@ -289,118 +526,78 @@ def generate_teacher_embeddings_with_checkpoints(
289
  with config_path.open("r") as f:
290
  config_data = json.load(f)
291
 
292
- # Create a dummy checkpoint data structure for validation
293
- checkpoint_data = {
294
- "config_hash": config_data.get("config_hash"),
295
- "config": config_data.get("config", {}),
296
- }
297
-
298
- if validate_checkpoint_compatibility(checkpoint_data):
299
  # Load the embeddings tensor
300
  final_embeddings = torch.load(embeddings_path, map_location="cpu")
301
  num_expected = config_data.get("num_texts", len(texts))
302
 
303
  if final_embeddings.shape[0] >= num_expected:
304
- logger.info(
305
- f"✅ Loaded complete embeddings from cache ({final_embeddings.shape[0]} embeddings)"
306
- )
307
- return final_embeddings[: len(texts)] # Return only the needed amount
308
- logger.info(
309
- f"⚠️ Cached embeddings incomplete ({final_embeddings.shape[0]}/{num_expected}), regenerating"
310
- )
311
- final_embeddings = None
312
- else:
313
- logger.warning("🔄 Incompatible embeddings cache found, regenerating")
314
- final_embeddings = None
315
  except Exception as e:
316
  logger.warning(f"Failed to load embeddings cache: {e}, regenerating...")
317
- final_embeddings = None
318
-
319
- # If we have complete embeddings, return them
320
- if final_embeddings is not None:
321
- return final_embeddings
322
 
323
  # Generate embeddings from scratch
324
  logger.info("Generating fresh teacher embeddings...")
325
 
326
- # Use optimized batch size for large models with proper type casting
327
- batch_size_raw = TEACHER_MODEL_CONFIG["batch_size"]
328
- current_batch_size: int = batch_size_raw if isinstance(batch_size_raw, int) else 16
329
- logger.info(f"Using optimized batch size: {current_batch_size} for 40GB VRAM (7B model)")
330
-
331
  embeddings_list = []
332
 
333
- for i in range(0, len(texts), current_batch_size):
334
- batch_texts = texts[i : i + current_batch_size]
335
 
336
  try:
337
- # Use optimized encoding with convert_to_tensor=True for efficiency
338
  batch_embeddings = teacher_model.encode(
339
  batch_texts,
340
  convert_to_tensor=True,
341
- batch_size=current_batch_size,
342
- show_progress_bar=False, # Reduce overhead
343
- normalize_embeddings=True, # Pre-normalize for efficiency
344
  )
345
  embeddings_list.append(batch_embeddings)
346
 
347
- if i % (current_batch_size * 10) == 0:
348
  logger.info(f"Generated embeddings for {i + len(batch_texts)}/{len(texts)} texts")
349
 
350
  except torch.cuda.OutOfMemoryError:
351
- logger.warning(
352
- f"GPU OOM with batch size {current_batch_size}, reducing to {max(1, current_batch_size // 2)}"
353
- )
354
-
355
- # Clear cache and reduce batch size
356
- if torch.cuda.is_available():
357
- torch.cuda.empty_cache()
358
-
359
- current_batch_size = max(1, current_batch_size // 2)
360
 
361
  # Retry with smaller batch size
362
- batch_texts = texts[i : i + current_batch_size]
363
  batch_embeddings = teacher_model.encode(
364
  batch_texts,
365
  convert_to_tensor=True,
366
- batch_size=current_batch_size,
367
  show_progress_bar=False,
368
  normalize_embeddings=True,
369
  )
370
  embeddings_list.append(batch_embeddings)
371
 
372
- logger.info(f"Successfully processed batch with reduced size {current_batch_size}")
373
-
374
- # Combine all embeddings and force fp32 precision
375
  teacher_embeddings = torch.cat(embeddings_list, dim=0)
376
 
377
- # Ensure teacher embeddings are in fp32 for maximum quality
378
  if teacher_embeddings.dtype != torch.float32:
379
- logger.info(f"Converting teacher embeddings from {teacher_embeddings.dtype} to fp32")
380
  teacher_embeddings = teacher_embeddings.to(torch.float32)
381
 
382
  logger.info(f"Generated {teacher_embeddings.shape[0]} teacher embeddings in {teacher_embeddings.dtype}")
383
 
384
- # Save embeddings cache using torch.save for future runs
385
  if checkpoint_manager:
386
  try:
387
- embeddings_path = Path(VOLUME_PATH) / "embeddings_cache.pt"
388
- config_path = Path(VOLUME_PATH) / "embeddings_config.json"
 
389
 
390
  # Save embeddings tensor
391
  torch.save(teacher_embeddings, embeddings_path)
392
 
393
  # Save configuration
394
  config_data = {
395
- "config_hash": get_current_config_hash(),
396
- "config": {
397
- "model_name": MODEL_NAME,
398
- "pca_dims": PCA_DIMS,
399
- "precision": TEACHER_MODEL_CONFIG["precision"],
400
- "torch_dtype": str(TEACHER_MODEL_CONFIG["torch_dtype"]),
401
- "max_samples": MAX_TRAINING_SAMPLES,
402
- "codesearchnet_dataset": CODESEARCHNET_DATASET,
403
- },
404
  "num_texts": len(texts),
405
  "embedding_shape": list(teacher_embeddings.shape),
406
  "timestamp": time.time(),
@@ -417,890 +614,953 @@ def generate_teacher_embeddings_with_checkpoints(
417
  return teacher_embeddings
418
 
419
 
420
- def refine_with_code_training(
421
  student_model: Any,
422
- training_texts: list[str],
423
- teacher_embeddings: torch.Tensor,
424
- epochs: int = 2,
425
  checkpoint_manager: BeamCheckpointManager | None = None,
426
- model_manager: BeamModelManager | None = None,
427
  ) -> Any:
428
- """Refine the student model with code-specific training."""
429
- logger.info(f"Starting code specialization training for {epochs} epochs...")
430
 
431
- # Validate input parameters
432
- if student_model is None:
433
- logger.error("student_model is None - cannot proceed with code training")
434
- msg = "student_model cannot be None"
435
- raise ValueError(msg)
436
 
437
- if not hasattr(student_model, "embedding"):
438
- logger.error(f"student_model of type {type(student_model)} does not have 'embedding' attribute")
439
- msg = f"student_model must have 'embedding' attribute, got {type(student_model)}"
440
- raise ValueError(msg)
441
 
442
- logger.info(f"Student model type: {type(student_model)}")
443
- logger.info(f"Student model embedding shape: {student_model.embedding.shape}")
444
 
445
- try:
446
- # Force fp32 precision throughout for maximum quality
447
- target_dtype = torch.float32
448
- logger.info("🎯 Enforcing fp32 precision throughout for maximum quality")
449
 
450
- # Detect student model dtype for logging purposes
451
- student_dtype = student_model.embedding.dtype
452
- logger.info(f"Student model original embedding dtype: {student_dtype}")
453
 
454
- # Force teacher embeddings to fp32 if not already
455
- if teacher_embeddings.dtype != target_dtype:
456
- logger.info(f"Converting teacher embeddings from {teacher_embeddings.dtype} to {target_dtype}")
457
- teacher_embeddings = teacher_embeddings.to(target_dtype)
 
458
 
459
- # Get dimensions
460
- student_embedding_dim = student_model.embedding.shape[1]
461
- teacher_embedding_dim = teacher_embeddings.shape[1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
 
463
- logger.info(f"Student dims: {student_embedding_dim}, Teacher dims: {teacher_embedding_dim}")
 
 
464
 
465
- # Project teacher embeddings if needed with high-precision PCA
466
- if teacher_embedding_dim != student_embedding_dim:
467
- from sklearn.decomposition import PCA
468
 
469
- logger.info("Performing high-precision PCA projection for quality preservation...")
470
- pca = PCA(n_components=student_embedding_dim)
471
 
472
- # Use float64 for PCA computation to maximize precision
473
- teacher_embeddings_np = teacher_embeddings.cpu().numpy().astype(np.float64)
474
- teacher_embeddings_projected = pca.fit_transform(teacher_embeddings_np)
475
 
476
- # Convert back to fp32 (always use fp32, never fp16)
477
- teacher_embeddings = torch.tensor(
478
- teacher_embeddings_projected.astype(np.float32),
479
- dtype=target_dtype,
480
- )
481
- logger.info(f"PCA projection completed: {teacher_embeddings.shape} with dtype {target_dtype}")
482
- logger.info(
483
- f"PCA preserved variance ratio: {pca.explained_variance_ratio_[:5].sum():.4f} (first 5 components)"
484
- )
485
 
486
- # Create trainable model
487
- trainable_model = FinetunableStaticModel.from_static_model(
488
- model=student_model,
489
- out_dim=student_embedding_dim,
490
- )
491
 
492
- # Force ALL model parameters to fp32 to ensure no precision loss
493
- trainable_model = trainable_model.float()
 
494
 
495
- # Additional explicit conversion of embedding weights to fp32
496
- if hasattr(trainable_model, "embeddings") and hasattr(trainable_model.embeddings, "weight"):
497
- trainable_model.embeddings.weight.data = trainable_model.embeddings.weight.data.to(target_dtype)
498
 
499
- # Verify final model dtype after model2vec patch fix
500
- actual_model_dtype = None
501
- for param in trainable_model.parameters():
502
- actual_model_dtype = param.dtype
503
- break
504
 
505
- logger.info(f"Model parameter dtype: {actual_model_dtype}")
506
- logger.info(f"Embedding weight dtype: {trainable_model.embeddings.weight.dtype}")
507
 
508
- # Ensure teacher embeddings are definitely in fp32
509
- teacher_embeddings = teacher_embeddings.to(target_dtype)
510
- logger.info(f"Final teacher embeddings dtype: {teacher_embeddings.dtype}")
511
- logger.info(f"Final model parameter dtype: {actual_model_dtype}")
 
512
 
513
- # Verify we're using fp32 throughout
514
- if teacher_embeddings.dtype != target_dtype:
515
- logger.warning(f"⚠️ Teacher embeddings not in {target_dtype}: {teacher_embeddings.dtype}")
516
- if actual_model_dtype != target_dtype:
517
- logger.warning(f"⚠️ Model parameters not in {target_dtype}: {actual_model_dtype}")
518
 
519
- logger.info("✅ Confirmed fp32 precision throughout the training pipeline")
 
 
 
 
520
 
521
- # Tokenize texts
522
- tokenized_texts = []
523
- for text in training_texts:
524
- tokens = trainable_model.tokenize([text])
525
- if tokens.shape[1] > 0:
526
- tokenized_texts.append(tokens[0].tolist())
527
 
528
- # Prepare training data with explicit fp32 casting
529
- targets = teacher_embeddings[: len(tokenized_texts)]
530
 
531
- # Force targets to fp32 to maintain maximum precision
532
- targets = targets.to(target_dtype)
533
- logger.info(f"Cast targets to fp32: {targets.dtype}")
534
 
535
- train_texts, val_texts, train_targets, val_targets = train_test_split(
536
- tokenized_texts, targets, test_size=0.2, random_state=42
537
- )
538
 
539
- logger.info(f"Train targets dtype: {train_targets.dtype}")
540
- logger.info(f"Val targets dtype: {val_targets.dtype}")
541
 
542
- # Training setup
543
- train_dataset = TextDataset(train_texts, train_targets)
544
- val_dataset = TextDataset(val_texts, val_targets)
 
 
 
 
 
 
 
 
 
 
 
 
545
 
546
- optimizer = optim.Adam(trainable_model.parameters(), lr=LEARNING_RATE)
547
- mse_loss = nn.MSELoss()
 
548
 
549
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
550
 
551
- try:
552
- trainable_model = trainable_model.to(device)
553
- logger.info(f"Training on {device}")
554
- except torch.cuda.OutOfMemoryError:
555
- logger.warning("GPU OOM loading training model, using CPU")
556
- device = torch.device("cpu")
557
- trainable_model = trainable_model.to(device)
558
- if torch.cuda.is_available():
559
- torch.cuda.empty_cache()
560
 
561
- # Adaptive batch size for training
562
- adaptive_batch_size = BATCH_SIZE
 
 
 
 
 
 
563
 
564
- # Quality monitoring: compute embedding similarity before training
565
- logger.info("🔍 Quality monitoring: Computing pre-training teacher-student similarity...")
566
- trainable_model.eval()
567
- with torch.no_grad():
568
- # Take a small sample of texts for quality measurement
569
- sample_texts = training_texts[: min(5, len(training_texts))]
570
- sample_tokens = trainable_model.tokenize(sample_texts)
571
- sample_tokens = sample_tokens.to(device)
572
-
573
- _, student_embeddings_before = trainable_model(sample_tokens)
574
- sample_teacher_embeddings = targets[: len(sample_texts)].to(device)
575
-
576
- # Compute average cosine similarity
577
- similarities_before = []
578
- for i in range(len(sample_texts)):
579
- sim = torch.cosine_similarity(
580
- student_embeddings_before[i].unsqueeze(0),
581
- sample_teacher_embeddings[i].unsqueeze(0),
582
- ).item()
583
- similarities_before.append(sim)
584
-
585
- avg_similarity_before = np.mean(similarities_before)
586
- logger.info(f"📊 Pre-training average teacher-student similarity: {avg_similarity_before:.4f}")
587
-
588
- # Training loop with validation
589
- for epoch in range(epochs):
590
- # Training phase
591
- trainable_model.train()
592
-
593
- # Try with current batch size, reduce if OOM
594
- train_successful = False
595
- while not train_successful and adaptive_batch_size >= 1:
596
- try:
597
- train_loader = train_dataset.to_dataloader(shuffle=True, batch_size=adaptive_batch_size)
598
-
599
- epoch_loss = 0.0
600
- num_batches = 0
601
-
602
- for batch_idx, (tokens, targets_batch) in enumerate(train_loader):
603
- batch_tokens = tokens.to(device)
604
- batch_targets = targets_batch.to(device)
605
-
606
- optimizer.zero_grad()
607
- _, student_embeddings = trainable_model(batch_tokens)
608
-
609
- # Debug dtype information on first batch
610
- if batch_idx == 0:
611
- logger.info(
612
- f"Batch {batch_idx}: tokens shape {batch_tokens.shape}, dtype {batch_tokens.dtype}"
613
- )
614
- logger.info(
615
- f"Batch {batch_idx}: targets shape {batch_targets.shape}, dtype {batch_targets.dtype}"
616
- )
617
- logger.info(
618
- f"Batch {batch_idx}: student_embeddings shape {student_embeddings.shape}, dtype {student_embeddings.dtype}"
619
- )
620
-
621
- # Force both tensors to fp32 to avoid any precision loss
622
- if student_embeddings.dtype != target_dtype:
623
- logger.warning(
624
- f"Student embeddings not in fp32: {student_embeddings.dtype}, converting to fp32"
625
- )
626
- student_embeddings = student_embeddings.to(target_dtype)
627
- if batch_targets.dtype != target_dtype:
628
- logger.info(f"Converting targets from {batch_targets.dtype} to fp32")
629
- batch_targets = batch_targets.to(target_dtype)
630
-
631
- try:
632
- loss = mse_loss(student_embeddings, batch_targets)
633
- loss.backward()
634
- optimizer.step()
635
- except RuntimeError as e:
636
- if "expected scalar type" in str(e):
637
- logger.exception("Dtype mismatch error occurred:")
638
- logger.exception(
639
- f"student_embeddings: {student_embeddings.shape}, {student_embeddings.dtype}"
640
- )
641
- logger.exception(f"batch_targets: {batch_targets.shape}, {batch_targets.dtype}")
642
- logger.exception(
643
- f"MSE loss input dtypes: {student_embeddings.dtype} vs {batch_targets.dtype}"
644
- )
645
- # Force explicit casting to fp32 for maximum precision
646
- batch_targets = batch_targets.to(target_dtype)
647
- student_embeddings = student_embeddings.to(target_dtype)
648
- logger.info("Emergency dtype fix: forced both to fp32")
649
- loss = mse_loss(student_embeddings, batch_targets)
650
- loss.backward()
651
- optimizer.step()
652
- else:
653
- raise
654
-
655
- epoch_loss += loss.item()
656
- num_batches += 1
657
-
658
- # Save training checkpoint periodically
659
- if checkpoint_manager and batch_idx % 100 == 0:
660
- training_state = {
661
- "epoch": epoch,
662
- "batch": batch_idx,
663
- "model_state": trainable_model.state_dict(),
664
- "optimizer_state": optimizer.state_dict(),
665
- "loss": epoch_loss / max(1, num_batches),
666
- }
667
- checkpoint_data = create_checkpoint_data("training", training_state, epoch)
668
- checkpoint_manager.save_checkpoint("training", checkpoint_data, epoch)
669
-
670
- train_successful = True
671
-
672
- except torch.cuda.OutOfMemoryError:
673
- logger.warning(
674
- f"Training OOM with batch size {adaptive_batch_size}, reducing to {adaptive_batch_size // 2}"
675
- )
676
- adaptive_batch_size = max(1, adaptive_batch_size // 2)
677
- if torch.cuda.is_available():
678
- torch.cuda.empty_cache()
679
 
680
- if not train_successful:
681
- logger.error("Unable to train even with batch size 1, skipping training")
682
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
 
684
- avg_train_loss = epoch_loss / num_batches if num_batches > 0 else 0.0
685
 
686
- # Validation phase
687
- trainable_model.eval()
688
- val_loader = val_dataset.to_dataloader(shuffle=False, batch_size=adaptive_batch_size)
689
- val_loss = 0.0
690
- val_batches = 0
 
 
 
691
 
692
- with torch.no_grad():
693
- for tokens, targets_batch in val_loader:
694
- batch_tokens = tokens.to(device)
695
- batch_targets = targets_batch.to(device)
 
 
 
 
 
 
 
 
696
 
697
- _, student_embeddings = trainable_model(batch_tokens)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698
 
699
- # Force both tensors to fp32 to avoid any precision loss in validation
700
- if student_embeddings.dtype != target_dtype:
701
- student_embeddings = student_embeddings.to(target_dtype)
702
- if batch_targets.dtype != target_dtype:
703
- batch_targets = batch_targets.to(target_dtype)
 
 
704
 
705
- loss = mse_loss(student_embeddings, batch_targets)
706
- val_loss += loss.item()
707
- val_batches += 1
708
 
709
- avg_val_loss = val_loss / val_batches if val_batches > 0 else 0.0
 
 
710
 
711
- logger.info(
712
- f"Epoch {epoch + 1}/{epochs} - Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_val_loss:.6f}, Batch Size: {adaptive_batch_size}"
713
- )
714
 
715
- # Save epoch checkpoint
716
- if checkpoint_manager:
717
- epoch_state = {
718
- "epoch": epoch + 1,
719
- "model_state": trainable_model.state_dict(),
720
- "optimizer_state": optimizer.state_dict(),
721
- "train_loss": avg_train_loss,
722
- "val_loss": avg_val_loss,
 
 
 
 
 
723
  }
724
- checkpoint_data = create_checkpoint_data("epoch", epoch_state, epoch + 1)
725
- checkpoint_manager.save_checkpoint("epoch", checkpoint_data, epoch + 1)
726
-
727
- # Quality monitoring: compute embedding similarity after training
728
- logger.info("🔍 Quality monitoring: Computing post-training teacher-student similarity...")
729
- trainable_model.eval()
730
- with torch.no_grad():
731
- # Use the same sample texts as before
732
- sample_texts = training_texts[: min(5, len(training_texts))]
733
- sample_tokens = trainable_model.tokenize(sample_texts)
734
- sample_tokens = sample_tokens.to(device)
735
-
736
- _, student_embeddings_after = trainable_model(sample_tokens)
737
- sample_teacher_embeddings = targets[: len(sample_texts)].to(device)
738
-
739
- # Compute average cosine similarity
740
- similarities_after = []
741
- for i in range(len(sample_texts)):
742
- sim = torch.cosine_similarity(
743
- student_embeddings_after[i].unsqueeze(0),
744
- sample_teacher_embeddings[i].unsqueeze(0),
745
- ).item()
746
- similarities_after.append(sim)
747
-
748
- avg_similarity_after = np.mean(similarities_after)
749
- logger.info(f"📊 Post-training average teacher-student similarity: {avg_similarity_after:.4f}")
750
-
751
- # Quality assessment
752
- quality_change = avg_similarity_after - avg_similarity_before
753
- logger.info(f"📈 Quality change: {quality_change:+.4f}")
754
-
755
- if abs(quality_change) < 0.01:
756
- logger.info("✅ Quality well preserved during training!")
757
- elif quality_change > 0:
758
- logger.info("✅ Quality improved during training!")
759
- else:
760
- logger.warning(f"⚠️ Quality degraded by {abs(quality_change):.4f} during training")
761
 
762
- # Convert back to static model
763
- refined_model = trainable_model.to_static_model()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
764
 
765
- # Save final refined model to beam volume
766
- if model_manager:
767
- # Save to temporary local directory first
768
- temp_refined_path = Path("./temp_refined_save")
769
- temp_refined_path.mkdir(exist_ok=True)
770
- refined_model.save_pretrained(str(temp_refined_path))
771
 
772
- # Upload to beam volume
773
- model_manager.save_model("refined_model", str(temp_refined_path))
 
774
 
775
- # Clean up temp directory
776
- import shutil
 
 
 
 
 
 
 
777
 
778
- shutil.rmtree(temp_refined_path, ignore_errors=True)
 
 
 
 
 
 
 
 
 
779
 
780
- logger.info("💾 Saved refined model to beam volume")
781
 
782
- logger.info("Code specialization training completed")
783
- return refined_model
 
 
 
 
 
 
 
784
 
785
  except Exception as e:
786
- logger.warning(f"Code training failed: {e}")
787
- return student_model
 
 
 
 
 
788
 
789
 
790
- def apply_regularization(model: Any, weight: float = 0.01) -> Any:
791
- """Apply light regularization with overflow protection."""
792
- # Validate input
793
- if model is None:
794
- logger.error("Cannot apply regularization: model is None")
795
- msg = "model cannot be None"
796
- raise ValueError(msg)
797
 
798
- if not hasattr(model, "embedding"):
799
- logger.error(f"Cannot apply regularization: model of type {type(model)} does not have 'embedding' attribute")
800
- msg = f"model must have 'embedding' attribute, got {type(model)}"
801
- raise ValueError(msg)
802
 
803
- logger.info(f"Applying regularization to model of type: {type(model)}")
 
 
 
 
 
 
 
804
 
805
- try:
806
- embeddings = model.embedding.copy()
807
 
808
- # Check for extreme values and clip if necessary
809
- max_val = np.abs(embeddings).max()
810
- if max_val > 1e6: # Clip extremely large values
811
- logger.warning(f"Large embedding values detected (max: {max_val:.2e}), clipping to prevent overflow")
812
- embeddings = np.clip(embeddings, -1e6, 1e6)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
813
 
814
- # Apply regularization
815
- regularized_embeddings = embeddings * (1.0 - weight)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816
 
817
- # Stable normalization to prevent overflow
818
- norms = np.linalg.norm(regularized_embeddings, axis=1, keepdims=True)
 
 
 
819
 
820
- # Handle zero norms and potential overflow
821
- norms = np.where(norms == 0, 1, norms)
822
- norms = np.where(norms > 1e6, 1e6, norms) # Prevent extremely large norms
823
 
824
- regularized_embeddings = regularized_embeddings / norms
825
 
826
- # Create new model
827
- from model2vec.model import StaticModel
828
 
829
- regularized_model = StaticModel(
830
- vectors=regularized_embeddings,
831
- tokenizer=model.tokenizer,
832
- config=model.config,
833
- base_model_name=model.base_model_name,
834
- language=model.language,
835
- normalize=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
836
  )
837
 
838
- logger.info("Regularization applied successfully")
839
- return regularized_model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840
 
841
  except Exception as e:
842
- logger.warning(f"Regularization failed: {e}")
843
- return model
 
 
 
 
 
 
 
 
844
 
845
 
846
- def load_teacher_model_with_cache(
847
- model_name: str,
848
- output_dir: str,
849
- device: str = "cuda",
850
- resume: bool = True,
851
- ) -> SentenceTransformer:
852
- """Load teacher model with local caching to avoid re-downloading."""
853
- cache_dir = Path(output_dir) / "teacher_model_cache"
854
-
855
- # Check if cached model exists
856
- if resume and cache_dir.exists():
857
- try:
858
- logger.info(f"Loading cached teacher model from {cache_dir}")
859
- teacher_model = SentenceTransformer(str(cache_dir), device=device)
860
 
861
- # Set optimized sequence length
862
- max_seq_len = TEACHER_MODEL_CONFIG.get("max_seq_length", 8192)
863
- if isinstance(max_seq_len, int):
864
- teacher_model.max_seq_length = max_seq_len
865
 
866
- logger.info("Successfully loaded cached teacher model")
867
- return teacher_model
868
- except Exception as e:
869
- logger.warning(f"Failed to load cached teacher model: {e}")
870
- logger.info("Will download fresh model")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
871
 
872
- # Download and cache the model
873
- logger.info(f"Downloading teacher model {model_name} (this may take a while)")
874
 
875
- # Prepare model kwargs with flash attention
876
- model_kwargs = {
877
- "torch_dtype": TEACHER_MODEL_CONFIG["torch_dtype"],
878
- "device_map": TEACHER_MODEL_CONFIG["device_map"],
 
 
 
 
 
 
 
879
  }
880
 
881
- # Try to add flash attention if available
882
- if TEACHER_MODEL_CONFIG.get("use_flash_attention", False):
883
- try:
884
- model_kwargs["attn_implementation"] = TEACHER_MODEL_CONFIG["attn_implementation"]
885
- logger.info("Flash Attention 2 enabled")
886
- except Exception as e:
887
- logger.warning(f"Flash Attention not available, using default attention: {e}")
888
 
889
- try:
890
- teacher_model = SentenceTransformer(
891
- model_name,
892
- device=device,
893
- trust_remote_code=bool(TEACHER_MODEL_CONFIG["trust_remote_code"]),
894
- model_kwargs=model_kwargs,
895
- )
896
- except ImportError as e:
897
- if "flash_attn" in str(e):
898
- logger.warning("Flash Attention 2 not available, falling back to default attention")
899
- # Remove flash attention from model_kwargs and retry
900
- model_kwargs_fallback = {k: v for k, v in model_kwargs.items() if k != "attn_implementation"}
901
- teacher_model = SentenceTransformer(
902
- model_name,
903
- device=device,
904
- trust_remote_code=bool(TEACHER_MODEL_CONFIG["trust_remote_code"]),
905
- model_kwargs=model_kwargs_fallback,
906
- )
907
- else:
908
- raise
909
 
910
- # Set optimized sequence length
911
- max_seq_len = TEACHER_MODEL_CONFIG.get("max_seq_length", 8192)
912
- if isinstance(max_seq_len, int):
913
- teacher_model.max_seq_length = max_seq_len
914
- logger.info(f"Set max_seq_length to {max_seq_len} for better performance")
915
 
916
- # Cache the model for future use
 
 
917
  try:
918
- cache_dir.mkdir(parents=True, exist_ok=True)
919
- teacher_model.save(str(cache_dir))
920
- logger.info(f"Cached teacher model to {cache_dir}")
921
  except Exception as e:
922
- logger.warning(f"Failed to cache teacher model: {e}")
923
- # Continue without caching
924
 
925
- return teacher_model
926
 
927
 
928
- def code_specialized_distillation(
929
- model_name: str = MODEL_NAME,
930
- output_dir: str = OUTPUT_DIR,
931
- pca_dims: int = PCA_DIMS,
932
- max_samples: int = MAX_TRAINING_SAMPLES,
933
- resume: bool = True,
934
  ) -> Any:
935
- """Main code-specialized distillation function using CodeSearchNet dataset with checkpoint support."""
936
  output_path = Path(output_dir)
937
  output_path.mkdir(parents=True, exist_ok=True)
938
 
939
- # Initialize Beam utilities
940
- volume_mgr, checkpoint_mgr, model_mgr, eval_mgr = create_beam_utilities(VOLUME_NAME, VOLUME_PATH)
941
-
942
- logger.info(f"Starting code-specialized distillation of {model_name}")
943
- logger.info(f"Using CodeSearchNet dataset: {CODESEARCHNET_DATASET}")
944
- logger.info(f"Resume mode: {resume}")
945
-
946
- # GPU Diagnostics
947
- logger.info("=== GPU DIAGNOSTICS ===")
948
- logger.info(f"CUDA available: {torch.cuda.is_available()}")
949
- if torch.cuda.is_available():
950
- logger.info(f"CUDA version: {torch.version.cuda}")
951
- logger.info(f"GPU count: {torch.cuda.device_count()}")
952
- for i in range(torch.cuda.device_count()):
953
- gpu_name = torch.cuda.get_device_name(i)
954
- gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
955
- logger.info(f"GPU {i}: {gpu_name} ({gpu_memory:.1f} GB)")
956
-
957
- # Current GPU memory
958
- current_device = torch.cuda.current_device()
959
- allocated = torch.cuda.memory_allocated(current_device) / 1024**3
960
- total = torch.cuda.get_device_properties(current_device).total_memory / 1024**3
961
- logger.info(f"Current GPU {current_device}: {allocated:.2f}GB allocated, {total:.1f}GB total")
962
- else:
963
- logger.warning("CUDA not available - will use CPU (much slower)")
964
- logger.info("======================")
965
 
966
  start_time = time.time()
967
 
968
- # Step 1: Basic Model2Vec distillation with checkpoint support
969
- logger.info("Step 1: Basic Model2Vec distillation...")
 
 
970
 
971
- # Check for existing distilled model in beam volume
972
- m2v_model = None
973
- if resume:
974
- # Check if model files exist directly in the volume root
975
- try:
976
- # Try to load from the volume root where the model was successfully saved
977
- volume_root_path = Path(VOLUME_PATH)
978
- if (volume_root_path / "config.json").exists() and (volume_root_path / "model.safetensors").exists():
979
- logger.info("✅ Found existing model files in volume root")
980
- from model2vec.model import StaticModel
981
 
982
- m2v_model = StaticModel.from_pretrained(str(volume_root_path))
983
- logger.info("✅ Successfully loaded existing distilled model from volume")
984
- else:
985
- logger.info("No existing model files found in volume root")
986
- except Exception as e:
987
- logger.warning(f"Failed to load existing model from volume: {e}")
988
- m2v_model = None
989
 
990
- if m2v_model is None:
991
- # Clear GPU cache before starting
992
- if torch.cuda.is_available():
993
- torch.cuda.empty_cache()
994
- current_device = torch.cuda.current_device()
995
- allocated = torch.cuda.memory_allocated(current_device) / 1024**3
996
- total = torch.cuda.get_device_properties(current_device).total_memory / 1024**3
997
- logger.info(f"GPU memory before distillation: {allocated:.2f}GB allocated / {total:.1f}GB total")
998
- else:
999
- logger.info("Using CPU for distillation")
1000
 
1001
- try:
1002
- m2v_model = distill(
1003
- model_name=model_name,
1004
- pca_dims=pca_dims,
1005
- apply_zipf=None,
1006
- sif_coefficient=1e-4,
1007
  trust_remote_code=True,
 
 
1008
  )
1009
- logger.info("Basic distillation completed with preserved precision")
1010
 
1011
- # Validate the distilled model
1012
- if m2v_model is None:
1013
- msg = "Distillation returned None - this should not happen"
1014
- raise ValueError(msg) from None
1015
-
1016
- logger.info(f"Distilled model type: {type(m2v_model)}")
1017
- logger.info(f"Distilled model has embedding attribute: {hasattr(m2v_model, 'embedding')}")
 
1018
 
1019
- # Save the base distilled model - DISABLED due to recursive directory bug
1020
- # model_mgr.save_model("base_distilled_model", str(output_path))
1021
 
1022
- except torch.cuda.OutOfMemoryError:
1023
- logger.warning("GPU OOM during distillation, clearing cache and retrying...")
1024
- torch.cuda.empty_cache()
1025
 
1026
- # Force CPU-only distillation if GPU fails
1027
- os.environ["CUDA_VISIBLE_DEVICES"] = ""
1028
 
1029
- logger.info("Retrying distillation on CPU...")
1030
- m2v_model = distill(
1031
- model_name=model_name,
1032
- pca_dims=pca_dims,
1033
- apply_zipf=None,
1034
- sif_coefficient=1e-4,
1035
  trust_remote_code=True,
 
1036
  )
1037
- logger.info("Basic distillation completed on CPU")
1038
 
1039
- # Validate the distilled model
1040
- if m2v_model is None:
1041
- msg = "CPU distillation returned None - this should not happen"
1042
- raise ValueError(msg) from None
1043
 
1044
- logger.info(f"CPU distilled model type: {type(m2v_model)}")
1045
- logger.info(f"CPU distilled model has embedding attribute: {hasattr(m2v_model, 'embedding')}")
 
1046
 
1047
- # Save the base distilled model - DISABLED due to recursive directory bug
1048
- # model_mgr.save_model("base_distilled_model", str(output_path))
1049
 
1050
- except Exception:
1051
- logger.exception("Distillation failed with error")
1052
- raise
1053
 
1054
- # Validate m2v_model before proceeding
1055
- if m2v_model is None:
1056
- msg = "m2v_model is None after distillation step - cannot proceed"
1057
- raise ValueError(msg)
1058
 
1059
- # Step 2: Load CodeSearchNet training data with resume
1060
- logger.info("Step 2: Loading CodeSearchNet training data...")
1061
- code_texts = load_codesearchnet_dataset_with_resume(max_samples, checkpoint_mgr)
1062
 
1063
- if not code_texts:
1064
- logger.warning("No code training data available, skipping code specialization")
1065
- else:
1066
- logger.info("Step 3: Code specialization training...")
1067
-
1068
- # Check for existing refined model
1069
- if resume:
1070
- # Check if refined model exists in beam volume
1071
- models = model_mgr.list_models()
1072
- refined_model_exists = any(model["name"] == "refined_model" for model in models)
1073
-
1074
- if refined_model_exists:
1075
- # Download model to local path for loading
1076
- temp_model_path = Path("./temp_refined_model")
1077
- if model_mgr.load_model("refined_model", temp_model_path):
1078
- try:
1079
- from model2vec.model import StaticModel
1080
-
1081
- refined_model = StaticModel.from_pretrained(str(temp_model_path / "refined_model"))
1082
- logger.info("✅ Resumed from existing refined model")
1083
- m2v_model = refined_model
1084
- # Clean up temp directory
1085
- import shutil
1086
-
1087
- shutil.rmtree(temp_model_path, ignore_errors=True)
1088
- except Exception as e:
1089
- logger.warning(f"Failed to load existing refined model: {e}")
1090
- refined_model = None
1091
- # Clean up temp directory
1092
- import shutil
1093
-
1094
- shutil.rmtree(temp_model_path, ignore_errors=True)
1095
- else:
1096
- refined_model = None
1097
- else:
1098
- refined_model = None
1099
-
1100
- if refined_model is None:
1101
- # Load teacher model with memory management
1102
- try:
1103
- device = "cuda" if torch.cuda.is_available() else "cpu"
1104
- logger.info(f"Loading teacher model on {device} with optimized settings")
1105
- logger.info(
1106
- f"Using precision: {TEACHER_MODEL_CONFIG['precision']}, batch_size: {TEACHER_MODEL_CONFIG['batch_size']}"
1107
- )
1108
- logger.info("Attempting to enable Flash Attention 2 for maximum performance")
1109
 
1110
- teacher_model = load_teacher_model_with_cache(model_name, output_dir, device=device, resume=resume)
1111
 
1112
- # Generate teacher embeddings with checkpoints
1113
- teacher_embeddings = generate_teacher_embeddings_with_checkpoints(
1114
- teacher_model, code_texts, checkpoint_mgr
1115
- )
1116
 
1117
- # Refine with code training
1118
- m2v_model = refine_with_code_training(
1119
- m2v_model,
1120
- code_texts,
1121
- teacher_embeddings,
1122
- epochs=TRAINING_EPOCHS,
1123
- checkpoint_manager=checkpoint_mgr,
1124
- model_manager=model_mgr,
1125
- )
1126
 
1127
- del teacher_model
1128
- if torch.cuda.is_available():
1129
- torch.cuda.empty_cache()
1130
-
1131
- except torch.cuda.OutOfMemoryError:
1132
- logger.warning("GPU OOM during code training, falling back to CPU...")
1133
-
1134
- if torch.cuda.is_available():
1135
- torch.cuda.empty_cache()
1136
-
1137
- # Force CPU for teacher model with optimized settings (no flash attention on CPU)
1138
- try:
1139
- teacher_model = load_teacher_model_with_cache(
1140
- model_name, output_dir, device="cpu", resume=resume
1141
- )
1142
- except ImportError as e:
1143
- if "flash_attn" in str(e):
1144
- logger.warning("Flash Attention 2 not available on CPU, using default attention")
1145
- # Fallback without any special attention implementation
1146
- teacher_model = load_teacher_model_with_cache(
1147
- model_name, output_dir, device="cpu", resume=resume
1148
- )
1149
- else:
1150
- raise
1151
-
1152
- # Generate teacher embeddings on CPU with checkpoints
1153
- teacher_embeddings = generate_teacher_embeddings_with_checkpoints(
1154
- teacher_model, code_texts, checkpoint_mgr
1155
- )
1156
 
1157
- # Refine with code training on CPU
1158
- m2v_model = refine_with_code_training(
1159
- m2v_model,
1160
- code_texts,
1161
- teacher_embeddings,
1162
- epochs=TRAINING_EPOCHS,
1163
- checkpoint_manager=checkpoint_mgr,
1164
- model_manager=model_mgr,
1165
- )
1166
 
1167
- del teacher_model
1168
- else:
1169
- # Fresh training without resume
1170
- try:
1171
- device = "cuda" if torch.cuda.is_available() else "cpu"
1172
- logger.info(f"Loading teacher model on {device} with optimized settings")
1173
- logger.info(
1174
- f"Using precision: {TEACHER_MODEL_CONFIG['precision']}, batch_size: {TEACHER_MODEL_CONFIG['batch_size']}"
1175
- )
1176
- logger.info("Attempting to enable Flash Attention 2 for maximum performance")
1177
 
1178
- teacher_model = load_teacher_model_with_cache(model_name, output_dir, device=device, resume=resume)
1179
 
1180
- # Generate teacher embeddings with checkpoints
1181
- teacher_embeddings = generate_teacher_embeddings_with_checkpoints(
1182
- teacher_model, code_texts, checkpoint_mgr
1183
- )
1184
 
1185
- # Refine with code training
1186
- m2v_model = refine_with_code_training(
1187
- m2v_model,
1188
- code_texts,
1189
- teacher_embeddings,
1190
- epochs=TRAINING_EPOCHS,
1191
- checkpoint_manager=checkpoint_mgr,
1192
- model_manager=model_mgr,
1193
- )
1194
 
1195
- del teacher_model
1196
- if torch.cuda.is_available():
1197
- torch.cuda.empty_cache()
1198
 
1199
- except torch.cuda.OutOfMemoryError:
1200
- logger.warning("GPU OOM during code training, falling back to CPU...")
 
 
1201
 
1202
- if torch.cuda.is_available():
1203
- torch.cuda.empty_cache()
1204
 
1205
- # Force CPU for teacher model with optimized settings (no flash attention on CPU)
1206
- try:
1207
- teacher_model = load_teacher_model_with_cache(model_name, output_dir, device="cpu", resume=resume)
1208
- except ImportError as e:
1209
- if "flash_attn" in str(e):
1210
- logger.warning("Flash Attention 2 not available on CPU, using default attention")
1211
- # Fallback without any special attention implementation
1212
- teacher_model = load_teacher_model_with_cache(
1213
- model_name, output_dir, device="cpu", resume=resume
1214
- )
1215
- else:
1216
- raise
1217
-
1218
- # Generate teacher embeddings on CPU with checkpoints
1219
- teacher_embeddings = generate_teacher_embeddings_with_checkpoints(
1220
- teacher_model, code_texts, checkpoint_mgr
1221
- )
1222
 
1223
- # Refine with code training on CPU
1224
- m2v_model = refine_with_code_training(
1225
- m2v_model,
1226
- code_texts,
1227
- teacher_embeddings,
1228
- epochs=TRAINING_EPOCHS,
1229
- checkpoint_manager=checkpoint_mgr,
1230
- model_manager=model_mgr,
1231
- )
1232
 
1233
- del teacher_model
 
 
1234
 
1235
- # Step 4: Light regularization
1236
- logger.info("Step 4: Applying regularization...")
1237
- m2v_model = apply_regularization(m2v_model, REGULARIZATION_WEIGHT)
 
1238
 
1239
- # Save final model
1240
- logger.info("Saving code-specialized model...")
1241
 
1242
- # Final validation before saving
1243
- if m2v_model is None:
1244
- msg = "Cannot save model: m2v_model is None"
1245
- raise ValueError(msg)
1246
 
1247
- if not hasattr(m2v_model, "save_pretrained"):
1248
- msg = f"Cannot save model: m2v_model of type {type(m2v_model)} does not have save_pretrained method"
1249
- raise ValueError(msg)
1250
 
1251
- logger.info(f"Final model type: {type(m2v_model)}")
1252
- logger.info(f"Final model has embedding attribute: {hasattr(m2v_model, 'embedding')}")
1253
 
1254
- m2v_model.save_pretrained(str(output_path))
1255
 
1256
- # Save final model to beam volume as well - DISABLED due to recursive directory bug
1257
- # model_mgr.save_model("final_model", str(output_path))
 
1258
 
1259
- total_time = time.time() - start_time
1260
- logger.info(f"Code-specialized distillation completed in {total_time:.2f} seconds")
1261
 
1262
- return m2v_model
 
1263
1264
 
1265
- @function(
1266
- gpu=GPU_NAME,
1267
- volumes=[Volume(name=VOLUME_NAME, mount_path=VOLUME_PATH)],
1268
- image=IMAGE,
1269
- secrets=["HF_ACCESS_TOKEN"],
1270
- env={
1271
- "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True,max_split_size_mb:512",
1272
- "TOKENIZERS_PARALLELISM": "false",
1273
- "CUDA_LAUNCH_BLOCKING": "0", # Allow async CUDA operations
1274
- "TORCH_CUDNN_V8_API_ENABLED": "1", # Enable optimized cuDNN
1275
- "OMP_NUM_THREADS": "8", # Limit CPU threads for better GPU utilization
1276
- },
1277
- timeout=3600 * 12, # 12 hours
1278
- )
1279
- def beam_code_distillation(
1280
- model_name: str = MODEL_NAME,
1281
- output_dir: str = OUTPUT_DIR,
1282
- pca_dims: int = PCA_DIMS,
1283
- max_samples: int = MAX_TRAINING_SAMPLES,
1284
- resume: bool = True,
1285
- ) -> Any:
1286
- # Apply all patches from the patches directory
1287
- try:
1288
- from .patch_utils import apply_all_patches
1289
 
1290
- logger.info("Applying all patches from patches directory...")
1291
- patches_applied = apply_all_patches()
1292
- logger.info(f"Successfully applied {patches_applied} patches")
1293
- except Exception as e:
1294
- logger.warning(f"Failed to apply patches: {e}. Continuing without patches.")
1295
-
1296
- return code_specialized_distillation(
1297
- model_name=model_name,
1298
- output_dir=output_dir,
1299
- pca_dims=pca_dims,
1300
- max_samples=max_samples,
1301
- resume=resume,
1302
- )
1303
 
1304
 
1305
  if __name__ == "__main__":
1306
- code_specialized_distillation()
 
1
  """
2
+ Unified Code-Specialized Model2Vec Distillation Script.
3
 
4
+ This script provides a unified approach for creating code-specialized embeddings
5
+ using Model2Vec distillation with optional code-specific training.
6
 
7
  Features:
8
+ - Basic distillation (default): Simple Model2Vec distillation
9
+ - Advanced training (--train flag): Additional CodeSearchNet fine-tuning
10
+ - Checkpoint support with Beam sync utilities
11
+ - Multi-teacher model processing
12
+ - Smart resume capabilities
13
+ - Hierarchical storage: base → final
14
+
15
+ Directory Structure:
16
+ - code_model2vec/base: Basic distilled models (first step)
17
+ - code_model2vec/final: Final models (copied from base or after training)
18
+
19
+ Usage:
20
+ distiller distill [--use-beam] [--train] # Basic distillation or with training
21
  """
22
 
23
  import json
24
  import logging
 
25
  import time
26
  from pathlib import Path
27
+ from typing import Annotated, Any
28
 
29
  import numpy as np
30
  import torch
31
+ import typer
32
+ from beam import Volume, function
33
  from datasets import load_dataset
34
  from model2vec.distill import distill
35
  from model2vec.train.base import FinetunableStaticModel, TextDataset
 
39
 
40
  from .beam_utils import (
41
  BeamCheckpointManager,
 
42
  create_beam_utilities,
43
+ download_model_from_beam,
44
+ sync_checkpoints_from_beam,
45
+ sync_checkpoints_to_beam,
46
+ upload_model_to_beam,
47
+ )
48
+ from .config import (
49
+ BEAM_ENV_SETTINGS,
50
+ GPU_NAME,
51
+ IMAGE,
52
+ codesearchnet_config,
53
+ directories,
54
+ distillation_config,
55
+ get_volume_config,
56
+ languages_config,
57
  )
58
 
59
  # =============================================================================
60
+ # CONFIGURATION
61
  # =============================================================================
62
 
63
+ VOLUME_CONFIG = get_volume_config()
64
+ LOCAL_BASE_DIR = directories.base
65
+ LOCAL_FINAL_DIR = directories.final
66
+
67
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
68
+ logger = logging.getLogger(__name__)
69
+
70
+ # Teacher models for distillation
71
+ DEFAULT_TEACHER_MODELS = list(distillation_config.code_teacher_models)
72
 
73
  # =============================================================================
74
+ # UTILITY FUNCTIONS
75
  # =============================================================================
76
77
 
78
+ def apply_local_patches() -> bool:
79
+ """Apply patches locally without requiring Beam utilities."""
80
+ try:
81
+ try:
82
+ from .patch_utils import apply_all_patches
83
 
84
+ patches_applied = apply_all_patches()
85
+ logger.info(f"Successfully applied {patches_applied} patches via patch_utils")
86
+ return True
87
+ except ImportError:
88
+ logger.warning("patch_utils not available, skipping patches")
89
 
90
+ return False
91
+
92
+ except Exception as e:
93
+ logger.warning(f"Failed to apply patches: {e}")
94
+ return False
95
+
96
+
97
+ def get_current_config_hash(enable_training: bool) -> str:
98
  """Generate a hash of current configuration parameters for checkpoint validation."""
99
  import hashlib
100
 
101
  config_params = {
102
+ "pca_dims": distillation_config.optimal_pca_dims,
103
+ "sif_coefficient": distillation_config.sif_coefficient,
104
+ "apply_zipf": distillation_config.apply_zipf,
105
+ "enable_training": enable_training,
 
 
106
  }
107
 
108
+ if enable_training:
109
+ config_params.update(
110
+ {
111
+ "training_epochs": distillation_config.training_epochs,
112
+ "learning_rate": distillation_config.learning_rate,
113
+ "max_samples": distillation_config.max_training_samples,
114
+ }
115
+ )
116
+
117
  config_str = str(sorted(config_params.items()))
118
  return hashlib.md5(config_str.encode()).hexdigest()[:12] # noqa: S324
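A minimal standalone sketch of how this hash is meant to gate checkpoint reuse; the parameter values shown are placeholders, not the actual `distillation_config` settings.

import hashlib

def config_hash(params: dict) -> str:
    # Same recipe as get_current_config_hash: deterministic ordering, short md5 digest.
    return hashlib.md5(str(sorted(params.items())).encode()).hexdigest()[:12]

base = {"pca_dims": 256, "sif_coefficient": 1e-4, "apply_zipf": True, "enable_training": False}
saved = config_hash(base)
current = config_hash({**base, "enable_training": True})
# A checkpoint is only reused when the stored hash matches the current one,
# so toggling --train invalidates checkpoints produced without training.
assert saved != current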
119
 
120
 
121
+ def check_existing_base_model(teacher_name: str) -> str | None:
122
+ """Check if base distilled model already exists locally."""
123
+ base_dir = Path(LOCAL_BASE_DIR)
124
+ model_dir = base_dir / f"code_model2vec_{teacher_name}"
125
+
126
+ if model_dir.exists():
127
+ # Check for essential model files
128
+ has_config = (model_dir / "config.json").exists()
129
+ has_model_file = any(
130
+ [
131
+ (model_dir / "model.safetensors").exists(),
132
+ (model_dir / "model.bin").exists(),
133
+ (model_dir / "pytorch_model.bin").exists(),
134
+ ]
135
+ )
136
 
137
+ if has_config and has_model_file:
138
+ logger.info(f"✅ Found existing base model: {teacher_name}")
139
+ return str(model_dir)
140
 
141
+ return None
142
+
143
+
144
+ def check_existing_final_model(teacher_name: str, enable_training: bool = False) -> str | None:
145
+ """Check if final model already exists locally."""
146
+ final_dir = Path(LOCAL_FINAL_DIR)
147
+
148
+ # Add suffix for trained models
149
+ model_name = f"code_model2vec_{teacher_name}"
150
+ if enable_training:
151
+ model_name += "_fine_tuned"
152
+ model_dir = final_dir / model_name
153
+
154
+ if model_dir.exists():
155
+ # Check for essential model files
156
+ has_config = (model_dir / "config.json").exists()
157
+ has_model_file = any(
158
+ [
159
+ (model_dir / "model.safetensors").exists(),
160
+ (model_dir / "model.bin").exists(),
161
+ (model_dir / "pytorch_model.bin").exists(),
162
+ ]
163
+ )
164
+
165
+ if has_config and has_model_file:
166
+ logger.info(f"✅ Found existing final model: {teacher_name}{'_fine_tuned' if enable_training else ''}")
167
+ return str(model_dir)
168
 
169
+ return None
170
+
171
+
172
+ def copy_base_to_final(teacher_name: str, enable_training: bool = False) -> bool:
173
+ """Copy base model to final directory."""
174
+ import shutil
175
+
176
+ base_path = Path(LOCAL_BASE_DIR) / f"code_model2vec_{teacher_name}"
177
+
178
+ # Add suffix for trained models
179
+ final_model_name = f"code_model2vec_{teacher_name}"
180
+ if enable_training:
181
+ final_model_name += "_fine_tuned"
182
+ final_path = Path(LOCAL_FINAL_DIR) / final_model_name
183
+
184
+ try:
185
+ final_path.parent.mkdir(parents=True, exist_ok=True)
186
+ if final_path.exists():
187
+ shutil.rmtree(final_path)
188
+ shutil.copytree(base_path, final_path)
189
+ logger.info(f"📁 Copied {teacher_name} from base to final{'_fine_tuned' if enable_training else ''}")
190
+ return True
191
+ except Exception:
192
+ logger.exception(f"❌ Failed to copy {teacher_name} to final{'_fine_tuned' if enable_training else ''}")
193
  return False
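For reference, a small illustration of the directory naming convention these helpers rely on; the root directories actually come from `directories.base` / `directories.final`, and the teacher id is only an example.

from pathlib import Path

teacher = "BAAI/bge-code-v1"                              # example teacher model id
teacher_name = teacher.split("/")[-1].replace("-", "_")   # -> "bge_code_v1"
base = Path("code_model2vec/base") / f"code_model2vec_{teacher_name}"
final = Path("code_model2vec/final") / f"code_model2vec_{teacher_name}_fine_tuned"  # suffix added only when --train is used
print(base)
print(final)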
194
 
 
 
195
 
196
+ def sync_model_from_beam(
197
+ teacher_name: str,
198
+ target_dir: str,
199
+ use_beam_utilities: bool = False,
200
+ ) -> bool:
201
+ """Sync model from Beam volume to local directory."""
202
+ if not use_beam_utilities:
203
  return False
204
 
205
+ try:
206
+ target_path = Path(target_dir)
207
+ target_path.mkdir(parents=True, exist_ok=True)
208
+
209
+ beam_model_name = f"{teacher_name}_model"
210
+ success = download_model_from_beam(VOLUME_CONFIG.name, beam_model_name, str(target_path))
211
+
212
+ if success:
213
+ logger.info(f"📥 Synced {teacher_name} from Beam to {target_dir}")
214
+ return True
215
+ logger.warning(f"⚠️ Failed to sync {teacher_name} from Beam")
216
  return False
217
 
218
+ except Exception as e:
219
+ logger.warning(f"Failed to sync {teacher_name} from Beam: {e}")
220
+ return False
221
+
222
+
223
+ def sync_model_to_beam(
224
+ teacher_name: str,
225
+ source_dir: str,
226
+ use_beam_utilities: bool = False,
227
+ ) -> bool:
228
+ """Sync model from local directory to Beam volume."""
229
+ if not use_beam_utilities:
230
+ return False
231
+
232
+ try:
233
+ beam_model_name = f"{teacher_name}_model"
234
+ success = upload_model_to_beam(VOLUME_CONFIG.name, beam_model_name, source_dir)
235
+
236
+ if success:
237
+ logger.info(f"📤 Synced {teacher_name} to Beam from {source_dir}")
238
+ return True
239
+ logger.warning(f"⚠️ Failed to sync {teacher_name} to Beam")
240
  return False
241
 
242
+ except Exception as e:
243
+ logger.warning(f"Failed to sync {teacher_name} to Beam: {e}")
244
+ return False
245
+
246
+
247
+ # =============================================================================
248
+ # DISTILLATION FUNCTIONS
249
+ # =============================================================================
250
 
251
 
252
+ def simple_distillation(
253
+ teacher_model: str,
254
+ output_dir: str,
255
+ pca_dims: int | None = None,
256
+ retry_with_cache_clear: bool = False,
257
+ ) -> Any:
258
  """
259
+ Perform simple Model2Vec distillation without additional training.
260
 
261
  Args:
262
+ teacher_model: Name of teacher model
263
+ output_dir: Output directory for the distilled model
264
+ pca_dims: PCA dimensions (uses config default if None)
265
+ retry_with_cache_clear: Whether this is a retry after clearing cache
266
 
267
  Returns:
268
+ Distilled model or None if failed
269
  """
270
+ if pca_dims is None:
271
+ pca_dims = int(distillation_config.optimal_pca_dims)
272
 
273
+ output_path = Path(output_dir)
274
+ output_path.mkdir(parents=True, exist_ok=True)
275
+
276
+ retry_suffix = " (retry after cache clear)" if retry_with_cache_clear else ""
277
+ logger.info(f"🔄 Simple distillation{retry_suffix}: {teacher_model} → {output_dir}")
278
+ logger.info(f"📊 PCA dims: {pca_dims}, SIF: {distillation_config.sif_coefficient}")
279
+
280
+ start_time = time.time()
281
+
282
+ try:
283
+ # Perform distillation with optimal parameters
284
+ model = distill(
285
+ model_name=teacher_model,
286
+ pca_dims=int(pca_dims),
287
+ apply_zipf=bool(distillation_config.apply_zipf),
288
+ sif_coefficient=float(distillation_config.sif_coefficient),
289
+ trust_remote_code=True,
290
+ )
291
+
292
+ logger.info("✅ Core distillation completed successfully")
293
+
294
+ # Save the model
295
+ model.save_pretrained(str(output_path))
296
+ logger.info(f"💾 Model saved to {output_path}")
297
+
298
+ # Log model info
299
+ logger.info(f"Model type: {type(model)}")
300
+ if hasattr(model, "embedding"):
301
+ logger.info(f"Embedding shape: {model.embedding.shape}")
302
+ logger.info(f"Embedding dtype: {model.embedding.dtype}")
303
 
304
+ total_time = time.time() - start_time
305
+ logger.info(f"🎉 Simple distillation completed in {total_time:.2f} seconds")
306
+ return model
307
+
308
+ except ValueError as e:
309
+ if "Number of tokens" in str(e) and "does not match number of vectors" in str(e):
310
+ logger.warning(f"⚠️ Token-vector mismatch with {teacher_model} - this is a Model2Vec library issue")
311
+ logger.warning(f"Error details: {e}")
312
+ logger.warning("💡 This model has incompatible tokenization. Skipping...")
313
+ return None
314
+ if "weight is on the meta device" in str(e):
315
+ logger.warning(f"⚠️ Device placement issue with {teacher_model} - model weights on meta device")
316
+ logger.warning(f"Error details: {e}")
317
+ logger.warning("💡 This model has device placement issues. Skipping...")
318
+ return None
319
+ raise
320
+ except AttributeError as e:
321
+ if "backend_tokenizer" in str(e):
322
+ logger.warning(f"⚠️ Tokenizer compatibility issue with {teacher_model}")
323
+ logger.warning(f"Error details: {e}")
324
+ logger.warning("💡 This model's tokenizer is incompatible with Model2Vec. Skipping...")
325
+ return None
326
+ raise
327
+ except FileNotFoundError as e:
328
+ if "transformers_modules" in str(e) or "xlm_padding.py" in str(e):
329
+ logger.warning(f"⚠️ Missing custom model files for {teacher_model}")
330
+ logger.warning(f"Error details: {e}")
331
+
332
+ # Try clearing cache and retrying once
333
+ if not retry_with_cache_clear:
334
+ logger.info("🔧 Attempting to clear cache and retry...")
335
+ if clear_model_cache(teacher_model):
336
+ logger.info("🔄 Retrying distillation after cache clear...")
337
+ return simple_distillation(teacher_model, output_dir, pca_dims, retry_with_cache_clear=True)
338
+
339
+ logger.warning("💡 This model has missing dependencies. Manual intervention may be required.")
340
+ return None
341
+ raise
342
+ except Exception:
343
+ logger.exception(f"❌ Simple distillation failed for {teacher_model}")
344
+ return None
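A hedged usage sketch for the helper above; it assumes the package is importable as `distiller.distill`, and the teacher shown is only an example rather than one of the configured defaults.

from distiller.distill import simple_distillation

model = simple_distillation(
    teacher_model="sentence-transformers/all-MiniLM-L6-v2",  # illustrative teacher
    output_dir="code_model2vec/base/code_model2vec_all_MiniLM_L6_v2",
    pca_dims=256,  # illustrative; the config default is used when None
)
if model is not None:
    # The returned Model2Vec StaticModel can embed code/text directly.
    print(model.encode(["def add(a, b): return a + b"]).shape)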
345
+
346
+
347
+ def load_codesearchnet_dataset(
348
+ max_samples: int | None = None,
349
  checkpoint_manager: BeamCheckpointManager | None = None,
350
  ) -> list[str]:
351
+ """Load and format the CodeSearchNet dataset for training with balanced language distribution."""
352
+ if max_samples is None:
353
+ max_samples = int(distillation_config.max_training_samples)
354
+
355
+ logger.info(f"Loading CodeSearchNet dataset from {codesearchnet_config.dataset_name}")
356
  logger.info(f"Limiting to {max_samples} samples for training efficiency")
357
+ logger.info(f"Languages: {', '.join(languages_config.all)}")
358
+
359
+ # Check for existing dataset checkpoint
360
+ texts = []
361
+ start_from = 0
362
 
 
363
  if checkpoint_manager:
364
  checkpoint_data = checkpoint_manager.load_checkpoint("dataset", 0)
365
  if checkpoint_data:
366
+ cached_texts = checkpoint_data.get("data", {}).get("texts", [])
367
+ if len(cached_texts) >= max_samples:
368
+ logger.info(f"✅ Resumed dataset loading: {len(cached_texts)} texts from checkpoint")
369
+ return cached_texts[:max_samples]
370
+ logger.info(f"📋 Partial dataset found: {len(cached_texts)} texts, continuing...")
371
+ texts = cached_texts
372
+ start_from = len(texts)
373
 
374
  try:
375
+ # Calculate samples per language for balanced distribution
376
+ num_languages = len(languages_config.all)
377
+ samples_per_language = max_samples // num_languages
378
+ remaining_samples = max_samples % num_languages
379
 
380
+ logger.info(f"📊 Target distribution: {samples_per_language} samples per language")
381
+ if remaining_samples > 0:
382
+ logger.info(f"📊 Extra {remaining_samples} samples will be distributed to first languages")
383
 
384
+ # Load training data from each language separately for balanced distribution
385
+ language_texts: dict[str, list[str]] = {}
386
+ total_collected = len(texts)
387
+
388
+ for i, language in enumerate(languages_config.all):
389
+ if total_collected >= max_samples:
390
  break
391
 
392
+ logger.info(f"🔍 Loading {language} training data...")
393
+
394
+ # Determine how many samples to collect for this language
395
+ target_for_lang = samples_per_language
396
+ if i < remaining_samples: # Distribute extra samples to first languages
397
+ target_for_lang += 1
398
+
399
+ # Skip if we already have enough from this language
400
+ if language in language_texts and len(language_texts[language]) >= target_for_lang:
401
+ continue
402
+
403
+ try:
404
+ # Load training split for the specific language (same format as evaluate.py)
405
+ dataset = load_dataset(
406
+ codesearchnet_config.dataset_name,
407
+ language,
408
+ split="train",
409
+ trust_remote_code=True,
410
+ )
411
+
412
+ lang_texts: list[str] = []
413
+ processed_count = 0
414
 
415
+ for processed_count, example in enumerate(dataset, 1):
416
+ if len(lang_texts) >= target_for_lang:
417
+ break
418
 
419
+ # Use same field names as evaluate.py
420
+ doc_string = example.get("func_documentation_string", "").strip()
421
+ code_string = example.get("func_code_string", "").strip()
422
 
423
+ if doc_string and code_string and len(doc_string.split()) >= 3 and len(code_string) > 50:
424
+ # Format as documentation-code pair for training (same as evaluate.py)
425
+ text = f"Documentation: {doc_string}\nCode:\n{code_string}"
 
 
426
 
427
+ # Ensure reasonable length for embedding models
428
+ if len(text) <= 2048:
429
+ lang_texts.append(text)
430
+
431
+ if processed_count % 5000 == 0:
432
+ logger.info(f" {language}: processed {processed_count}, collected {len(lang_texts)}")
433
+
434
+ language_texts[language] = lang_texts
435
+ total_collected += len(lang_texts)
436
+ logger.info(f"✅ {language}: collected {len(lang_texts)} samples")
437
+
438
+ except Exception as e:
439
+ logger.warning(f"⚠️ Failed to load {language} data: {e}")
440
+ continue
441
+
442
+ # Combine all language texts in a balanced way
443
+ combined_texts = []
444
+
445
+ # Add existing texts first (from checkpoint)
446
+ if start_from > 0:
447
+ combined_texts = texts[:start_from]
448
+
449
+ # Interleave texts from different languages for better training distribution
450
+ max_lang_samples = max(len(lang_texts) for lang_texts in language_texts.values()) if language_texts else 0
451
+
452
+ for sample_idx in range(max_lang_samples):
453
+ for language in languages_config.all:
454
+ if len(combined_texts) >= max_samples:
455
+ break
456
+
457
+ if language in language_texts and sample_idx < len(language_texts[language]):
458
+ combined_texts.append(language_texts[language][sample_idx])
459
+
460
+ if len(combined_texts) >= max_samples:
461
+ break
462
+
463
+ # Truncate to exact max_samples
464
+ combined_texts = combined_texts[:max_samples]
465
+
466
+ # Log final distribution
467
+ logger.info("📊 Final dataset distribution:")
468
+ lang_counts: dict[str, int] = {}
469
+ for text in combined_texts:
470
+ # Simple heuristic to identify language from code patterns
471
+ if "def " in text and ":" in text:
472
+ lang_counts["python"] = lang_counts.get("python", 0) + 1
473
+ elif "function " in text and "{" in text:
474
+ lang_counts["javascript"] = lang_counts.get("javascript", 0) + 1
475
+ elif "public " in text and "class " in text:
476
+ lang_counts["java"] = lang_counts.get("java", 0) + 1
477
+ elif "<?php" in text or "$" in text:
478
+ lang_counts["php"] = lang_counts.get("php", 0) + 1
479
+ elif "func " in text and "end" in text:
480
+ lang_counts["ruby"] = lang_counts.get("ruby", 0) + 1
481
+ elif "func " in text and "}" in text:
482
+ lang_counts["go"] = lang_counts.get("go", 0) + 1
483
+ else:
484
+ lang_counts["other"] = lang_counts.get("other", 0) + 1
485
+
486
+ for lang, count in lang_counts.items():
487
+ percentage = (count / len(combined_texts)) * 100
488
+ logger.info(f" {lang}: {count} samples ({percentage:.1f}%)")
489
 
490
  # Final checkpoint save
491
  if checkpoint_manager:
492
+ checkpoint_data = {
493
+ "config_hash": get_current_config_hash(enable_training=True),
494
+ "stage": "dataset",
495
+ "step": 0,
496
+ "timestamp": time.time(),
497
+ "data": {"texts": combined_texts},
498
+ }
499
  checkpoint_manager.save_checkpoint("dataset", checkpoint_data, 0)
500
 
501
+ logger.info(f"Successfully loaded {len(combined_texts)} balanced code-documentation pairs from CodeSearchNet")
502
+ return combined_texts
503
 
504
  except Exception:
505
  logger.exception("Error loading CodeSearchNet dataset")
506
  return texts # Return what we have so far
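A short sketch of the balanced-sampling arithmetic and the text format this loader produces; the language list and sample budget below are assumptions standing in for `languages_config.all` and the configured maximum.

languages = ["python", "javascript", "java", "php", "ruby", "go"]
max_samples = 50_000
per_language, remainder = divmod(max_samples, len(languages))
# The first `remainder` languages receive one extra sample each.
targets = {lang: per_language + (1 if i < remainder else 0) for i, lang in enumerate(languages)}
print(targets)  # {'python': 8334, 'javascript': 8334, 'java': 8333, 'php': 8333, 'ruby': 8333, 'go': 8333}

# Each kept example is formatted as a documentation/code pair:
doc = "Add two numbers and return the sum."
code = "def add(a, b):\n    return a + b"
print(f"Documentation: {doc}\nCode:\n{code}")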
507
 
508
 
509
+ def generate_teacher_embeddings(
510
  teacher_model: SentenceTransformer,
511
  texts: list[str],
512
  checkpoint_manager: BeamCheckpointManager | None = None,
 
514
  """Generate teacher embeddings for code training with checkpoint support."""
515
  logger.info(f"Generating teacher embeddings for {len(texts)} texts...")
516
 
517
+ # Check for existing embeddings checkpoint
 
 
518
  if checkpoint_manager:
519
+ volume_path = Path(VOLUME_CONFIG.mount_path)
520
+ embeddings_path = volume_path / "embeddings_cache.pt"
521
+ config_path = volume_path / "embeddings_config.json"
522
 
523
  if embeddings_path.exists() and config_path.exists():
524
  try:
 
526
  with config_path.open("r") as f:
527
  config_data = json.load(f)
528
 
529
+ current_hash = get_current_config_hash(enable_training=True)
530
+ if config_data.get("config_hash") == current_hash:
531
  # Load the embeddings tensor
532
  final_embeddings = torch.load(embeddings_path, map_location="cpu")
533
  num_expected = config_data.get("num_texts", len(texts))
534
 
535
  if final_embeddings.shape[0] >= num_expected:
536
+ logger.info(f"✅ Loaded embeddings from cache ({final_embeddings.shape[0]} embeddings)")
537
+ return final_embeddings[: len(texts)]
538
+
 
 
 
 
 
 
 
 
539
  except Exception as e:
540
  logger.warning(f"Failed to load embeddings cache: {e}, regenerating...")
541
 
542
  # Generate embeddings from scratch
543
  logger.info("Generating fresh teacher embeddings...")
544
 
545
+ batch_size = int(distillation_config.teacher_model_config.get("batch_size", 16))
546
  embeddings_list = []
547
 
548
+ for i in range(0, len(texts), batch_size):
549
+ batch_texts = texts[i : i + batch_size]
550
 
551
  try:
 
552
  batch_embeddings = teacher_model.encode(
553
  batch_texts,
554
  convert_to_tensor=True,
555
+ batch_size=batch_size,
556
+ show_progress_bar=False,
557
+ normalize_embeddings=True,
558
  )
559
  embeddings_list.append(batch_embeddings)
560
 
561
+ if i % (batch_size * 10) == 0:
562
  logger.info(f"Generated embeddings for {i + len(batch_texts)}/{len(texts)} texts")
563
 
564
  except torch.cuda.OutOfMemoryError:
565
+ logger.warning(f"GPU OOM with batch size {batch_size}, reducing...")
566
+ torch.cuda.empty_cache()
567
+ batch_size = max(1, batch_size // 2)
568
 
569
  # Retry with smaller batch size
 
570
  batch_embeddings = teacher_model.encode(
571
  batch_texts,
572
  convert_to_tensor=True,
573
+ batch_size=batch_size,
574
  show_progress_bar=False,
575
  normalize_embeddings=True,
576
  )
577
  embeddings_list.append(batch_embeddings)
578
 
579
+ # Combine all embeddings
 
 
580
  teacher_embeddings = torch.cat(embeddings_list, dim=0)
581
 
582
+ # Ensure fp32 precision
583
  if teacher_embeddings.dtype != torch.float32:
 
584
  teacher_embeddings = teacher_embeddings.to(torch.float32)
585
 
586
  logger.info(f"Generated {teacher_embeddings.shape[0]} teacher embeddings in {teacher_embeddings.dtype}")
587
 
588
+ # Save embeddings cache for future runs
589
  if checkpoint_manager:
590
  try:
591
+ volume_path = Path(VOLUME_CONFIG.mount_path)
592
+ embeddings_path = volume_path / "embeddings_cache.pt"
593
+ config_path = volume_path / "embeddings_config.json"
594
 
595
  # Save embeddings tensor
596
  torch.save(teacher_embeddings, embeddings_path)
597
 
598
  # Save configuration
599
  config_data = {
600
+ "config_hash": get_current_config_hash(enable_training=True),
601
  "num_texts": len(texts),
602
  "embedding_shape": list(teacher_embeddings.shape),
603
  "timestamp": time.time(),
 
614
  return teacher_embeddings
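The OOM handling above halves the batch size and retries; a generic standalone sketch of that pattern, not tied to this module's teacher model (any `encode_fn` that returns tensors will do).

import torch

def encode_with_backoff(encode_fn, texts: list[str], batch_size: int = 16) -> torch.Tensor:
    """Encode in batches, halving the batch size whenever CUDA runs out of memory."""
    chunks = []
    i = 0
    while i < len(texts):
        batch = texts[i : i + batch_size]
        try:
            chunks.append(encode_fn(batch))
            i += batch_size
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            batch_size = max(1, batch_size // 2)  # retry the same batch with a smaller size
    return torch.cat(chunks, dim=0)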
615
 
616
 
617
+ def advanced_training(
618
  student_model: Any,
619
+ teacher_model: SentenceTransformer,
 
 
620
  checkpoint_manager: BeamCheckpointManager | None = None,
 
621
  ) -> Any:
622
+ """Perform advanced code specialization training."""
623
+ logger.info("🎓 Starting advanced code specialization training...")
624
 
625
+ # Load CodeSearchNet training data
626
+ training_texts = load_codesearchnet_dataset(checkpoint_manager=checkpoint_manager)
 
 
 
627
 
628
+ if not training_texts:
629
+ logger.warning("No training data available, skipping advanced training")
630
+ return student_model
 
631
 
632
+ # Generate teacher embeddings
633
+ teacher_embeddings = generate_teacher_embeddings(teacher_model, training_texts, checkpoint_manager)
634
 
635
+ # Create trainable model
636
+ student_embedding_dim = student_model.embedding.shape[1]
637
+ teacher_embedding_dim = teacher_embeddings.shape[1]
 
638
 
639
+ # Project teacher embeddings if needed
640
+ if teacher_embedding_dim != student_embedding_dim:
641
+ from sklearn.decomposition import PCA
642
 
643
+ logger.info("Performing PCA projection for dimension matching...")
644
+ pca = PCA(n_components=student_embedding_dim)
645
+ teacher_embeddings_np = teacher_embeddings.cpu().numpy().astype(np.float64)
646
+ teacher_embeddings_projected = pca.fit_transform(teacher_embeddings_np)
647
+ teacher_embeddings = torch.tensor(teacher_embeddings_projected.astype(np.float32), dtype=torch.float32)
648
 
649
+ # Create trainable model
650
+ trainable_model = FinetunableStaticModel.from_static_model(
651
+ model=student_model,
652
+ out_dim=student_embedding_dim,
653
+ )
654
+ trainable_model = trainable_model.float()
655
+
656
+ # Tokenize texts
657
+ tokenized_texts = []
658
+ for text in training_texts:
659
+ tokens = trainable_model.tokenize([text])
660
+ if tokens.shape[1] > 0:
661
+ tokenized_texts.append(tokens[0].tolist())
662
+
663
+ # Prepare training data
664
+ targets = teacher_embeddings[: len(tokenized_texts)].to(torch.float32)
665
+ train_texts, val_texts, train_targets, val_targets = train_test_split(
666
+ tokenized_texts, targets, test_size=0.2, random_state=42
667
+ )
668
 
669
+ # Training setup
670
+ train_dataset = TextDataset(train_texts, train_targets)
671
+ val_dataset = TextDataset(val_texts, val_targets)
672
 
673
+ optimizer = optim.Adam(trainable_model.parameters(), lr=float(distillation_config.learning_rate))
674
+ mse_loss = nn.MSELoss()
 
675
 
676
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
677
+ trainable_model = trainable_model.to(device)
678
 
679
+ batch_size = int(distillation_config.batch_size)
680
+ epochs = int(distillation_config.training_epochs)
 
681
 
682
+ # Training loop
683
+ for epoch in range(epochs):
684
+ trainable_model.train()
 
 
 
 
 
 
685
 
686
+ try:
687
+ train_loader = train_dataset.to_dataloader(shuffle=True, batch_size=batch_size)
688
+ epoch_loss = 0.0
689
+ num_batches = 0
 
690
 
691
+ for _batch_idx, (tokens, targets_batch) in enumerate(train_loader):
692
+ batch_tokens = tokens.to(device)
693
+ batch_targets = targets_batch.to(device).to(torch.float32)
694
 
695
+ optimizer.zero_grad()
696
+ _, student_embeddings = trainable_model(batch_tokens)
697
+ student_embeddings = student_embeddings.to(torch.float32)
698
 
699
+ loss = mse_loss(student_embeddings, batch_targets)
700
+ loss.backward()
701
+ optimizer.step()
 
 
702
 
703
+ epoch_loss += loss.item()
704
+ num_batches += 1
705
 
706
+ except torch.cuda.OutOfMemoryError:
707
+ logger.warning(f"Training OOM with batch size {batch_size}, reducing...")
708
+ batch_size = max(1, batch_size // 2)
709
+ torch.cuda.empty_cache()
710
+ continue
711
 
712
+ avg_train_loss = epoch_loss / num_batches if num_batches > 0 else 0.0
713
 
714
+ # Validation
715
+ trainable_model.eval()
716
+ val_loader = val_dataset.to_dataloader(shuffle=False, batch_size=batch_size)
717
+ val_loss = 0.0
718
+ val_batches = 0
719
 
720
+ with torch.no_grad():
721
+ for tokens, targets_batch in val_loader:
722
+ batch_tokens = tokens.to(device)
723
+ batch_targets = targets_batch.to(device).to(torch.float32)
 
 
724
 
725
+ _, student_embeddings = trainable_model(batch_tokens)
726
+ student_embeddings = student_embeddings.to(torch.float32)
727
 
728
+ loss = mse_loss(student_embeddings, batch_targets)
729
+ val_loss += loss.item()
730
+ val_batches += 1
731
 
732
+ avg_val_loss = val_loss / val_batches if val_batches > 0 else 0.0
 
 
733
 
734
+ logger.info(f"Epoch {epoch + 1}/{epochs} - Train: {avg_train_loss:.6f}, Val: {avg_val_loss:.6f}")
 
735
 
736
+ # Save checkpoint
737
+ if checkpoint_manager:
738
+ checkpoint_data = {
739
+ "config_hash": get_current_config_hash(enable_training=True),
740
+ "stage": "training",
741
+ "step": epoch + 1,
742
+ "timestamp": time.time(),
743
+ "data": {
744
+ "model_state": trainable_model.state_dict(),
745
+ "optimizer_state": optimizer.state_dict(),
746
+ "train_loss": avg_train_loss,
747
+ "val_loss": avg_val_loss,
748
+ },
749
+ }
750
+ checkpoint_manager.save_checkpoint("training", checkpoint_data, epoch + 1)
751
 
752
+ # Convert back to static model
753
+ refined_model = trainable_model.to_static_model()
754
+ logger.info("✅ Advanced training completed")
755
 
756
+ return refined_model
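A compact sketch of the two core steps in this training path, PCA projection for dimension matching followed by an MSE distillation loss; random tensors stand in for real teacher and student embeddings.

import numpy as np
import torch
from sklearn.decomposition import PCA

teacher = np.random.randn(512, 1024).astype(np.float64)  # stand-in teacher embeddings
student_dim = 256

# Match dimensions the same way advanced_training does: PCA-project the teacher vectors.
pca = PCA(n_components=student_dim)
targets = torch.tensor(pca.fit_transform(teacher).astype(np.float32))

student = torch.randn(512, student_dim, requires_grad=True)  # stand-in for student outputs
loss = torch.nn.functional.mse_loss(student, targets)
loss.backward()
print(f"MSE distillation loss: {loss.item():.4f}")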
757
758
 
759
+ def distill_single_teacher(
760
+ teacher_model: str,
761
+ enable_training: bool = False,
762
+ use_beam_utilities: bool = False,
763
+ pca_dims: int | None = None,
764
+ ) -> dict[str, Any]:
765
+ """
766
+ Distill a single teacher model with optional training.
767
 
768
+ Args:
769
+ teacher_model: Name of teacher model
770
+ enable_training: Whether to enable advanced training
771
+ use_beam_utilities: Whether to use Beam utilities
772
+ pca_dims: PCA dimensions
773
 
774
+ Returns:
775
+ Dictionary with distillation results
776
+ """
777
+ teacher_name = teacher_model.split("/")[-1].replace("-", "_")
778
+ base_dir = Path(LOCAL_BASE_DIR) / f"code_model2vec_{teacher_name}"
779
+
780
+ # Add suffix for trained models
781
+ final_model_name = f"code_model2vec_{teacher_name}"
782
+ if enable_training:
783
+ final_model_name += "_fine_tuned"
784
+ final_dir = Path(LOCAL_FINAL_DIR) / final_model_name
785
+
786
+ logger.info(f"\n{'=' * 60}")
787
+ logger.info(f"🔄 Processing teacher model: {teacher_model}")
788
+ logger.info(f"📁 Teacher name: {teacher_name}")
789
+ logger.info(f"🎓 Training enabled: {enable_training}")
790
+ logger.info(f"{'=' * 60}")
791
+
792
+ # Check model compatibility first
793
+ is_compatible, warning_msg = check_model_compatibility(teacher_model)
794
+ if not is_compatible:
795
+ logger.warning(f"⚠️ Known compatibility issue: {warning_msg}")
796
+ logger.info("🔧 Attempting distillation anyway, but may fail...")
797
+
798
+ # Try model-specific workarounds
799
+ workaround_type = try_model_workarounds(teacher_model)
800
+ # Don't skip if we have a workaround - we'll use it later
801
 
802
+ start_time = time.time()
803
 
804
+ # Initialize Beam utilities if requested
805
+ checkpoint_mgr = None
806
+ model_mgr = None
807
+ if use_beam_utilities:
808
+ try:
809
+ _, checkpoint_mgr, model_mgr, _ = create_beam_utilities(VOLUME_CONFIG.name, VOLUME_CONFIG.mount_path)
810
+ except Exception as e:
811
+ logger.warning(f"Failed to initialize Beam utilities: {e}")
812
 
813
+ try:
814
+ # Step 1: Check for existing final model
815
+ existing_final = check_existing_final_model(teacher_name, enable_training)
816
+ if existing_final:
817
+ logger.info(f"✅ Final model already exists: {teacher_name}{'_fine_tuned' if enable_training else ''}")
818
+ return {
819
+ "teacher_model": teacher_model,
820
+ "teacher_name": teacher_name,
821
+ "status": "skipped_existing_final",
822
+ "final_path": existing_final,
823
+ "distillation_time": 0.0,
824
+ }
825
 
826
+ # Step 1.5: Sync existing checkpoints from Beam if using Beam utilities
827
+ if use_beam_utilities and checkpoint_mgr:
828
+ logger.info(f"🔄 Syncing existing checkpoints for {teacher_name}...")
829
+ sync_checkpoints_from_beam(VOLUME_CONFIG.name, f"distillation_{teacher_name}", directories.checkpoints)
830
+ if enable_training:
831
+ sync_checkpoints_from_beam(VOLUME_CONFIG.name, f"training_{teacher_name}", directories.checkpoints)
832
+
833
+ # Step 2: Check for existing base model or create it
834
+ existing_base = check_existing_base_model(teacher_name)
835
+ base_model = None
836
+
837
+ if existing_base:
838
+ logger.info(f"✅ Found existing base model: {teacher_name}")
839
+ if enable_training:
840
+ # Load base model for training
841
+ from model2vec.model import StaticModel
842
 
843
+ base_model = StaticModel.from_pretrained(existing_base)
844
+ elif use_beam_utilities:
845
+ synced = sync_model_from_beam(teacher_name, str(base_dir), use_beam_utilities)
846
+ if synced:
847
+ existing_base = str(base_dir)
848
+ if enable_training:
849
+ from model2vec.model import StaticModel
850
 
851
+ base_model = StaticModel.from_pretrained(existing_base)
 
 
852
 
853
+ if not existing_base:
854
+ # Perform simple distillation to create base model
855
+ logger.info(f"🔄 Creating base model for {teacher_name}")
856
 
857
+ # Check if we need specialized distillation
858
+ workaround_type = try_model_workarounds(teacher_model)
 
859
 
860
+ if workaround_type == "salesforce":
861
+ base_model = salesforce_model_distillation(teacher_model, str(base_dir), pca_dims)
862
+ elif workaround_type == "baai":
863
+ base_model = baai_bge_model_distillation(teacher_model, str(base_dir), pca_dims)
864
+ else:
865
+ base_model = simple_distillation(teacher_model, str(base_dir), pca_dims)
866
+
867
+ if base_model is None:
868
+ return {
869
+ "teacher_model": teacher_model,
870
+ "teacher_name": teacher_name,
871
+ "status": "failed_base_distillation",
872
+ "error": "Simple distillation failed",
873
  }
874
 
875
+ # Sync base model and checkpoints to Beam
876
+ if use_beam_utilities:
877
+ sync_model_to_beam(teacher_name, str(base_dir), use_beam_utilities)
878
+ if checkpoint_mgr:
879
+ sync_checkpoints_to_beam(
880
+ VOLUME_CONFIG.name, f"distillation_{teacher_name}", directories.checkpoints
881
+ )
882
+
883
+ existing_base = str(base_dir)
884
+
885
+ # Step 3: Handle final model creation
886
+ if enable_training and base_model is not None:
887
+ # Perform advanced training
888
+ logger.info(f"🎓 Starting advanced training for {teacher_name}")
889
+
890
+ # Load teacher model for training
891
+ device = "cuda" if torch.cuda.is_available() else "cpu"
892
+ teacher_st_model = SentenceTransformer(teacher_model, device=device, trust_remote_code=True)
893
 
894
+ # Perform advanced training
895
+ final_model = advanced_training(base_model, teacher_st_model, checkpoint_mgr)
896
 
897
+ # Save final model
898
+ final_dir.mkdir(parents=True, exist_ok=True)
899
+ final_model.save_pretrained(str(final_dir))
900
 
901
+ # Sync final model and training checkpoints to Beam
902
+ if use_beam_utilities:
903
+ sync_model_to_beam(f"{teacher_name}_final", str(final_dir), use_beam_utilities)
904
+ if checkpoint_mgr:
905
+ sync_checkpoints_to_beam(VOLUME_CONFIG.name, f"training_{teacher_name}", directories.checkpoints)
906
+
907
+ del teacher_st_model
908
+ if torch.cuda.is_available():
909
+ torch.cuda.empty_cache()
910
 
911
+ else:
912
+ # Copy base to final (no training)
913
+ logger.info(f"📁 Copying base to final for {teacher_name}")
914
+ if not copy_base_to_final(teacher_name, enable_training):
915
+ return {
916
+ "teacher_model": teacher_model,
917
+ "teacher_name": teacher_name,
918
+ "status": "failed_copy_to_final",
919
+ "error": "Failed to copy base to final",
920
+ }
921
 
922
+ total_time = time.time() - start_time
923
 
924
+ return {
925
+ "teacher_model": teacher_model,
926
+ "teacher_name": teacher_name,
927
+ "status": "success",
928
+ "enable_training": enable_training,
929
+ "base_path": existing_base,
930
+ "final_path": str(final_dir),
931
+ "distillation_time": total_time,
932
+ }
933
 
934
  except Exception as e:
935
+ logger.exception(f"❌ Failed to process {teacher_model}")
936
+ return {
937
+ "teacher_model": teacher_model,
938
+ "teacher_name": teacher_name,
939
+ "status": "failed",
940
+ "error": str(e),
941
+ }
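The per-teacher result dictionaries above use a small set of status strings; a sketch of how a caller might tally them (the grouping helper itself is illustrative).

from collections import Counter

def summarize(results: list[dict]) -> Counter:
    # "success" and every "skipped_*" status count as usable models.
    return Counter(
        "usable" if r["status"] == "success" or r["status"].startswith("skipped") else "failed"
        for r in results
    )

print(summarize([
    {"status": "success"},
    {"status": "skipped_existing_final"},
    {"status": "failed_base_distillation"},
]))  # Counter({'usable': 2, 'failed': 1})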
942
 
943
 
944
+ # =============================================================================
945
+ # MAIN EXECUTION FUNCTIONS
946
+ # =============================================================================
947
948
 
949
+ def run_local_distillation(
950
+ teacher_models: list[str] | None = None,
951
+ enable_training: bool = False,
952
+ pca_dims: int | None = None,
953
+ clear_cache: bool = False,
954
+ ) -> dict[str, Any]:
955
+ """Run distillation locally."""
956
+ logger.info("🖥️ Running distillation locally")
957
 
958
+ if teacher_models is None:
959
+ teacher_models = DEFAULT_TEACHER_MODELS
960
 
961
+ # Apply patches
962
+ patch_success = apply_local_patches()
963
+ if patch_success:
964
+ logger.info("✅ Successfully applied patches")
965
+ else:
966
+ logger.warning("⚠️ Failed to apply patches - some models may fail")
967
+
968
+ results = {}
969
+ successful_models = []
970
+
971
+ logger.info("🚀 Starting distillation workflow")
972
+ logger.info(f"📊 Processing {len(teacher_models)} teacher models")
973
+ logger.info(f"🎓 Training enabled: {enable_training}")
974
+
975
+ # Use default models if none specified
976
+ models_to_distill = teacher_models if teacher_models else DEFAULT_TEACHER_MODELS
977
+
978
+ logger.info(f"📊 Teacher models to process: {len(models_to_distill)}")
979
+ for i, model in enumerate(models_to_distill, 1):
980
+ logger.info(f" {i}. {model}")
981
+
982
+ # Clear cache for problematic models if requested
983
+ if clear_cache:
984
+ logger.info("🧹 Clearing cache for known problematic models...")
985
+ problematic_models = ["BAAI/bge-code-v1", "jinaai/jina-embeddings-v3", "Salesforce/SFR-Embedding-Code-2B_R"]
986
+ for model in problematic_models:
987
+ if model in models_to_distill:
988
+ clear_model_cache(model)
989
+
990
+ for teacher_model in models_to_distill:
991
+ result = distill_single_teacher(
992
+ teacher_model=teacher_model,
993
+ enable_training=enable_training,
994
+ use_beam_utilities=False,
995
+ pca_dims=pca_dims,
996
+ )
997
 
998
+ teacher_name = result["teacher_name"]
999
+ results[teacher_name] = result
1000
+
1001
+ if result["status"] == "success" or result["status"].startswith("skipped"):
1002
+ successful_models.append(teacher_name)
1003
+
1004
+ # Summary
1005
+ logger.info("\n🏆 DISTILLATION WORKFLOW COMPLETE!")
1006
+ logger.info(f"📊 Successful models: {len(successful_models)}")
1007
+ logger.info(f"🎓 Training mode: {'Enabled' if enable_training else 'Basic distillation only'}")
1008
+
1009
+ for model_name in successful_models:
1010
+ result = results[model_name]
1011
+ logger.info(f"✅ {model_name}: {result['teacher_model']}")
1012
+
1013
+ # Save results summary
1014
+ results_summary = {
1015
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
1016
+ "enable_training": enable_training,
1017
+ "successful_models": successful_models,
1018
+ "all_results": results,
1019
+ "total_successful": len(successful_models),
1020
+ "total_attempted": len(teacher_models or DEFAULT_TEACHER_MODELS),
1021
+ }
1022
 
1023
+ # Save results to file
1024
+ results_file = Path(LOCAL_BASE_DIR).parent / "distillation_results.json"
1025
+ results_file.parent.mkdir(parents=True, exist_ok=True)
1026
+ with results_file.open("w") as f:
1027
+ json.dump(results_summary, f, indent=2)
1028
 
1029
+ logger.info(f"📊 Results summary saved to: {results_file}")
 
 
1030
 
1031
+ return results_summary
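A hedged example of driving this workflow programmatically instead of through the CLI; it assumes the package is installed, and the teacher list and PCA override are illustrative.

from distiller.distill import run_local_distillation

summary = run_local_distillation(
    teacher_models=["sentence-transformers/all-MiniLM-L6-v2"],  # omit to use the configured defaults
    enable_training=False,  # basic distillation only
    pca_dims=256,           # illustrative override
)
print(f"{summary['total_successful']} of {summary['total_attempted']} teachers distilled")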
1032
 
 
 
1033
 
1034
+ @function(
1035
+ gpu=GPU_NAME,
1036
+ volumes=[Volume(name=VOLUME_CONFIG.name, mount_path=VOLUME_CONFIG.mount_path)],
1037
+ image=IMAGE,
1038
+ secrets=["HF_ACCESS_TOKEN"],
1039
+ env=BEAM_ENV_SETTINGS,
1040
+ timeout=3600 * 12, # 12 hours
1041
+ )
1042
+ def _beam_distill_models(
1043
+ teacher_models: list[str] | None = None,
1044
+ enable_training: bool = False,
1045
+ pca_dims: int | None = None,
1046
+ clear_cache: bool = False,
1047
+ ) -> dict[str, Any]:
1048
+ """Internal Beam function for distillation."""
1049
+ logger.info("☁️ Running distillation on Beam")
1050
+
1051
+ # Apply patches
1052
+ patch_success = apply_local_patches()
1053
+ if patch_success:
1054
+ logger.info("✅ Successfully applied patches")
1055
+ else:
1056
+ logger.warning("⚠️ Failed to apply patches - some models may fail")
1057
+
1058
+ if teacher_models is None:
1059
+ teacher_models = DEFAULT_TEACHER_MODELS
1060
+
1061
+ # Clear cache for problematic models if requested
1062
+ if clear_cache:
1063
+ logger.info("🧹 Clearing cache for known problematic models...")
1064
+ problematic_models = ["BAAI/bge-code-v1", "jinaai/jina-embeddings-v3", "Salesforce/SFR-Embedding-Code-2B_R"]
1065
+ for model in problematic_models:
1066
+ if model in teacher_models:
1067
+ clear_model_cache(model)
1068
+
1069
+ results = {}
1070
+ successful_models = []
1071
+
1072
+ logger.info("🚀 Starting Beam distillation workflow")
1073
+ logger.info(f"📊 Processing {len(teacher_models)} teacher models")
1074
+ logger.info(f"🎓 Training enabled: {enable_training}")
1075
+
1076
+ # Use default models if none specified
1077
+ models_to_distill = teacher_models if teacher_models else DEFAULT_TEACHER_MODELS
1078
+
1079
+ logger.info(f"📊 Teacher models to process: {len(models_to_distill)}")
1080
+ for i, model in enumerate(models_to_distill, 1):
1081
+ logger.info(f" {i}. {model}")
1082
+
1083
+ for teacher_model in models_to_distill:
1084
+ result = distill_single_teacher(
1085
+ teacher_model=teacher_model,
1086
+ enable_training=enable_training,
1087
+ use_beam_utilities=True,
1088
+ pca_dims=pca_dims,
1089
  )
1090
 
1091
+ teacher_name = result["teacher_name"]
1092
+ results[teacher_name] = result
1093
+
1094
+ if result["status"] == "success" or result["status"].startswith("skipped"):
1095
+ successful_models.append(teacher_name)
1096
+
1097
+ # Summary
1098
+ logger.info("\n🏆 BEAM DISTILLATION WORKFLOW COMPLETE!")
1099
+ logger.info(f"📊 Successful models: {len(successful_models)}")
1100
+
1101
+ # Save results to Beam volume
1102
+ volume_path = Path(VOLUME_CONFIG.mount_path)
1103
+ results_summary = {
1104
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
1105
+ "enable_training": enable_training,
1106
+ "successful_models": successful_models,
1107
+ "all_results": results,
1108
+ "total_successful": len(successful_models),
1109
+ "total_attempted": len(teacher_models or DEFAULT_TEACHER_MODELS),
1110
+ }
1111
+
1112
+ results_file = volume_path / "distillation_results.json"
1113
+ with results_file.open("w") as f:
1114
+ json.dump(results_summary, f, indent=2)
1115
+
1116
+ logger.info(f"📊 Beam results saved to: {results_file}")
1117
+
1118
+ return results_summary
1119
+
1120
+
1121
+ def run_beam_distillation(
1122
+ teacher_models: list[str] | None = None,
1123
+ enable_training: bool = False,
1124
+ pca_dims: int | None = None,
1125
+ clear_cache: bool = False,
1126
+ ) -> dict[str, Any]:
1127
+ """Run distillation on Beam and sync results."""
1128
+ logger.info("☁️ Running distillation on Beam with local sync")
1129
+
1130
+ try:
1131
+ # Run distillation on Beam
1132
+ results = _beam_distill_models.remote(teacher_models, enable_training, pca_dims, clear_cache)
1133
+
1134
+ # Check if Beam execution was successful
1135
+ if not results:
1136
+ logger.error("❌ Beam execution failed or returned no results")
1137
+ return {
1138
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
1139
+ "enable_training": enable_training,
1140
+ "successful_models": [],
1141
+ "all_results": {},
1142
+ "total_successful": 0,
1143
+ "total_attempted": len(teacher_models or DEFAULT_TEACHER_MODELS),
1144
+ "error": "Beam execution failed",
1145
+ }
1146
+
1147
+ # Sync models back to local directories
1148
+ if results.get("successful_models"):
1149
+ logger.info("📥 Syncing models from Beam to local directories...")
1150
+
1151
+ for teacher_name in results["successful_models"]:
1152
+ # Sync base model
1153
+ base_dir = Path(LOCAL_BASE_DIR) / f"code_model2vec_{teacher_name}"
1154
+ sync_model_from_beam(teacher_name, str(base_dir), use_beam_utilities=True)
1155
+
1156
+ # Sync final model if training was enabled
1157
+ if enable_training:
1158
+ final_dir = Path(LOCAL_FINAL_DIR) / f"code_model2vec_{teacher_name}"
1159
+ sync_model_from_beam(f"{teacher_name}_final", str(final_dir), use_beam_utilities=True)
1160
+ else:
1161
+ # Copy base to final
1162
+ copy_base_to_final(teacher_name, enable_training)
1163
+
1164
+ logger.info("✅ All models synced from Beam")
1165
+
1166
+ return results
1167
 
1168
  except Exception as e:
1169
+ logger.exception("❌ Beam distillation failed with exception")
1170
+ return {
1171
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
1172
+ "enable_training": enable_training,
1173
+ "successful_models": [],
1174
+ "all_results": {},
1175
+ "total_successful": 0,
1176
+ "total_attempted": len(teacher_models or DEFAULT_TEACHER_MODELS),
1177
+ "error": str(e),
1178
+ }
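For orientation, a minimal sketch of driving these two entry points programmatically instead of through the CLI below; it assumes the package is importable as distiller, and the teacher name is purely illustrative, not part of this commit:

# Hypothetical usage sketch (not part of the diff).
from distiller import run_beam_distillation, run_local_distillation

summary = run_local_distillation(
    teacher_models=["sentence-transformers/all-MiniLM-L6-v2"],
    enable_training=False,  # basic Model2Vec distillation, no CodeSearchNet fine-tuning
)
print(f"{summary['total_successful']}/{summary['total_attempted']} models distilled")

# Same workflow on Beam, with models synced back to LOCAL_BASE_DIR / LOCAL_FINAL_DIR:
# summary = run_beam_distillation(enable_training=True, clear_cache=True)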
1179
 
1180
 
1181
+ # =============================================================================
1182
+ # CLI INTERFACE
1183
+ # =============================================================================
 
 
 
1184
 
 
 
 
 
1185
 
1186
+ def main(
1187
+ use_beam: Annotated[bool, typer.Option(help="Use Beam for distillation")] = False,
1188
+ train: Annotated[bool, typer.Option(help="Enable advanced training (CodeSearchNet fine-tuning)")] = False,
1189
+ teacher_models: Annotated[list[str] | None, typer.Option(help="Specific teacher models to distill")] = None,
1190
+ pca_dims: Annotated[int | None, typer.Option(help="PCA dimensions (uses config default if not specified)")] = None,
1191
+ clear_cache: Annotated[
1192
+ bool, typer.Option(help="Clear HuggingFace cache for problematic models before distillation")
1193
+ ] = False,
1194
+ ) -> None:
1195
+ """Unified distillation command with optional training."""
1196
+ logger.info("🚀 Starting unified Model2Vec distillation workflow")
1197
+ logger.info(f"🎓 Training mode: {'Advanced (CodeSearchNet fine-tuning)' if train else 'Basic distillation only'}")
1198
+ logger.info(f"☁️ Execution: {'Beam' if use_beam else 'Local'}")
1199
+
1200
+ # Use default models if none specified
1201
+ models_to_distill = teacher_models if teacher_models else DEFAULT_TEACHER_MODELS
1202
+
1203
+ logger.info(f"📊 Teacher models to process: {len(models_to_distill)}")
1204
+ for i, model in enumerate(models_to_distill, 1):
1205
+ logger.info(f" {i}. {model}")
1206
+
1207
+ # Clear cache for problematic models if requested
1208
+ if clear_cache:
1209
+ logger.info("🧹 Clearing cache for known problematic models...")
1210
+ problematic_models = ["BAAI/bge-code-v1", "jinaai/jina-embeddings-v3", "Salesforce/SFR-Embedding-Code-2B_R"]
1211
+ for model in problematic_models:
1212
+ if model in models_to_distill:
1213
+ clear_model_cache(model)
1214
+
1215
+ # Run distillation workflow
1216
+ if use_beam:
1217
+ results = run_beam_distillation(
1218
+ teacher_models=models_to_distill,
1219
+ enable_training=train,
1220
+ pca_dims=pca_dims,
1221
+ clear_cache=clear_cache,
1222
+ )
1223
+ else:
1224
+ results = run_local_distillation(
1225
+ teacher_models=models_to_distill,
1226
+ enable_training=train,
1227
+ pca_dims=pca_dims,
1228
+ clear_cache=clear_cache,
1229
+ )
1230
+
1231
+ # Handle case where results might be None or invalid
1232
+ if not results or not isinstance(results, dict):
1233
+ logger.error("❌ Distillation workflow failed - no valid results returned")
1234
+ results = {
1235
+ "total_successful": 0,
1236
+ "total_attempted": len(models_to_distill),
1237
+ "error": "Workflow failed",
1238
+ }
1239
+
1240
+ # Final summary
1241
+ successful_count = results.get("total_successful", 0)
1242
+ total_attempted = results.get("total_attempted", 0)
1243
+
1244
+ logger.info("\n🎉 UNIFIED DISTILLATION WORKFLOW COMPLETED!")
1245
+ logger.info(f"📊 Successfully processed: {successful_count}/{total_attempted} models")
1246
+ logger.info(f"📁 Base models saved to: {LOCAL_BASE_DIR}")
1247
+ logger.info(f"📁 Final models saved to: {LOCAL_FINAL_DIR}")
1248
+
1249
+ if train:
1250
+ logger.info("🎓 Advanced training was enabled - models include CodeSearchNet specialization")
1251
+ else:
1252
+ logger.info("📖 Basic distillation only - use --train flag to enable advanced training")
1253
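A hedged sketch of invoking this Typer command from a shell, written as comments because the console-script wiring is outside this hunk; the module path is an assumption and the flag names are the ones Typer derives from the parameters above:

# Hypothetical CLI invocations (module path assumed to be distiller.distill):
#   python -m distiller.distill                      # local, basic distillation of the default teachers
#   python -m distiller.distill --use-beam --train   # run on Beam with CodeSearchNet fine-tuning
#   python -m distiller.distill --pca-dims 256 --teacher-models sentence-transformers/all-MiniLM-L6-v2
#   python -m distiller.distill --clear-cache        # purge caches of known-problematic models first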
 
 
 
1254
 
1255
+ def check_model_compatibility(teacher_model: str) -> tuple[bool, str | None]:
1256
+ """
1257
+ Check if a model has known compatibility issues with Model2Vec.
1258
+
1259
+ Returns:
1260
+ Tuple of (is_compatible, warning_message)
1261
+ """
1262
+ known_incompatible = {
1263
+ "BAAI/bge-code-v1": "Qwen2Tokenizer lacks backend_tokenizer attribute",
1264
+ "jinaai/jina-embeddings-v3": "Missing custom transformers module dependencies",
1265
+ "Salesforce/SFR-Embedding-Code-2B_R": "Device placement issues with meta tensors",
1266
  }
1267
 
1268
+ if teacher_model in known_incompatible:
1269
+ return False, known_incompatible[teacher_model]
 
 
 
 
 
1270
 
1271
+ # Check for model families that might have issues
1272
+ if "qwen2" in teacher_model.lower() and "bge" in teacher_model.lower():
1273
+ return False, "BGE models with Qwen2 tokenizers may have compatibility issues"
1274
+
1275
+ if "jina" in teacher_model.lower() and "embeddings-v3" in teacher_model.lower():
1276
+ return False, "Jina embeddings v3 models may have missing dependencies"
1277
+
1278
+ if "salesforce" in teacher_model.lower() and "sfr-embedding" in teacher_model.lower():
1279
+ return False, "Salesforce SFR embedding models may have device placement issues"
 
 
 
1280
 
1281
+ return True, None
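A minimal sketch of how a caller might gate distillation on this check; the real call site (distill_single_teacher) is not shown in this hunk, so the wiring below is illustrative only:

# Illustrative only -- assumes a teacher_model string is in scope.
compatible, warning = check_model_compatibility(teacher_model)
if not compatible:
    logger.warning(f"⚠️ {teacher_model}: {warning} - trying a specialized workaround instead")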
 
 
 
 
1282
 
1283
+
1284
+ def clear_model_cache(model_name: str) -> bool:
1285
+ """Clear HuggingFace cache for a specific model."""
1286
  try:
1287
+ import shutil
1288
+ from pathlib import Path
1289
+
1290
+ # Get HuggingFace cache directory
1291
+ cache_dir = Path.home() / ".cache" / "huggingface"
1292
+
1293
+ # Find model-specific cache directories
1294
+ model_slug = model_name.replace("/", "--")
1295
+
1296
+ # Clear transformers cache
1297
+ transformers_cache = cache_dir / "transformers" / model_slug
1298
+ if transformers_cache.exists():
1299
+ shutil.rmtree(transformers_cache)
1300
+ logger.info(f"🗑️ Cleared transformers cache for {model_name}")
1301
+
1302
+ # Clear hub cache
1303
+ hub_cache = cache_dir / "hub" / f"models--{model_slug}"
1304
+ if hub_cache.exists():
1305
+ shutil.rmtree(hub_cache)
1306
+ logger.info(f"🗑️ Cleared hub cache for {model_name}")
1307
+
1308
+ # Clear modules cache
1309
+ modules_cache = cache_dir / "modules" / "transformers_modules" / model_name.split("/")[0]
1310
+ if modules_cache.exists():
1311
+ shutil.rmtree(modules_cache)
1312
+ logger.info(f"🗑️ Cleared modules cache for {model_name}")
1313
+
1314
+ return True
1315
+
1316
  except Exception as e:
1317
+ logger.warning(f"Failed to clear cache for {model_name}: {e}")
1318
+ return False
1319
+
1320
+
1321
+ def try_model_workarounds(teacher_model: str) -> str | None:
1322
+ """
1323
+ Try specific workarounds for problematic models.
1324
+
1325
+ Returns:
1326
+ The type of workaround needed ("salesforce", "baai", etc.) or None if no workaround available
1327
+ """
1328
+ if "salesforce" in teacher_model.lower() and "sfr-embedding" in teacher_model.lower():
1329
+ logger.info("🔧 Salesforce SFR model detected - will use specialized distillation")
1330
+ return "salesforce"
1331
+
1332
+ if "baai" in teacher_model.lower() and ("bge-code" in teacher_model.lower() or "bge-m3" in teacher_model.lower()):
1333
+ logger.info("🔧 BAAI BGE model detected - will use specialized distillation")
1334
+ return "baai"
1335
 
1336
+ return None
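For context, a hedged sketch of dispatching on the returned workaround type to the specialized paths defined below; the actual dispatch lives in distill_single_teacher, which is not part of this hunk, so the names in scope are assumed:

# Illustrative dispatch -- assumes teacher_model, output_dir and pca_dims are in scope.
from model2vec.distill import distill  # standard Model2Vec path

workaround = try_model_workarounds(teacher_model)
if workaround == "salesforce":
    model = salesforce_model_distillation(teacher_model, output_dir, pca_dims)
elif workaround == "baai":
    model = baai_bge_model_distillation(teacher_model, output_dir, pca_dims)
else:
    model = distill(model_name=teacher_model, pca_dims=pca_dims, trust_remote_code=True)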
1337
 
1338
 
1339
+ def salesforce_model_distillation(
1340
+ teacher_model: str,
1341
+ output_dir: str,
1342
+ pca_dims: int | None = None,
 
 
1343
  ) -> Any:
1344
+ """Special distillation function for Salesforce SFR models that handles device placement issues."""
1345
+ if pca_dims is None:
1346
+ pca_dims = int(distillation_config.optimal_pca_dims)
1347
+
1348
  output_path = Path(output_dir)
1349
  output_path.mkdir(parents=True, exist_ok=True)
1350
 
1351
+ logger.info(f"🔄 Salesforce-specific distillation: {teacher_model} → {output_dir}")
1352
+ logger.info(f"📊 PCA dims: {pca_dims}, SIF: {distillation_config.sif_coefficient}")
 
 
 
1353
 
1354
  start_time = time.time()
1355
 
1356
+ try:
1357
+ import torch
1358
+ from sentence_transformers import SentenceTransformer
1359
+ from transformers import AutoModel, AutoTokenizer
1360
 
1361
+ # Enhanced custom model loading for Salesforce models
1362
+ logger.info("🔧 Loading model with enhanced device settings...")
 
 
 
 
 
 
 
 
1363
 
1364
+ # Method 1: Try with to_empty() for meta tensor handling
1365
+ try:
1366
+ logger.info("🔄 Attempting with to_empty() method...")
 
 
 
 
1367
 
1368
+ # Load tokenizer first
1369
+ tokenizer = AutoTokenizer.from_pretrained(teacher_model, trust_remote_code=True)
 
 
 
 
 
 
 
 
1370
 
1371
+ # Load model with meta device initially
1372
+ model = AutoModel.from_pretrained(
1373
+ teacher_model,
 
 
 
1374
  trust_remote_code=True,
1375
+ torch_dtype=torch.float16,
1376
+ device_map="meta", # Load on meta device first
1377
  )
 
1378
 
1379
+ # Move from meta to actual device using to_empty()
1380
+ if torch.cuda.is_available():
1381
+ device = torch.device("cuda")
1382
+ # Create empty tensors on target device and copy weights
1383
+ model = model.to_empty(device=device)
1384
+ else:
1385
+ device = torch.device("cpu")
1386
+ model = model.to_empty(device=device)
1387
 
1388
+ # Ensure model is in the right dtype
1389
+ model = model.to(torch.float16 if torch.cuda.is_available() else torch.float32)
1390
 
1391
+ logger.info("✅ Successfully loaded with to_empty() method")
 
 
1392
 
1393
+ except Exception as e:
1394
+ logger.warning(f"to_empty() method failed: {e}")
1395
 
1396
+ # Method 2: Try SentenceTransformer with specific settings
1397
+ logger.info("🔄 Falling back to SentenceTransformer method...")
1398
+ sentence_model = SentenceTransformer(
1399
+ teacher_model,
 
 
1400
  trust_remote_code=True,
1401
+ device="cpu", # Force CPU loading first
1402
  )
 
1403
 
1404
+ # Move to GPU if available
1405
+ if torch.cuda.is_available():
1406
+ sentence_model = sentence_model.to("cuda")
 
1407
 
1408
+ # Extract components
1409
+ model = sentence_model[0].auto_model
1410
+ tokenizer = sentence_model.tokenizer
1411
 
1412
+ logger.info("✅ Successfully loaded with SentenceTransformer method")
 
1413
 
1414
+ # Now use Model2Vec's distill_from_model function directly
1415
+ from model2vec.distill.distillation import distill_from_model
 
1416
 
1417
+ distilled_model = distill_from_model(
1418
+ model=model,
1419
+ tokenizer=tokenizer,
1420
+ pca_dims=int(pca_dims),
1421
+ apply_zipf=bool(distillation_config.apply_zipf),
1422
+ sif_coefficient=float(distillation_config.sif_coefficient),
1423
+ )
1424
 
1425
+ logger.info("✅ Core distillation completed successfully")
 
 
1426
 
1427
+ # Save the model
1428
+ distilled_model.save_pretrained(str(output_path))
1429
+ logger.info(f"💾 Model saved to {output_path}")
 
 
 
1430
 
1431
+ # Log model info
1432
+ logger.info(f"Model type: {type(distilled_model)}")
1433
+ if hasattr(distilled_model, "embedding"):
1434
+ logger.info(f"Embedding shape: {distilled_model.embedding.shape}")
1435
+ logger.info(f"Embedding dtype: {distilled_model.embedding.dtype}")
1436
 
1437
+ total_time = time.time() - start_time
1438
+ logger.info(f"🎉 Salesforce distillation completed in {total_time:.2f} seconds")
 
 
1439
 
1440
+ # Clean up
1441
+ if "sentence_model" in locals():
1442
+ del sentence_model
1443
+ del model
1444
+ if torch.cuda.is_available():
1445
+ torch.cuda.empty_cache()
 
 
 
1446
 
1447
+ return distilled_model
 
 
 
1448
 
1449
+ except Exception:
1450
+ logger.exception(f"❌ Salesforce-specific distillation failed for {teacher_model}")
1451
+ return None
 
 
 
 
 
 
1452
 
 
 
1453
 
1454
+ def baai_bge_model_distillation(
1455
+ teacher_model: str,
1456
+ output_dir: str,
1457
+ pca_dims: int | None = None,
1458
+ ) -> Any:
1459
+ """Special distillation function for BAAI BGE models that handles Qwen2Tokenizer compatibility issues."""
1460
+ if pca_dims is None:
1461
+ pca_dims = int(distillation_config.optimal_pca_dims)
1462
 
1463
+ output_path = Path(output_dir)
1464
+ output_path.mkdir(parents=True, exist_ok=True)
 
 
1465
 
1466
+ logger.info(f"🔄 BAAI BGE-specific distillation: {teacher_model} → {output_dir}")
1467
+ logger.info(f"📊 PCA dims: {pca_dims}, SIF: {distillation_config.sif_coefficient}")
 
 
 
 
 
 
 
1468
 
1469
+ start_time = time.time()
 
 
1470
 
1471
+ try:
1472
+ import torch
1473
+ from sentence_transformers import SentenceTransformer
1474
+ from transformers import AutoModel, AutoTokenizer
1475
 
1476
+ logger.info("🔧 Loading BAAI model with tokenizer workaround...")
 
1477
 
1478
+ # Try multiple approaches for BAAI models
1479
+ success = False
 
 
 
1480
 
1481
+ # Method 1: Try SentenceTransformer first (often handles tokenizer issues better)
1482
+ try:
1483
+ logger.info("🔄 Attempting with SentenceTransformer wrapper...")
1484
+ sentence_model = SentenceTransformer(teacher_model, trust_remote_code=True)
 
 
 
 
 
1485
 
1486
+ # Extract components
1487
+ model = sentence_model[0].auto_model
1488
+ tokenizer = sentence_model.tokenizer
1489
 
1490
+ # Test if tokenizer works by encoding a simple text
1491
+ test_encoding = tokenizer.encode("test", return_tensors="pt")
1492
+ logger.info("✅ SentenceTransformer method successful")
1493
+ success = True
1494
 
1495
+ except Exception as e:
1496
+ logger.warning(f"SentenceTransformer method failed: {e}")
1497
 
1498
+ # Method 2: Try direct loading with tokenizer replacement
1499
+ try:
1500
+ logger.info("🔄 Attempting with tokenizer replacement...")
1501
+ from transformers import BertTokenizerFast
1502
 
1503
+ # Load model directly
1504
+ model = AutoModel.from_pretrained(teacher_model, trust_remote_code=True)
 
1505
 
1506
+ # Try to use a compatible tokenizer instead
1507
+ try:
1508
+ # First try the original tokenizer
1509
+ tokenizer = AutoTokenizer.from_pretrained(teacher_model, trust_remote_code=True)
1510
+ except Exception:
1511
+ # Fall back to BERT tokenizer for BGE models
1512
+ logger.info("🔄 Falling back to BERT tokenizer...")
1513
+ tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
1514
+
1515
+ logger.info("✅ Tokenizer replacement method successful")
1516
+ success = True
1517
+
1518
+ except Exception as e2:
1519
+ logger.warning(f"Tokenizer replacement method failed: {e2}")
1520
+
1521
+ if not success:
1522
+ logger.error("❌ All BAAI model loading methods failed")
1523
+ return None
1524
+
1525
+ # Now use Model2Vec's distill_from_model function directly
1526
+ from model2vec.distill.distillation import distill_from_model
1527
+
1528
+ distilled_model = distill_from_model(
1529
+ model=model,
1530
+ tokenizer=tokenizer,
1531
+ pca_dims=int(pca_dims),
1532
+ apply_zipf=bool(distillation_config.apply_zipf),
1533
+ sif_coefficient=float(distillation_config.sif_coefficient),
1534
+ )
1535
 
1536
+ logger.info("✅ Core distillation completed successfully")
1537
 
1538
+ # Save the model
1539
+ distilled_model.save_pretrained(str(output_path))
1540
+ logger.info(f"💾 Model saved to {output_path}")
1541
 
1542
+ # Log model info
1543
+ logger.info(f"Model type: {type(distilled_model)}")
1544
+ if hasattr(distilled_model, "embedding"):
1545
+ logger.info(f"Embedding shape: {distilled_model.embedding.shape}")
1546
+ logger.info(f"Embedding dtype: {distilled_model.embedding.dtype}")
1547
 
1548
+ total_time = time.time() - start_time
1549
+ logger.info(f"🎉 BAAI BGE distillation completed in {total_time:.2f} seconds")
1550
 
1551
+ # Clean up
1552
+ if "sentence_model" in locals():
1553
+ del sentence_model
1554
+ del model
1555
+ if torch.cuda.is_available():
1556
+ torch.cuda.empty_cache()
1557
 
1558
+ return distilled_model
 
 
 
1559
 
1560
+ except Exception:
1561
+ logger.exception(f"❌ BAAI BGE-specific distillation failed for {teacher_model}")
1562
+ return None
 
 
 
1563
 
1564
 
1565
  if __name__ == "__main__":
1566
+ typer.run(main)
src/distiller/distill_simplified.py DELETED
@@ -1,413 +0,0 @@
1
- """
2
- Simplified Code-Specialized Model2Vec Distillation Script.
3
-
4
- This script implements a focused, simplified approach for creating code-specialized embeddings
5
- using only the core Model2Vec distillation without additional fine-tuning that may degrade quality.
6
-
7
- Can run locally or on Beam with the --use-beam flag.
8
- """
9
-
10
- import argparse
11
- import json
12
- import logging
13
- import sys
14
- import time
15
- from pathlib import Path
16
- from typing import Any
17
-
18
- from beam import GpuType, Image, Volume, function
19
- from model2vec.distill import distill
20
-
21
- # =============================================================================
22
- # SIMPLIFIED CONFIGURATION
23
- # =============================================================================
24
-
25
- # Use a code-specialized teacher model instead of general instruction model
26
- # Ordered by success likelihood and performance:
27
- CODE_TEACHER_MODELS = [
28
- "sentence-transformers/all-MiniLM-L6-v2",
29
- "sentence-transformers/all-mpnet-base-v2",
30
- "microsoft/codebert-base",
31
- "microsoft/graphcodebert-base",
32
- "sentence-transformers/paraphrase-MiniLM-L6-v2",
33
- "Alibaba-NLP/gte-Qwen2-7B-instruct",
34
- ]
35
-
36
- OUTPUT_BASE_DIR = "code_model2vec"
37
-
38
- # Optimal Model2Vec parameters based on successful models
39
- OPTIMAL_PCA_DIMS = 256 # Match other successful Model2Vec models
40
- SIF_COEFFICIENT = 1e-3 # Slightly higher than default for code specialization
41
- APPLY_ZIPF = True # Enable Zipf weighting for better word importance
42
-
43
- # =============================================================================
44
- # BEAM CONFIGURATION
45
- # =============================================================================
46
-
47
- GPU_NAME = GpuType.A100_40
48
- VOLUME_NAME = "code_model2vec"
49
- VOLUME_PATH = "./code_model2vec"
50
- IMAGE = Image(python_version="python3.12").add_python_packages(
51
- [
52
- "torch>=2.7.0", # Install torch first
53
- "transformers>=4.40.0", # Latest transformers with flash attention support
54
- "lightning>=2.5.1.post0",
55
- "model2vec[train]>=0.5.0",
56
- "numpy>=1.26.4",
57
- "scikit-learn>=1.6.1",
58
- "sentence-transformers>=4.1.0",
59
- "datasets>=3.2.0", # For evaluation
60
- "pandas>=2.0.0",
61
- "tqdm>=4.65.0",
62
- ]
63
- )
64
-
65
- # =============================================================================
66
-
67
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
68
- logger = logging.getLogger(__name__)
69
-
70
- # Add beam utilities for proper model persistence
71
- try:
72
- from .beam_utils import (
73
- create_beam_utilities,
74
- )
75
-
76
- BEAM_UTILS_AVAILABLE = True
77
- except ImportError:
78
- print("Beam utilities not available - models will only be saved locally")
79
- BEAM_UTILS_AVAILABLE = False
80
-
81
-
82
- def apply_local_patches() -> bool:
83
- """Apply patches locally without requiring Beam utilities."""
84
- try:
85
- # Try using patch_utils if available
86
- try:
87
- from .patch_utils import apply_all_patches
88
-
89
- patches_applied = apply_all_patches()
90
- logger.info(f"Successfully applied {patches_applied} patches via patch_utils")
91
- return True
92
- except ImportError:
93
- logger.warning("patch_utils not available, trying direct patching")
94
-
95
- return False
96
-
97
- except Exception as e:
98
- logger.warning(f"Failed to apply patches: {e}")
99
- return False
100
-
101
-
102
- def simplified_code_distillation(
103
- teacher_model: str,
104
- output_dir: str,
105
- pca_dims: int = OPTIMAL_PCA_DIMS,
106
- ) -> Any:
107
- """
108
- Simplified code-specialized distillation using only core Model2Vec.
109
-
110
- This approach:
111
- 1. Uses a teacher model that already performs well on code tasks
112
- 2. Applies optimal Model2Vec parameters
113
- 3. Avoids additional training that may degrade quality
114
- """
115
- output_path = Path(output_dir)
116
- output_path.mkdir(parents=True, exist_ok=True)
117
-
118
- logger.info(f"Starting simplified distillation from {teacher_model}")
119
- logger.info(f"Target dimensions: {pca_dims}")
120
- logger.info(f"SIF coefficient: {SIF_COEFFICIENT}")
121
- logger.info(f"Zipf weighting: {APPLY_ZIPF}")
122
-
123
- start_time = time.time()
124
-
125
- try:
126
- # Perform distillation with optimal parameters
127
- model = distill(
128
- model_name=teacher_model,
129
- pca_dims=pca_dims,
130
- apply_zipf=APPLY_ZIPF,
131
- sif_coefficient=SIF_COEFFICIENT,
132
- trust_remote_code=True,
133
- )
134
-
135
- logger.info("✅ Core distillation completed successfully")
136
-
137
- # Save the model
138
- model.save_pretrained(str(output_path))
139
- logger.info(f"💾 Model saved to {output_path}")
140
-
141
- # Log model info
142
- logger.info(f"Model type: {type(model)}")
143
- if hasattr(model, "embedding"):
144
- logger.info(f"Embedding shape: {model.embedding.shape}")
145
- logger.info(f"Embedding dtype: {model.embedding.dtype}")
146
-
147
- total_time = time.time() - start_time
148
- logger.info(f"🎉 Simplified distillation completed in {total_time:.2f} seconds")
149
- return model
150
-
151
- except ValueError as e:
152
- if "Number of tokens" in str(e) and "does not match number of vectors" in str(e):
153
- logger.warning(f"⚠️ Token-vector mismatch with {teacher_model} - this is a Model2Vec library issue")
154
- logger.warning(f"Error details: {e}")
155
- logger.warning("💡 This model has incompatible tokenization. Skipping...")
156
- return None
157
- raise
158
- except Exception:
159
- logger.exception("❌ Distillation failed")
160
- return None
161
-
162
-
163
- def core_distill_all_teachers(use_beam_utilities: bool = False) -> dict[str, Any]:
164
- """
165
- Core logic for distilling all teacher models.
166
-
167
- Args:
168
- use_beam_utilities: Whether to use Beam utilities for persistence
169
-
170
- Returns:
171
- Dictionary with distillation results
172
- """
173
- # Apply patches
174
- logger.info("Applying all patches...")
175
- patch_success = apply_local_patches()
176
- if patch_success:
177
- logger.info("Successfully applied patches")
178
- else:
179
- logger.warning("Failed to apply patches - Microsoft models may fail")
180
-
181
- # Initialize Beam utilities if requested and available
182
- volume_mgr = None
183
- model_mgr = None
184
- if use_beam_utilities and BEAM_UTILS_AVAILABLE:
185
- try:
186
- volume_mgr, _, model_mgr, _ = create_beam_utilities(VOLUME_NAME, VOLUME_PATH)
187
- logger.info("✅ Beam utilities initialized for model persistence")
188
- except Exception as e:
189
- logger.warning(f"Failed to initialize Beam utilities: {e}")
190
- model_mgr = None
191
-
192
- results = {}
193
- successful_models = []
194
-
195
- logger.info("🚀 Starting comprehensive teacher model distillation")
196
- logger.info(f"📊 Processing {len(CODE_TEACHER_MODELS)} teacher models")
197
-
198
- # Determine output base path
199
- base_output_path = VOLUME_PATH if use_beam_utilities else OUTPUT_BASE_DIR
200
-
201
- for teacher_model in CODE_TEACHER_MODELS:
202
- try:
203
- # Create output directory name based on teacher model
204
- teacher_name = teacher_model.split("/")[-1].replace("-", "_")
205
- output_dir = f"{base_output_path}/final/code_model2vec_{teacher_name}"
206
-
207
- logger.info(f"\n{'=' * 60}")
208
- logger.info(f"🔄 Processing teacher model: {teacher_model}")
209
- logger.info(f"📁 Output directory: {output_dir}")
210
- logger.info(f"{'=' * 60}")
211
-
212
- # Check if model already exists
213
- output_path = Path(output_dir)
214
- if output_path.exists():
215
- # Check for essential model files
216
- has_config = (output_path / "config.json").exists()
217
- has_model_file = any(
218
- [
219
- (output_path / "model.safetensors").exists(),
220
- (output_path / "model.bin").exists(),
221
- (output_path / "pytorch_model.bin").exists(),
222
- ]
223
- )
224
-
225
- if has_config and has_model_file:
226
- logger.info(f"✅ Model {teacher_name} already exists - skipping distillation")
227
-
228
- # Still record it as successful
229
- model_info = {
230
- "teacher_model": teacher_model,
231
- "output_dir": output_dir,
232
- "teacher_name": teacher_name,
233
- "distillation_time": 0.0,
234
- "status": "skipped_existing",
235
- }
236
-
237
- results[teacher_name] = model_info
238
- successful_models.append(teacher_name)
239
- logger.info(f"📁 Using existing model at: {output_dir}")
240
- continue
241
-
242
- # Perform distillation
243
- start_time = time.time()
244
- model = simplified_code_distillation(
245
- teacher_model=teacher_model,
246
- output_dir=output_dir,
247
- )
248
- distill_time = time.time() - start_time
249
-
250
- if model is not None:
251
- logger.info(f"✅ Distillation successful for {teacher_model}")
252
-
253
- # Save to Beam volume for persistence if available
254
- if model_mgr:
255
- try:
256
- # Save model to beam volume with teacher-specific name
257
- beam_model_name = f"{teacher_name}_model"
258
- model_mgr.save_model(beam_model_name, output_dir)
259
- logger.info(f"💾 Saved {teacher_name} to Beam volume as {beam_model_name}")
260
- except Exception as e:
261
- logger.warning(f"Failed to save {teacher_name} to Beam volume: {e}")
262
-
263
- # Store results
264
- model_info = {
265
- "teacher_model": teacher_model,
266
- "output_dir": output_dir,
267
- "teacher_name": teacher_name,
268
- "distillation_time": distill_time,
269
- "status": "success",
270
- }
271
-
272
- results[teacher_name] = model_info
273
- successful_models.append(teacher_name)
274
-
275
- logger.info(f"💾 Model saved to: {output_dir}")
276
-
277
- except Exception as e:
278
- logger.exception(f"❌ Failed with {teacher_model}")
279
- results[teacher_model.split("/")[-1]] = {
280
- "teacher_model": teacher_model,
281
- "status": "failed",
282
- "error": str(e),
283
- }
284
- continue
285
-
286
- # Summary
287
- if successful_models:
288
- logger.info("\n🏆 DISTILLATION COMPLETE!")
289
- logger.info(f"📊 Successful models: {len(successful_models)}")
290
-
291
- for model_name in successful_models:
292
- model_info = results[model_name]
293
- logger.info(f"✅ {model_name}: {model_info['teacher_model']}")
294
-
295
- # Save comprehensive results
296
- results_summary = {
297
- "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
298
- "successful_models": successful_models,
299
- "all_results": results,
300
- "total_successful": len(successful_models),
301
- "total_attempted": len(CODE_TEACHER_MODELS),
302
- }
303
-
304
- # Save results to file
305
- results_file = Path(f"{base_output_path}/distillation_results.json")
306
- results_file.parent.mkdir(parents=True, exist_ok=True)
307
- with results_file.open("w") as f:
308
- json.dump(results_summary, f, indent=2)
309
-
310
- logger.info(f"📊 Results summary saved to: {results_file}")
311
-
312
- return results_summary
313
-
314
- logger.error("❌ No models succeeded")
315
- msg = "All teacher models failed distillation"
316
- raise RuntimeError(msg)
317
-
318
-
319
- def run_local_distillation() -> dict[str, Any]:
320
- """Run distillation locally without Beam."""
321
- logger.info("🖥️ Running simplified distillation locally")
322
- return core_distill_all_teachers(use_beam_utilities=False)
323
-
324
-
325
- @function(
326
- gpu=GPU_NAME,
327
- volumes=[Volume(name=VOLUME_NAME, mount_path=VOLUME_PATH)],
328
- image=IMAGE,
329
- secrets=["HF_ACCESS_TOKEN"],
330
- env={
331
- "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True,max_split_size_mb:512",
332
- "TOKENIZERS_PARALLELISM": "false",
333
- "CUDA_LAUNCH_BLOCKING": "0", # Allow async CUDA operations
334
- "TORCH_CUDNN_V8_API_ENABLED": "1", # Enable optimized cuDNN
335
- },
336
- timeout=3600 * 12, # 12 hours
337
- )
338
- def beam_distill_all_teachers() -> dict[str, Any]:
339
- """
340
- Beam version: Try all teacher models and create distilled models from each.
341
-
342
- Returns information about all models that were successfully created.
343
- """
344
- logger.info("☁️ Running simplified distillation on Beam")
345
- return core_distill_all_teachers(use_beam_utilities=True)
346
-
347
-
348
- def main() -> None:
349
- """Main function with argument parsing."""
350
- global OUTPUT_BASE_DIR # Declare global at the top # noqa: PLW0603
351
-
352
- parser = argparse.ArgumentParser(
353
- description="Simplified Code-Specialized Model2Vec Distillation",
354
- formatter_class=argparse.RawDescriptionHelpFormatter,
355
- epilog="""
356
- Examples:
357
- python -m src.distiller.distill_simplified # Run locally
358
- python -m src.distiller.distill_simplified --use-beam # Run on Beam
359
- distiller distill-simple # CLI shortcut (runs on Beam)
360
- """,
361
- )
362
-
363
- parser.add_argument(
364
- "--use-beam",
365
- action="store_true",
366
- help="Run on Beam instead of locally",
367
- )
368
-
369
- parser.add_argument(
370
- "--output-dir",
371
- type=str,
372
- default=OUTPUT_BASE_DIR,
373
- help=f"Output directory for models (default: {OUTPUT_BASE_DIR})",
374
- )
375
-
376
- args = parser.parse_args()
377
-
378
- # Update output directory if specified
379
- if args.output_dir != OUTPUT_BASE_DIR:
380
- OUTPUT_BASE_DIR = args.output_dir
381
-
382
- try:
383
- if args.use_beam:
384
- logger.info("🚀 Starting Beam execution...")
385
- results = beam_distill_all_teachers()
386
- else:
387
- logger.info("🖥️ Starting local execution...")
388
- results = run_local_distillation()
389
-
390
- # Print final summary
391
- print("\n🎉 Distillation complete!")
392
- print(f"📊 Successfully created {results['total_successful']} models")
393
-
394
- if args.use_beam:
395
- print(f"📁 Models location: {VOLUME_PATH}/final/")
396
- else:
397
- print(f"📁 Models location: {OUTPUT_BASE_DIR}/final/")
398
-
399
- print("\n✅ Created models:")
400
- for model_name in results["successful_models"]:
401
- model_info = results["all_results"][model_name]
402
- print(f" • {model_name} (from {model_info['teacher_model']})")
403
-
404
- except KeyboardInterrupt:
405
- logger.info("🛑 Distillation interrupted by user")
406
- sys.exit(1)
407
- except Exception:
408
- logger.exception("❌ Distillation failed with error")
409
- sys.exit(1)
410
-
411
-
412
- if __name__ == "__main__":
413
- main()
 
 
 
 
 
src/distiller/evaluate.py CHANGED
@@ -1,130 +1,405 @@
1
  """
2
- CodeSearchNet Evaluation Script for Code-Specialized Embedding Models.
3
 
4
- This script evaluates embedding models on code search tasks using the CodeSearchNet
5
- dataset and methodology. It implements the same evaluation approach as the original
6
- CodeSearchNet challenge, including NDCG and other information retrieval metrics.
 
 
 
 
 
 
 
 
7
 
8
  Usage:
9
- distiller evaluate # Run evaluation on all default models with Beam
10
  """
11
 
12
  import json
13
  import logging
14
  import time
 
15
  from pathlib import Path
16
  from typing import Any
17
 
18
  import numpy as np
19
  import pandas as pd
20
- from beam import GpuType, Image, Volume, function
 
 
 
21
  from datasets import Dataset, load_dataset
22
  from sentence_transformers import SentenceTransformer
23
  from sklearn.metrics.pairwise import cosine_similarity
24
  from tqdm import tqdm
25
 
26
- from .beam_utils import (
27
- BeamCheckpointManager,
28
- BeamEvaluationManager,
29
- create_beam_utilities,
 
 
 
 
 
 
 
30
  )
31
 
32
- # Configure logging
33
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
34
  logger = logging.getLogger(__name__)
35
 
36
  # =============================================================================
37
- # BEAM CONFIGURATION
38
  # =============================================================================
39
 
40
- GPU_NAME = GpuType.A100_40
41
- VOLUME_NAME = "code_model2vec" # Same volume as distill_simplified.py
42
- VOLUME_PATH = "./code_model2vec" # Same mount path as distill_simplified.py
43
- EVALUATION_RESULTS_DIR = "evaluation_results" # Subdirectory within volume
44
- EVALUATION_CACHE_DIR = "evaluation_cache" # Cache for datasets and models
45
-
46
- IMAGE = Image(python_version="python3.12").add_python_packages(
47
- [
48
- "torch>=2.7.0",
49
- "transformers>=4.40.0",
50
- "datasets>=3.2.0",
51
- "sentence-transformers>=4.1.0",
52
- "model2vec[train]>=0.5.0",
53
- "numpy>=1.26.4",
54
- "scikit-learn>=1.6.1",
55
- "pandas>=2.0.0",
56
- "tqdm>=4.65.0",
57
- ]
58
- )
59
 
60
  # =============================================================================
61
- # CONFIGURATION
62
  # =============================================================================
63
 
64
- CODESEARCHNET_EVAL_DATASET = "code_search_net"
65
- BATCH_SIZE = 32
66
- DEFAULT_OUTPUT_DIR = "code_evaluation_results" # Local fallback directory
67
- EVALUATION_LANGUAGES = ["python", "javascript", "java", "php", "ruby", "go"]
68
-
69
- # Default models to evaluate (can be overridden via command line)
70
- DEFAULT_EVALUATION_MODELS = [
71
- # Established Code Models
72
- "sentence-transformers/all-MiniLM-L6-v2",
73
- "microsoft/codebert-base",
74
- "microsoft/graphcodebert-base",
75
- "huggingface/CodeBERTa-small-v1",
76
- "sentence-transformers/all-mpnet-base-v2",
77
- "sentence-transformers/all-MiniLM-L12-v2",
78
- # Model2Vec & Efficiency Models (Direct Competitors)
79
- "minishlab/potion-base-8M",
80
- "minishlab/potion-retrieval-32M",
81
- # Small Transformer-Based Code Models
82
- "Salesforce/codet5-base",
83
- ]
 
 
 
 
 
 
84
 
85
- # =============================================================================
86
- # CHECKPOINT CONFIGURATION
87
- # =============================================================================
88
 
89
- # Prevent conflicts with distill.py checkpoints by using different prefixes
90
- EVAL_CHECKPOINT_PREFIX = "evaluation_checkpoints"
91
- DATASET_CHECKPOINT_PREFIX = "dataset_cache"
92
- MODEL_CACHE_PREFIX = "model_cache"
93
 
94
- # =============================================================================
95
- # CORE EVALUATION CLASSES
96
- # =============================================================================
 
 
 
 
 
 
97
 
98
 
99
  class CodeSearchNetEvaluator:
100
  """Evaluator for CodeSearchNet-style code search tasks."""
101
 
102
- def __init__(
103
- self,
104
- model_path: str,
105
- model_name: str | None = None,
106
- checkpoint_manager: BeamCheckpointManager | None = None,
107
- eval_manager: BeamEvaluationManager | None = None,
108
- ) -> None:
109
- """Initialize the evaluator with a model and optional Beam utilities."""
110
  self.model_path = model_path
111
  self.model_name = model_name or Path(model_path).name
112
  self.model: SentenceTransformer | None = None
113
- self.checkpoint_manager = checkpoint_manager
114
- self.eval_manager = eval_manager
115
  self._load_model()
116
 
117
  def _load_model(self) -> None:
118
- """Load the embedding model with caching support."""
119
  logger.info(f"Loading model from {self.model_path}")
120
-
121
- # Check if we have a cached evaluation result for this model
122
- if self.eval_manager:
123
- cached_result = self.eval_manager.load_evaluation_results(self.model_name)
124
- if cached_result:
125
- logger.info(f"✅ Found cached evaluation results for {self.model_name}")
126
- # Note: We still need to load the model for new evaluations
127
-
128
  try:
129
  self.model = SentenceTransformer(self.model_path, trust_remote_code=True)
130
  logger.info(f"Successfully loaded model: {self.model_name}")
@@ -139,7 +414,6 @@ class CodeSearchNetEvaluator:
139
  raise RuntimeError(msg)
140
 
141
  embeddings = []
142
-
143
  for i in tqdm(range(0, len(texts), BATCH_SIZE), desc=desc):
144
  batch = texts[i : i + BATCH_SIZE]
145
  batch_embeddings = self.model.encode(batch, convert_to_tensor=False, normalize_embeddings=True)
@@ -148,33 +422,25 @@ class CodeSearchNetEvaluator:
148
  return np.vstack(embeddings)
149
 
150
  def evaluate_language(self, language: str, max_queries: int = 1000) -> dict[str, Any]:
151
- """Evaluate on a specific programming language with checkpoint support."""
152
  logger.info(f"Evaluating on {language} language (max {max_queries} queries)")
153
 
154
- # Check for existing evaluation checkpoint
155
- if self.checkpoint_manager:
156
- cached_result = self.checkpoint_manager.load_checkpoint(f"{EVAL_CHECKPOINT_PREFIX}_{language}", 0)
157
- if cached_result and cached_result.get("data", {}).get("model_name") == self.model_name:
158
- logger.info(f"✅ Resuming from cached {language} evaluation")
159
- return cached_result.get("data", {})
160
-
161
  try:
162
  # Load test split for the language
163
  dataset = load_dataset(
164
- CODESEARCHNET_EVAL_DATASET,
165
  language,
166
  split="test",
167
  trust_remote_code=True,
168
  )
169
 
170
- # Ensure we have a Dataset object
171
  if not isinstance(dataset, Dataset):
172
  logger.error(f"Unexpected dataset type for {language}: {type(dataset)}")
173
  return {}
174
 
175
- # Sample queries for evaluation (to make it manageable)
176
  if len(dataset) > max_queries:
177
- rng = np.random.default_rng(42) # Use seeded generator for reproducibility
178
  indices = rng.choice(len(dataset), max_queries, replace=False)
179
  dataset = dataset.select(indices)
180
 
@@ -198,642 +464,680 @@ class CodeSearchNetEvaluator:
198
  logger.info(f"Found {len(queries)} valid query-code pairs for {language}")
199
 
200
  # Encode queries and codes
 
201
  query_embeddings = self.encode_texts(queries, f"Encoding {language} queries")
202
- code_embeddings = self.encode_texts(codes, f"Encoding {language} codes")
 
203
 
204
- # Compute similarities
205
  similarities = cosine_similarity(query_embeddings, code_embeddings)
206
-
207
- # Evaluate retrieval metrics
208
  metrics = self._compute_retrieval_metrics(similarities)
209
 
210
- result = {
 
211
  "language": language,
 
212
  "num_queries": len(queries),
 
213
  "metrics": metrics,
214
- "model_name": self.model_name,
215
  }
216
 
217
- # Save checkpoint
218
- if self.checkpoint_manager:
219
- checkpoint_data = {
220
- "data": result,
221
- "timestamp": time.time(),
222
- "config": {
223
- "language": language,
224
- "max_queries": max_queries,
225
- "model_name": self.model_name,
226
- },
227
- }
228
- self.checkpoint_manager.save_checkpoint(f"{EVAL_CHECKPOINT_PREFIX}_{language}", checkpoint_data, 0)
229
- logger.info(f"💾 Saved {language} evaluation checkpoint")
230
-
231
- return result
232
 
233
  except Exception:
234
- logger.exception(f"Error evaluating {language}")
235
  return {}
236
 
237
  def _compute_retrieval_metrics(self, similarities: np.ndarray) -> dict[str, float]:
238
- """Compute retrieval metrics like NDCG, MRR, etc."""
239
- num_queries = similarities.shape[0]
240
 
241
- # For each query, the correct code is at the same index (diagonal)
242
- ranks = []
 
 
 
243
  reciprocal_ranks = []
244
- ndcg_scores = []
245
-
246
- for i in range(num_queries):
247
- # Get similarity scores for query i
248
- scores = similarities[i]
249
-
250
- # Rank all codes by similarity to query i
251
- ranked_indices = np.argsort(scores)[::-1] # Descending order
252
-
253
- # Find rank of the correct code (index i)
254
- correct_rank = np.where(ranked_indices == i)[0][0] + 1 # 1-indexed
255
- ranks.append(correct_rank)
256
- reciprocal_ranks.append(1.0 / correct_rank)
257
-
258
- # Compute NDCG@10
259
- ndcg_scores.append(self._compute_ndcg(ranked_indices, i, k=10))
260
-
261
- return {
262
- "mrr": float(np.mean(reciprocal_ranks)),
263
- "ndcg@1": float(
264
- np.mean([self._compute_ndcg(np.argsort(similarities[i])[::-1], i, k=1) for i in range(num_queries)])
265
- ),
266
- "ndcg@5": float(
267
- np.mean([self._compute_ndcg(np.argsort(similarities[i])[::-1], i, k=5) for i in range(num_queries)])
268
- ),
269
- "ndcg@10": float(np.mean(ndcg_scores)),
270
- "recall@1": float(np.mean([1.0 if rank == 1 else 0.0 for rank in ranks])),
271
- "recall@5": float(np.mean([1.0 if rank <= 5 else 0.0 for rank in ranks])),
272
- "recall@10": float(np.mean([1.0 if rank <= 10 else 0.0 for rank in ranks])),
273
- "mean_rank": float(np.mean(ranks)),
274
- "median_rank": float(np.median(ranks)),
275
- }
276
 
277
  def _compute_ndcg(self, ranked_indices: np.ndarray, correct_idx: int, k: int) -> float:
278
  """Compute NDCG@k for a single query."""
279
- if k == 0:
280
- return 0.0
281
-
282
- # Find position of correct item in top-k
283
- top_k = ranked_indices[:k]
284
- if correct_idx in top_k:
285
- position = np.where(top_k == correct_idx)[0][0]
286
- return 1.0 / np.log2(position + 2) # +2 because log2(1) is 0
287
  return 0.0
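For intuition: each query has exactly one relevant item (its paired code snippet on the diagonal of the similarity matrix), so the retrieval metrics computed by this evaluator reduce to simple functions of that item's 1-indexed rank. A standalone sketch, not part of the diff:

import numpy as np

def single_query_metrics(rank: int, k: int = 10) -> dict[str, float]:
    """Contribution of one query whose only relevant item lands at `rank` (1-indexed)."""
    return {
        "reciprocal_rank": 1.0 / rank,
        f"ndcg@{k}": 1.0 / np.log2(rank + 1) if rank <= k else 0.0,
        f"recall@{k}": 1.0 if rank <= k else 0.0,
    }

print(single_query_metrics(3))  # reciprocal_rank ~= 0.333, ndcg@10 = 1/log2(4) = 0.5, recall@10 = 1.0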
288
 
289
  def evaluate_all_languages(
290
  self, max_queries_per_lang: int = 1000, languages: list[str] | None = None
291
  ) -> dict[str, Any]:
292
- """Evaluate on all supported programming languages with comprehensive result saving."""
293
- if languages is None:
294
- languages = EVALUATION_LANGUAGES
295
-
296
- logger.info(f"Starting evaluation on all languages for model: {self.model_name}")
297
 
298
- # Check for existing comprehensive evaluation results
299
- if self.eval_manager:
300
- cached_comprehensive = self.eval_manager.load_evaluation_results(self.model_name)
301
- if cached_comprehensive:
302
- logger.info(f"✅ Found comprehensive cached evaluation for {self.model_name}")
303
- return cached_comprehensive
304
 
305
  start_time = time.time()
306
-
307
- results: dict[str, Any] = {
308
  "model_name": self.model_name,
309
  "model_path": self.model_path,
310
- "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
311
  "languages": {},
312
  "overall": {},
 
313
  }
 
314
 
315
- all_metrics = []
 
 
 
 
316
 
317
- for language in languages:
318
- logger.info(f"Evaluating {language}...")
319
  lang_results = self.evaluate_language(language, max_queries_per_lang)
320
-
321
  if lang_results:
322
- results["languages"][language] = lang_results
323
- all_metrics.append(lang_results["metrics"])
324
- else:
325
- logger.warning(f"Skipping {language} due to evaluation error")
326
 
327
- # Compute overall metrics (average across languages)
328
- if all_metrics:
 
 
329
  overall_metrics = {}
330
- for metric_name in all_metrics[0]:
331
- values = [m[metric_name] for m in all_metrics if metric_name in m]
332
- overall_metrics[metric_name] = np.mean(values)
 
 
333
 
334
  results["overall"] = overall_metrics
335
 
336
  total_time = time.time() - start_time
337
  results["evaluation_time_seconds"] = total_time
338
 
339
- # Save comprehensive results to Beam volume
340
- if self.eval_manager:
341
- self.eval_manager.save_evaluation_results(self.model_name, results)
342
- logger.info("💾 Saved comprehensive evaluation results to Beam volume")
343
-
344
  logger.info(f"Evaluation completed in {total_time:.2f} seconds")
345
  return results
346
 
347
 
348
- def load_peer_models(peers_file: str) -> list[tuple[str, str]]:
349
- """Load peer models from CSV file."""
350
- try:
351
- df = pd.read_csv(peers_file)
352
- models = []
353
- for _, row in df.iterrows():
354
- model_name = row.get("model_name", row.get("Model", ""))
355
- model_path = row.get("model_path", row.get("Path", model_name))
356
- if model_name:
357
- models.append((model_name, model_path))
358
- logger.info(f"Loaded {len(models)} peer models from {peers_file}")
359
- return models
360
- except Exception:
361
- logger.exception("Error loading peer models from {peers_file}")
362
- return []
363
 
 
 
 
364
 
365
- def save_results(
366
- results: dict[str, Any],
367
- output_dir: str,
368
- model_name: str,
369
- eval_manager: BeamEvaluationManager | None = None,
370
- volume_results_dir: Path | None = None,
371
- ) -> None:
372
- """Save evaluation results to JSON file with Beam volume support."""
373
- # Save to Beam volume if available
374
- if volume_results_dir:
375
- volume_output_path = volume_results_dir / f"codesearchnet_eval_{model_name}.json"
376
  try:
377
- with volume_output_path.open("w") as f:
378
- json.dump(results, f, indent=2, default=str)
379
- logger.info(f"💾 Results saved to Beam volume: {volume_output_path}")
 
 
 
 
380
  except Exception as e:
381
- logger.warning(f"⚠️ Failed to save to Beam volume: {e}")
 
382
 
383
- # Also try eval_manager if available (for compatibility)
384
- if eval_manager:
385
- success = eval_manager.save_evaluation_results(model_name, results)
386
- if success:
387
- logger.info(f"💾 Results also saved via eval_manager for {model_name}")
388
- else:
389
- logger.warning(f"⚠️ Failed to save via eval_manager for {model_name}")
390
 
391
- # Always save local backup
392
- output_path = Path(output_dir)
393
- output_path.mkdir(parents=True, exist_ok=True)
394
 
395
- # Clean model name for filename
396
- safe_name = "".join(c for c in model_name if c.isalnum() or c in ("-", "_", "."))
397
- filename = f"codesearchnet_eval_{safe_name}.json"
398
- filepath = output_path / filename
 
399
 
400
- with Path(filepath).open("w") as f:
401
- json.dump(results, f, indent=2, default=str)
 
 
 
 
 
402
 
403
- logger.info(f"📄 Local backup saved to {filepath}")
404
 
 
 
 
405
 
406
- def print_results_summary(results: dict[str, Any]) -> None:
407
- """Print a summary of evaluation results."""
408
- model_name = results["model_name"]
409
- overall = results.get("overall", {})
410
 
411
- print(f"\n{'=' * 60}")
412
- print(f"CodeSearchNet Evaluation Results: {model_name}")
413
- print(f"{'=' * 60}")
 
 
 
 
414
 
 
 
 
415
  if overall:
416
- print("\nOverall Metrics (averaged across languages):")
417
- print(f" MRR: {overall.get('mrr', 0):.4f}")
418
- print(f" NDCG@1: {overall.get('ndcg@1', 0):.4f}")
419
- print(f" NDCG@5: {overall.get('ndcg@5', 0):.4f}")
420
- print(f" NDCG@10: {overall.get('ndcg@10', 0):.4f}")
421
- print(f" Recall@1: {overall.get('recall@1', 0):.4f}")
422
- print(f" Recall@5: {overall.get('recall@5', 0):.4f}")
423
- print(f" Recall@10: {overall.get('recall@10', 0):.4f}")
424
-
425
- print("\nPer-Language Results:")
426
- for lang, lang_results in results.get("languages", {}).items():
427
- metrics = lang_results.get("metrics", {})
428
- print(
429
- f" {lang:12s}: MRR={metrics.get('mrr', 0):.3f}, "
430
- f"NDCG@10={metrics.get('ndcg@10', 0):.3f}, "
431
- f"Recall@5={metrics.get('recall@5', 0):.3f}"
432
- )
433
-
434
-
435
- def create_comparison_report(all_results: list[dict[str, Any]], output_dir: str) -> None:
436
- """Create a comparison report across all evaluated models."""
 
 
 
 
 
 
 
 
 
437
  if not all_results:
438
  return
439
 
 
 
 
440
  output_path = Path(output_dir)
 
441
 
442
- # Create comparison DataFrame
443
- comparison_data = []
444
- for results in all_results:
445
- overall = results.get("overall", {})
446
- row = {
447
- "Model": results["model_name"],
448
- "MRR": overall.get("mrr", 0),
449
- "NDCG@1": overall.get("ndcg@1", 0),
450
- "NDCG@5": overall.get("ndcg@5", 0),
451
- "NDCG@10": overall.get("ndcg@10", 0),
452
- "Recall@1": overall.get("recall@1", 0),
453
- "Recall@5": overall.get("recall@5", 0),
454
- "Recall@10": overall.get("recall@10", 0),
455
- "Mean Rank": overall.get("mean_rank", 0),
456
- }
457
- comparison_data.append(row)
 
 
 
 
458
 
459
- df = pd.DataFrame(comparison_data)
460
- df = df.sort_values("NDCG@10", ascending=False) # Sort by NDCG@10
461
 
462
- # Save to CSV
463
- csv_path = output_path / "codesearchnet_comparison.csv"
464
- df.to_csv(csv_path, index=False, float_format="%.4f")
465
- logger.info(f"Comparison report saved to {csv_path}")
466
 
467
- # Print comparison table
468
- print(f"\n{'=' * 80}")
469
- print("CodeSearchNet Model Comparison")
470
- print(f"{'=' * 80}")
471
- print(df.to_string(index=False, float_format="%.4f"))
472
 
473
 
474
- def beam_evaluate_models(
 
 
 
 
 
475
  models: list[str],
476
  max_queries: int = 1000,
477
  languages: list[str] | None = None,
478
- output_dir: str = DEFAULT_OUTPUT_DIR,
479
- volume_name: str = VOLUME_NAME,
480
- mount_path: str = VOLUME_PATH,
481
  ) -> list[dict[str, Any]]:
482
- """Main evaluation function for Beam execution with checkpoint support."""
483
- logger.info("🚀 Starting Beam-powered CodeSearchNet evaluation")
484
- logger.info(f"📊 Evaluating {len(models)} models on {len(languages or EVALUATION_LANGUAGES)} languages")
 
 
 
 
 
 
485
 
486
- # Initialize Beam utilities
487
- volume_mgr, checkpoint_mgr, model_mgr, eval_mgr = create_beam_utilities(volume_name, mount_path)
 
488
 
489
- # Create evaluation results directory in volume
490
- results_dir = Path(mount_path) / EVALUATION_RESULTS_DIR
491
- results_dir.mkdir(parents=True, exist_ok=True)
 
 
 
492
 
493
- logger.info(f"📁 Using Beam volume: {volume_name} at {mount_path}")
494
- logger.info(f"💾 Evaluation results directory: {results_dir}")
 
495
 
496
- all_results = []
497
- skipped_models = []
 
 
 
 
 
 
 
 
498
 
 
 
 
499
  for model_path in models:
500
  model_name = Path(model_path).name
501
 
502
- # Check for existing evaluation results
503
- existing_result_file = results_dir / f"codesearchnet_eval_{model_name}.json"
504
- if existing_result_file.exists():
505
- logger.info(f"✅ Model {model_name} already evaluated - loading existing results")
506
- try:
507
- with existing_result_file.open("r") as f:
508
- existing_results = json.load(f)
509
- all_results.append(existing_results)
510
- skipped_models.append(model_name)
511
- continue
512
- except Exception as e:
513
- logger.warning(f"⚠️ Failed to load existing results for {model_name}: {e}")
514
- # Continue with evaluation if loading fails
515
-
516
  logger.info(f"\n{'=' * 60}")
517
  logger.info(f"🔍 Evaluating model: {model_name}")
518
- logger.info(f"📂 Path: {model_path}")
519
  logger.info(f"{'=' * 60}")
520
 
521
  try:
522
- # Distinguish between local paths and HuggingFace model names
523
- is_huggingface_model = (
524
- "/" in model_path and not model_path.startswith("/") and not Path(model_path).exists()
525
- )
526
 
527
- if is_huggingface_model:
528
- # This is a HuggingFace model name - pass directly to evaluator
529
- logger.info(f"📥 Loading HuggingFace model: {model_path}")
530
- evaluator = CodeSearchNetEvaluator(
531
- model_path,
532
- model_name,
533
- checkpoint_manager=checkpoint_mgr,
534
- eval_manager=eval_mgr,
535
- )
536
- else:
537
- # This is a local path - check if it exists in Beam volume
538
- actual_model_path = model_path # Default to original path
539
- if not Path(model_path).exists() and not model_path.startswith("/"):
540
- # Try to load from Beam volume
541
- local_model_path = Path(mount_path) / MODEL_CACHE_PREFIX / model_name
542
- logger.info(f"🔍 Trying to load {model_name} from Beam volume: {local_model_path}")
543
- if model_mgr.load_model(model_name, local_model_path.parent):
544
- actual_model_path = str(local_model_path)
545
- logger.info(f"✅ Loaded model from Beam volume: {actual_model_path}")
546
- else:
547
- logger.warning(f"⚠️ Model not found locally or in Beam volume: {model_name}")
548
- continue
549
-
550
- evaluator = CodeSearchNetEvaluator(
551
- actual_model_path,
552
- model_name,
553
- checkpoint_manager=checkpoint_mgr,
554
- eval_manager=eval_mgr,
555
- )
556
-
557
- results = evaluator.evaluate_all_languages(max_queries, languages)
558
-
559
- # Save results with Beam support
560
- save_results(results, output_dir, model_name, eval_mgr, results_dir)
561
-
562
- # Print summary
563
- print_results_summary(results)
564
-
565
- all_results.append(results)
566
 
567
  except Exception:
568
  logger.exception(f"❌ Failed to evaluate {model_name}")
569
  continue
570
 
571
- # Create comparison report in Beam volume
572
- if len(all_results) > 1:
573
- comparison_dir = Path(mount_path) / EVALUATION_RESULTS_DIR / "comparisons"
574
- comparison_dir.mkdir(parents=True, exist_ok=True)
575
- create_comparison_report(all_results, str(comparison_dir))
576
- logger.info(f"📊 Comparison report saved to Beam volume: {comparison_dir}")
577
-
578
- # Log summary of what was done
579
- newly_evaluated = len(all_results) - len(skipped_models)
580
- logger.info("\n✅ Beam evaluation complete!")
581
- logger.info(f"📊 Newly evaluated: {newly_evaluated} models")
582
- logger.info(f"⏭️ Skipped (already done): {len(skipped_models)} models")
583
- logger.info(f"📁 Total results: {len(all_results)} models")
584
- logger.info(f"💾 Results available in Beam volume: {volume_name}")
585
-
586
- if skipped_models:
587
- logger.info(f"⏭️ Skipped models: {', '.join(skipped_models)}")
588
-
589
- return all_results
590
 
591
 
592
  @function(
593
  gpu=GPU_NAME,
594
- volumes=[Volume(name=VOLUME_NAME, mount_path=VOLUME_PATH)],
595
  image=IMAGE,
596
  secrets=["HF_ACCESS_TOKEN"],
597
- env={
598
- "TOKENIZERS_PARALLELISM": "false",
599
- "CUDA_LAUNCH_BLOCKING": "0",
600
- },
601
- timeout=3600 * 6, # 6 hours for evaluation
602
  )
603
- def main(skip_third_party: bool = False) -> None:
604
- """Main evaluation function - runs all default models on Beam."""
605
- logger.info("🚀 Starting comprehensive CodeSearchNet evaluation on Beam")
606
-
607
- # Use default models or skip them based on flag
608
- if skip_third_party:
609
- logger.info("⏭️ Skipping 3rd party models - evaluating only simplified distillation models")
610
- models = []
611
- else:
612
- logger.info("📊 Including 3rd party peer models for comparison")
613
- models = DEFAULT_EVALUATION_MODELS.copy()
614
-
615
- # Discover simplified distillation models in the current directory
616
- logger.info("🔍 Discovering simplified distillation models...")
617
- discovered_models = discover_simplified_models(".")
618
-
619
- # Add discovered models (they're already sorted alphabetically)
620
- if discovered_models:
621
- logger.info(f"✅ Found {len(discovered_models)} simplified models:")
622
- for model_path in discovered_models:
623
- models.append(model_path)
624
- logger.info(f" 📁 {model_path}")
625
- else:
626
- logger.warning("⚠️ No simplified distillation models found")
627
- if skip_third_party:
628
- logger.error("❌ No models to evaluate! Either create simplified models or include 3rd party models.")
629
- return
630
 
631
- logger.info(f"📊 Evaluating {len(models)} models:")
632
- for i, model in enumerate(models, 1):
633
- logger.info(f" {i}. {model}")
634
 
635
- logger.info("\n💡 Checkpoint Info:")
636
- logger.info(" - Already evaluated models will be skipped")
637
- logger.info(" - Results are saved persistently to Beam volume")
638
 
639
- # Run comprehensive evaluation using Beam utilities
640
- results = beam_evaluate_models(
641
- models=models,
642
- max_queries=1000,
643
- languages=EVALUATION_LANGUAGES,
644
- output_dir=str(Path(VOLUME_PATH) / EVALUATION_RESULTS_DIR),
645
- volume_name=VOLUME_NAME,
646
- mount_path=VOLUME_PATH,
647
- )
648
 
649
- # Print final summary
650
- print("\n🎯 Evaluation Summary:")
651
- print(f"📊 Total models processed: {len(results)}")
652
- print(f"💾 Results saved to Beam volume: {VOLUME_NAME}")
653
- print(f"📁 Directory: {EVALUATION_RESULTS_DIR}")
654
- if skip_third_party:
655
- print("⏭️ 3rd party models were skipped")
656
- print("\n🔍 To view analysis:")
657
- print(" beam run src.distiller.analyze:beam_analysis")
658
- print("\n📈 To run evaluations again:")
659
- print(" distiller evaluate (will skip already completed models)")
660
- print(" distiller evaluate --skip-third-party (evaluate only simplified models)")
661
-
662
-
663
- def discover_simplified_models(base_path: str = ".") -> list[str]:
664
- """
665
- Discover all simplified distillation models in the correct directory.
666
-
667
- Looks for directories matching the pattern: ./code_model2vec/final/code_model2vec_*
668
- """
669
- discovered_models: list[str] = []
670
-
671
- # Look in the correct location where distill_simplified.py saves models
672
- models_dir = Path(base_path) / "code_model2vec" / "final"
673
-
674
- if not models_dir.exists():
675
- logger.warning(f"Models directory not found: {models_dir}")
676
- return discovered_models
677
-
678
- # Look for simplified model directories with the updated pattern
679
- pattern = "code_model2vec_*"
680
- for model_dir in models_dir.glob(pattern):
681
- if model_dir.is_dir() and (model_dir / "config.json").exists():
682
- discovered_models.append(str(model_dir))
683
- logger.info(f"🔍 Discovered simplified model: {model_dir}")
684
-
685
- # Sort alphabetically for consistent ordering
686
- discovered_models.sort()
687
 
688
- return discovered_models
 
689
 
 
 
690
 
691
- @function(
692
- gpu=GPU_NAME,
693
- volumes=[Volume(name=VOLUME_NAME, mount_path=VOLUME_PATH)],
694
- image=IMAGE,
695
- secrets=["HF_ACCESS_TOKEN"],
696
- env={
697
- "TOKENIZERS_PARALLELISM": "false",
698
- "CUDA_LAUNCH_BLOCKING": "0",
699
- },
700
- timeout=3600 * 6, # 6 hours for evaluation
701
- )
702
- def evaluate_simplified_only() -> None:
703
- """Evaluate only simplified distillation models, skipping 3rd party models."""
704
- main(skip_third_party=True)
705
 
706
 
707
- def run_local_evaluation(
708
- models: list[str] | None = None,
709
  max_queries: int = 1000,
710
  languages: list[str] | None = None,
711
- output_dir: str = DEFAULT_OUTPUT_DIR,
712
  ) -> list[dict[str, Any]]:
713
- """Main evaluation function for local execution without Beam utilities."""
714
- logger.info("🖥️ Running CodeSearchNet evaluation locally")
715
-
716
- if models is None:
717
- models = DEFAULT_EVALUATION_MODELS.copy()
718
-
719
- # Discover simplified distillation models in the current directory
720
- logger.info("🔍 Discovering simplified distillation models...")
721
- discovered_models = discover_simplified_models(".")
722
-
723
- # Add discovered models
724
- if discovered_models:
725
- logger.info(f"✅ Found {len(discovered_models)} simplified models:")
726
- for model_path in discovered_models:
727
- models.append(model_path)
728
- logger.info(f" 📁 {model_path}")
729
- else:
730
- logger.warning("⚠️ No simplified distillation models found")
731
-
732
- if languages is None:
733
- languages = EVALUATION_LANGUAGES
734
-
735
- logger.info(f"📊 Evaluating {len(models)} models on {len(languages)} languages")
736
- logger.info(f"📁 Using local output directory: {output_dir}")
737
-
738
- # Create local output directory
739
- output_path = Path(output_dir)
740
- output_path.mkdir(parents=True, exist_ok=True)
741
-
742
- all_results = []
743
- skipped_models = []
744
 
 
745
  for model_path in models:
746
  model_name = Path(model_path).name
747
 
748
- # Check for existing evaluation results locally
749
- safe_name = "".join(c for c in model_name if c.isalnum() or c in ("-", "_", "."))
750
- result_file = output_path / f"codesearchnet_eval_{safe_name}.json"
751
-
752
- if result_file.exists():
753
- logger.info(f"✅ Model {model_name} already evaluated - loading existing results")
754
- try:
755
- with result_file.open("r") as f:
756
- existing_results = json.load(f)
757
- all_results.append(existing_results)
758
- skipped_models.append(model_name)
759
- continue
760
- except Exception as e:
761
- logger.warning(f"⚠️ Failed to load existing results for {model_name}: {e}")
762
-
763
- logger.info(f"\n{'=' * 60}")
764
- logger.info(f"🔍 Evaluating model: {model_name}")
765
- logger.info(f"📂 Path: {model_path}")
766
- logger.info(f"{'=' * 60}")
767
 
768
  try:
769
- # Create evaluator without Beam utilities (no checkpointing)
770
- evaluator = CodeSearchNetEvaluator(
771
- model_path,
772
- model_name,
773
- checkpoint_manager=None, # No checkpointing for local evaluation
774
- eval_manager=None,
775
- )
776
-
777
- results = evaluator.evaluate_all_languages(max_queries, languages)
778
 
779
- # Save results locally only
780
- save_results(results, output_dir, model_name, eval_manager=None, volume_results_dir=None)
781
-
782
- # Print summary
783
- print_results_summary(results)
 
 
 
 
784
 
785
- all_results.append(results)
 
 
 
 
 
786
 
787
  except Exception:
788
- logger.exception(f"❌ Failed to evaluate {model_name}")
789
  continue
790
 
791
- # Create comparison report locally
792
- if len(all_results) > 1:
793
- create_comparison_report(all_results, output_dir)
794
- logger.info(f"📊 Comparison report saved locally: {output_dir}")
795
 
796
- # Log summary
797
- newly_evaluated = len(all_results) - len(skipped_models)
798
- logger.info("\n✅ Local evaluation complete!")
799
- logger.info(f"📊 Newly evaluated: {newly_evaluated} models")
800
- logger.info(f"⏭️ Skipped (already done): {len(skipped_models)} models")
801
- logger.info(f"📁 Total results: {len(all_results)} models")
802
- logger.info(f"💾 Results available locally: {output_dir}")
803
 
804
- if skipped_models:
805
- logger.info(f"⏭️ Skipped models: {', '.join(skipped_models)}")
 
806
 
807
- return all_results
808
 
 
 
 
 
 
 
 
 
809
 
810
- def run_local_evaluation_simplified(
811
- max_queries: int = 1000,
812
- languages: list[str] | None = None,
813
- output_dir: str = DEFAULT_OUTPUT_DIR,
814
- ) -> list[dict[str, Any]]:
815
- """Local evaluation function for simplified models only."""
816
- logger.info("🖥️ Running simplified model evaluation locally")
817
 
818
- # Discover simplified distillation models only
819
- logger.info("🔍 Discovering simplified distillation models...")
820
- discovered_models = discover_simplified_models(".")
 
 
 
821
 
822
- if not discovered_models:
823
- logger.error("❌ No simplified distillation models found! Run 'distiller distill-simple' first.")
824
- return []
825
 
826
- logger.info(f"✅ Found {len(discovered_models)} simplified models:")
827
- for model_path in discovered_models:
828
- logger.info(f" 📁 {model_path}")
 
 
 
 
 
 
 
 
 
 
 
829
 
830
- return run_local_evaluation(
831
- models=discovered_models,
 
 
 
 
 
832
  max_queries=max_queries,
833
- languages=languages,
834
- output_dir=output_dir,
 
835
  )
836
 
 
 
 
 
 
837
 
838
  if __name__ == "__main__":
839
- main()
 
1
  """
2
+ Comprehensive Model Evaluation Script for Code-Specialized Embedding Models.
3
 
4
+ This script evaluates embedding models on both task performance and operational metrics:
5
+
6
+ Task Performance:
7
+ - CodeSearchNet evaluation (NDCG, MRR, Recall metrics)
8
+ - Code search accuracy across programming languages
9
+
10
+ Operational Performance:
11
+ - Inference speed (latency and throughput)
12
+ - Memory efficiency (RAM and GPU usage)
13
+ - Model size and storage requirements
14
+ - CPU vs GPU performance scaling
15
 
16
  Usage:
17
+ distiller evaluate [--use-beam] [--skip-benchmark] # Run evaluation locally or on Beam
18
  """
19
 
20
  import json
21
  import logging
22
  import time
23
+ import traceback
24
  from pathlib import Path
25
  from typing import Any
26
 
27
  import numpy as np
28
  import pandas as pd
29
+ import psutil
30
+ import torch
31
+ import typer
32
+ from beam import Volume, function
33
  from datasets import Dataset, load_dataset
34
  from sentence_transformers import SentenceTransformer
35
  from sklearn.metrics.pairwise import cosine_similarity
36
  from tqdm import tqdm
37
 
38
+ from .beam_utils import download_specific_evaluation_file
39
+ from .config import (
40
+ BEAM_ENV_SETTINGS,
41
+ DEFAULT_EVALUATION_MODELS,
42
+ GPU_NAME,
43
+ IMAGE,
44
+ codesearchnet_config,
45
+ directories,
46
+ get_safe_model_name,
47
+ get_volume_config,
48
+ languages_config,
49
  )
50
 
 
 
51
  logger = logging.getLogger(__name__)
52
 
53
  # =============================================================================
54
+ # EVALUATION CONFIGURATION
55
  # =============================================================================
56
 
57
+ BATCH_SIZE = 32
58
+ LOCAL_EVALUATION_DIR = directories.evaluation_results
59
+ LOCAL_BENCHMARK_DIR = directories.benchmark_results
60
+ LOCAL_MODELS_DIR = directories.final
61
+ VOLUME_CONFIG = get_volume_config()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  # =============================================================================
64
+ # CORE EVALUATION CLASSES
65
  # =============================================================================
66
 
67
+ # Sample texts for benchmarking (various lengths)
68
+ BENCHMARK_TEXTS = {
69
+ "short": [
70
+ "def add(a, b): return a + b",
71
+ "function multiply(x, y) { return x * y; }",
72
+ "class Calculator { public int subtract(int a, int b) { return a - b; } }",
73
+ ]
74
+ * 100, # 300 short texts
75
+ "medium": [
76
+ "def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)",
77
+ "function quickSort(arr) {\n if (arr.length <= 1) return arr;\n const pivot = arr[arr.length - 1];\n const left = [], right = [];\n for (let i = 0; i < arr.length - 1; i++) {\n if (arr[i] < pivot) left.push(arr[i]);\n else right.push(arr[i]);\n }\n return [...quickSort(left), pivot, ...quickSort(right)];\n}",
78
+ ]
79
+ * 50, # 100 medium texts
80
+ "long": [
81
+ """
82
+ def complex_algorithm(data, config):
83
+ '''
84
+ Complex data processing algorithm with multiple steps.
85
+ '''
86
+ results = []
87
+ # Data validation and processing steps...
88
+ return results
89
+ """.strip(),
90
+ ]
91
+ * 20, # 20 long texts
92
+ }
93
 
 
 
 
94
 
95
+ class PerformanceBenchmark:
96
+ """Comprehensive performance benchmarking for embedding models."""
 
 
97
 
98
+ def __init__(self, model_path: str, model_name: str | None = None) -> None:
99
+ """Initialize benchmarker with model."""
100
+ self.model_path = model_path
101
+ self.model_name = model_name or Path(model_path).name
102
+ self.model: SentenceTransformer | None = None
103
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
104
+ self.results: dict[str, Any] = {}
105
+
106
+ def load_model(self) -> None:
107
+ """Load the embedding model."""
108
+ logger.info(f"Loading model from {self.model_path}")
109
+ start_time = time.time()
110
+
111
+ try:
112
+ self.model = SentenceTransformer(self.model_path, device=self.device, trust_remote_code=True)
113
+ load_time = time.time() - start_time
114
+
115
+ logger.info(f"✅ Model loaded in {load_time:.2f}s on {self.device}")
116
+ self.results["model_load_time"] = load_time
117
+
118
+ except Exception:
119
+ logger.exception("❌ Failed to load model")
120
+ self.results["error"] = traceback.format_exc()
121
+ raise
122
+
123
+ def measure_model_size(self) -> dict[str, float]:
124
+ """Measure model size metrics."""
125
+ logger.info("📏 Measuring model size...")
126
+
127
+ size_metrics: dict[str, Any] = {}
128
+
129
+ # Disk size
130
+ try:
131
+ if Path(self.model_path).is_dir():
132
+ # Local directory - calculate size of model files only
133
+ model_extensions = {".safetensors", ".bin", ".json", ".txt", ".tokenizer"}
134
+ total_size = 0
135
+ model_dir = Path(self.model_path)
136
+
137
+ for file_path in model_dir.rglob("*"):
138
+ if file_path.is_file() and (
139
+ file_path.suffix.lower() in model_extensions or "tokenizer" in file_path.name.lower()
140
+ ):
141
+ total_size += file_path.stat().st_size
142
+
143
+ size_metrics["disk_size_mb"] = total_size / (1024 * 1024)
144
+ # HuggingFace model - estimate based on model parameters
145
+ elif self.model is not None:
146
+ param_count = sum(p.numel() for p in self.model.parameters())
147
+ # Rough estimate: 4 bytes per parameter (float32)
148
+ estimated_size = param_count * 4
149
+ size_metrics["disk_size_mb"] = estimated_size / (1024 * 1024)
150
+ else:
151
+ size_metrics["disk_size_mb"] = 0.0
152
+
153
+ except Exception as e:
154
+ logger.warning(f"⚠️ Could not calculate disk size: {e}")
155
+ size_metrics["disk_size_mb"] = 0.0
156
+
157
+ # Memory size (if model is loaded)
158
+ if self.model is not None:
159
+ try:
160
+ # Parameter count
161
+ param_count = sum(p.numel() for p in self.model.parameters())
162
+ size_metrics["parameter_count"] = param_count
163
+ size_metrics["parameters_millions"] = param_count / 1e6
164
+
165
+ # Memory usage estimate
166
+ param_size = sum(p.numel() * p.element_size() for p in self.model.parameters())
167
+ buffer_size = sum(b.numel() * b.element_size() for b in self.model.buffers())
168
+ size_metrics["memory_size_mb"] = (param_size + buffer_size) / (1024 * 1024)
169
+ size_metrics["ram_usage_mb"] = size_metrics["memory_size_mb"]
170
+
171
+ # GPU memory if using CUDA
172
+ if self.device == "cuda" and torch.cuda.is_available():
173
+ size_metrics["gpu_memory_mb"] = torch.cuda.memory_allocated() / (1024 * 1024)
174
+ size_metrics["gpu_name"] = torch.cuda.get_device_name(0)
175
+
176
+ # Embedding dimension if available
177
+ if hasattr(self.model, "get_sentence_embedding_dimension"):
178
+ size_metrics["embedding_dim"] = self.model.get_sentence_embedding_dimension()
179
+
180
+ except Exception as e:
181
+ logger.warning(f"⚠️ Could not calculate memory size: {e}")
182
+
183
+ # Update results
184
+ self.results["size_metrics"] = size_metrics
185
+ return size_metrics
186
+
187
+ def benchmark_inference_speed(self, batch_sizes: list[int] | None = None) -> dict[str, Any]:
188
+ """Benchmark inference speed with different batch sizes."""
189
+ if batch_sizes is None:
190
+ batch_sizes = [1, 8, 16, 32]
191
+
192
+ logger.info(f"⚡ Benchmarking inference speed with batch sizes: {batch_sizes}")
193
+
194
+ if self.model is None:
195
+ self.load_model()
196
+
197
+ speed_results: dict[str, Any] = {"medium": {}}
198
+
199
+ # Use medium-length texts for speed testing
200
+ test_texts = BENCHMARK_TEXTS["medium"]
201
+
202
+ for batch_size in batch_sizes:
203
+ logger.info(f" 📊 Testing batch size: {batch_size}")
204
+
205
+ # Prepare batch
206
+ batch = (
207
+ test_texts[:batch_size]
208
+ if batch_size <= len(test_texts)
209
+ else test_texts * ((batch_size // len(test_texts)) + 1)
210
+ )
211
+ batch = batch[:batch_size]
212
+
213
+ # Warmup
214
+ if self.model is not None:
215
+ _ = self.model.encode(batch[: min(4, len(batch))], convert_to_tensor=False)
216
+
217
+ # Benchmark multiple runs
218
+ latencies = []
219
+ num_runs = max(3, 20 // batch_size) # More runs for smaller batches
220
+
221
+ for _ in range(num_runs):
222
+ start_time = time.time()
223
+ if self.model is not None:
224
+ _ = self.model.encode(batch, convert_to_tensor=False, normalize_embeddings=True)
225
+ end_time = time.time()
226
+ latencies.append(end_time - start_time)
227
+
228
+ # Calculate metrics
229
+ avg_latency = sum(latencies) / len(latencies)
230
+ throughput = batch_size / avg_latency
231
+ time_per_text_ms = (avg_latency / batch_size) * 1000
232
+
233
+ batch_key = f"batch_{batch_size}"
234
+ speed_results["medium"][batch_key] = {
235
+ "time_per_text_ms": time_per_text_ms,
236
+ "texts_per_second": throughput,
237
+ "tokens_per_second": throughput * 50, # Estimate 50 tokens per text
238
+ }
239
+
240
+ logger.info(f" ⚡ Latency: {avg_latency:.3f}s, Throughput: {throughput:.1f} texts/sec")
241
+
242
+ # Update results
243
+ self.results["speed_benchmarks"] = speed_results
244
+ return speed_results
245
+
246
+ def benchmark_memory_scaling(self, batch_sizes: list[int] | None = None) -> dict[str, Any]:
247
+ """Benchmark memory usage scaling with batch size."""
248
+ if batch_sizes is None:
249
+ batch_sizes = [1, 8, 16, 32]
250
+
251
+ logger.info(f"🧠 Benchmarking memory scaling with batch sizes: {batch_sizes}")
252
+
253
+ if self.model is None:
254
+ self.load_model()
255
+
256
+ memory_results: dict[str, Any] = {}
257
+ test_texts = BENCHMARK_TEXTS["medium"]
258
+
259
+ for batch_size in batch_sizes:
260
+ logger.info(f" 📊 Testing memory with batch size: {batch_size}")
261
+
262
+ # Prepare batch
263
+ batch = (
264
+ test_texts[:batch_size]
265
+ if batch_size <= len(test_texts)
266
+ else test_texts * ((batch_size // len(test_texts)) + 1)
267
+ )
268
+ batch = batch[:batch_size]
269
+
270
+ # Clear GPU cache if using CUDA
271
+ if torch.cuda.is_available():
272
+ torch.cuda.empty_cache()
273
+ torch.cuda.reset_peak_memory_stats()
274
+
275
+ try:
276
+ # Run inference
277
+ if self.model is not None:
278
+ _ = self.model.encode(batch, convert_to_tensor=False)
279
+
280
+ # Measure peak memory
281
+ if torch.cuda.is_available():
282
+ peak_memory = torch.cuda.max_memory_allocated() / (1024 * 1024)
283
+ memory_per_text = peak_memory / batch_size
284
+ else:
285
+ # Use psutil for CPU memory (less accurate)
286
+ peak_memory = psutil.virtual_memory().used / (1024 * 1024)
287
+ memory_per_text = 0 # Can't accurately measure per-text on CPU
288
+
289
+ batch_key = f"batch_{batch_size}"
290
+ memory_results[batch_key] = {
291
+ "memory_used_mb": peak_memory,
292
+ "memory_per_text_mb": memory_per_text,
293
+ "oom": False,
294
+ }
295
+
296
+ logger.info(f" 🧠 Peak memory: {peak_memory:.1f}MB, Per text: {memory_per_text:.2f}MB")
297
+
298
+ except Exception as e:
299
+ logger.warning(f"⚠️ Memory benchmark failed for batch {batch_size}: {e}")
300
+ batch_key = f"batch_{batch_size}"
301
+ memory_results[batch_key] = {
302
+ "oom": True,
303
+ "error": str(e),
304
+ }
305
+
306
+ self.results["memory_benchmarks"] = memory_results
307
+ return memory_results
308
+
309
+ def benchmark_cpu_vs_gpu(self) -> dict[str, Any]:
310
+ """Compare CPU vs GPU performance."""
311
+ logger.info("⚖️ Benchmarking CPU vs GPU performance")
312
+
313
+ if not torch.cuda.is_available():
314
+ logger.warning("⚠️ CUDA not available - skipping GPU benchmark")
315
+ return {}
316
+
317
+ comparison_results: dict[str, Any] = {}
318
+ test_texts = BENCHMARK_TEXTS["medium"][:16] # Use 16 texts for comparison
319
+
320
+ for device in ["cpu", "cuda"]:
321
+ logger.info(f" 📊 Testing on {device.upper()}")
322
+
323
+ try:
324
+ model = SentenceTransformer(self.model_path, device=device, trust_remote_code=True)
325
+
326
+ # Warmup
327
+ _ = model.encode(test_texts[:4], convert_to_tensor=False)
328
+
329
+ # Benchmark
330
+ start_time = time.time()
331
+ _ = model.encode(test_texts, convert_to_tensor=False, normalize_embeddings=True)
332
+ end_time = time.time()
333
+
334
+ latency = end_time - start_time
335
+ throughput = len(test_texts) / latency
336
+
337
+ comparison_results[device] = {
338
+ "texts_per_second": throughput,
339
+ }
340
+
341
+ logger.info(f" ⚡ {device.upper()}: {latency:.3f}s, {throughput:.1f} texts/sec")
342
+
343
+ # Clean up
344
+ del model
345
+ if device == "cuda":
346
+ torch.cuda.empty_cache()
347
+
348
+ except Exception as e:
349
+ logger.warning(f"⚠️ Failed to benchmark {device}: {e}")
350
+ comparison_results[device] = {"error": str(e)}
351
+
352
+ # Calculate speedup
353
+ if "cpu" in comparison_results and "cuda" in comparison_results:
354
+ cpu_throughput = comparison_results["cpu"].get("texts_per_second", 0)
355
+ gpu_throughput = comparison_results["cuda"].get("texts_per_second", 0)
356
+ if cpu_throughput > 0:
357
+ speedup = gpu_throughput / cpu_throughput
358
+ comparison_results["gpu_speedup"] = speedup
359
+ logger.info(f" 🚀 GPU Speedup: {speedup:.1f}x")
360
+
361
+ self.results["cpu_vs_gpu"] = comparison_results
362
+ return comparison_results
363
+
364
+ def run_comprehensive_benchmark(self) -> dict[str, Any]:
365
+ """Run all benchmarks and return comprehensive results."""
366
+ logger.info(f"🏁 Starting comprehensive benchmark for {self.model_name}")
367
+
368
+ # Model information
369
+ self.results["model_name"] = self.model_name
370
+ self.results["model_path"] = self.model_path
371
+ self.results["timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S")
372
+
373
+ # Run all benchmarks
374
+ try:
375
+ self.load_model()
376
+ self.measure_model_size()
377
+ self.benchmark_inference_speed([1, 8, 16, 32])
378
+ self.benchmark_memory_scaling([1, 8, 16, 32])
379
+ self.benchmark_cpu_vs_gpu()
380
+
381
+ logger.info(f"✅ Comprehensive benchmark completed for {self.model_name}")
382
+
383
+ except Exception:
384
+ logger.exception(f"❌ Benchmark failed for {self.model_name}")
385
+ self.results["error"] = traceback.format_exc()
386
+
387
+ return self.results
388
 
389
 
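For reference, a minimal usage sketch of the benchmarker above (illustrative only; the model path is a placeholder and the nested result keys follow the defaults used in `run_comprehensive_benchmark`):

```python
from distiller.evaluate import PerformanceBenchmark

# Placeholder path: any SentenceTransformer-compatible model directory works here.
bench = PerformanceBenchmark("./code_model2vec/final/my_distilled_model")
results = bench.run_comprehensive_benchmark()

print(results.get("size_metrics", {}).get("parameters_millions"))
# Speed results are keyed by text length ("medium") and batch size ("batch_32").
speed = results.get("speed_benchmarks", {}).get("medium", {}).get("batch_32", {})
print(speed.get("texts_per_second"))
```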
390
  class CodeSearchNetEvaluator:
391
  """Evaluator for CodeSearchNet-style code search tasks."""
392
 
393
+ def __init__(self, model_path: str, model_name: str | None = None) -> None:
394
+ """Initialize the evaluator with a model."""
 
 
 
 
 
 
395
  self.model_path = model_path
396
  self.model_name = model_name or Path(model_path).name
397
  self.model: SentenceTransformer | None = None
 
 
398
  self._load_model()
399
 
400
  def _load_model(self) -> None:
401
+ """Load the embedding model."""
402
  logger.info(f"Loading model from {self.model_path}")
 
 
 
 
 
 
 
 
403
  try:
404
  self.model = SentenceTransformer(self.model_path, trust_remote_code=True)
405
  logger.info(f"Successfully loaded model: {self.model_name}")
 
414
  raise RuntimeError(msg)
415
 
416
  embeddings = []
 
417
  for i in tqdm(range(0, len(texts), BATCH_SIZE), desc=desc):
418
  batch = texts[i : i + BATCH_SIZE]
419
  batch_embeddings = self.model.encode(batch, convert_to_tensor=False, normalize_embeddings=True)
 
422
  return np.vstack(embeddings)
423
 
424
  def evaluate_language(self, language: str, max_queries: int = 1000) -> dict[str, Any]:
425
+ """Evaluate on a specific programming language."""
426
  logger.info(f"Evaluating on {language} language (max {max_queries} queries)")
427
 
 
 
 
 
 
 
 
428
  try:
429
  # Load test split for the language
430
  dataset = load_dataset(
431
+ codesearchnet_config.dataset_name,
432
  language,
433
  split="test",
434
  trust_remote_code=True,
435
  )
436
 
 
437
  if not isinstance(dataset, Dataset):
438
  logger.error(f"Unexpected dataset type for {language}: {type(dataset)}")
439
  return {}
440
 
441
+ # Sample queries for evaluation
442
  if len(dataset) > max_queries:
443
+ rng = np.random.default_rng(42)
444
  indices = rng.choice(len(dataset), max_queries, replace=False)
445
  dataset = dataset.select(indices)
446
 
 
464
  logger.info(f"Found {len(queries)} valid query-code pairs for {language}")
465
 
466
  # Encode queries and codes
467
+ start_time = time.time()
468
  query_embeddings = self.encode_texts(queries, f"Encoding {language} queries")
469
+ code_embeddings = self.encode_texts(codes, f"Encoding {language} code")
470
+ encoding_time = time.time() - start_time
471
 
472
+ # Compute similarities and metrics
473
  similarities = cosine_similarity(query_embeddings, code_embeddings)
 
 
474
  metrics = self._compute_retrieval_metrics(similarities)
475
 
476
+ # Prepare results
477
+ results = {
478
  "language": language,
479
+ "model_name": self.model_name,
480
  "num_queries": len(queries),
481
+ "encoding_time_seconds": encoding_time,
482
  "metrics": metrics,
 
483
  }
484
 
485
+ logger.info(f"✅ {language} evaluation completed in {encoding_time:.2f}s")
486
+ return results
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
  except Exception:
489
+ logger.exception(f"❌ Failed to evaluate {language}")
490
  return {}
491
 
492
  def _compute_retrieval_metrics(self, similarities: np.ndarray) -> dict[str, float]:
493
+ """Compute retrieval metrics from similarity matrix."""
494
+ n_queries = similarities.shape[0]
495
 
496
+ # For each query, the correct code is at the same index
497
+ correct_indices = np.arange(n_queries)
498
+
499
+ # Rank all codes for each query
500
+ ranked_indices = np.argsort(similarities, axis=1)[:, ::-1]
501
+
502
+ metrics = {}
503
+
504
+ # Compute metrics for different k values
505
+ for k in [1, 5, 10]:
506
+ if k <= similarities.shape[1]:
507
+ # Recall@k
508
+ recall_k = np.mean([correct_indices[i] in ranked_indices[i, :k] for i in range(n_queries)])
509
+ metrics[f"recall@{k}"] = recall_k
510
+
511
+ # NDCG@k
512
+ ndcg_k = np.mean(
513
+ [self._compute_ndcg(ranked_indices[i], correct_indices[i], k) for i in range(n_queries)]
514
+ )
515
+ metrics[f"ndcg@{k}"] = ndcg_k
516
+
517
+ # Mean Reciprocal Rank
518
  reciprocal_ranks = []
519
+ for i in range(n_queries):
520
+ rank = np.where(ranked_indices[i] == correct_indices[i])[0]
521
+ if len(rank) > 0:
522
+ reciprocal_ranks.append(1.0 / (rank[0] + 1))
523
+ else:
524
+ reciprocal_ranks.append(0.0)
525
+
526
+ metrics["mrr"] = np.mean(reciprocal_ranks)
527
+
528
+ # Add mean rank and median rank
529
+ mean_ranks = []
530
+ for i in range(n_queries):
531
+ rank = np.where(ranked_indices[i] == correct_indices[i])[0]
532
+ if len(rank) > 0:
533
+ mean_ranks.append(rank[0] + 1) # 1-indexed
534
+ else:
535
+ mean_ranks.append(similarities.shape[1]) # Worst possible rank
536
+
537
+ metrics["mean_rank"] = np.mean(mean_ranks)
538
+ metrics["median_rank"] = np.median(mean_ranks)
539
+
540
+ # Ensure all values are float
541
+ return {k: float(v) for k, v in metrics.items()}
 
 
 
 
 
 
 
 
 
542
 
543
  def _compute_ndcg(self, ranked_indices: np.ndarray, correct_idx: int, k: int) -> float:
544
  """Compute NDCG@k for a single query."""
545
+ if correct_idx in ranked_indices[:k]:
546
+ rank = np.where(ranked_indices[:k] == correct_idx)[0][0]
547
+ return 1.0 / np.log2(rank + 2)
 
 
 
 
 
548
  return 0.0
549
 
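A small worked example of the two metric helpers above (illustrative, not part of the commit): with exactly one relevant snippet per query the ideal DCG is 1, so NDCG@k collapses to 1/log2(rank + 2) with a 0-based rank, while MRR uses 1/(rank + 1):

```python
import numpy as np

sims = np.array([
    [0.9, 0.4],  # query 0: its own code (index 0) ranks 1st
    [0.7, 0.6],  # query 1: its own code (index 1) ranks 2nd
])
ranked = np.argsort(sims, axis=1)[:, ::-1]                       # [[0, 1], [0, 1]]
ranks = [int(np.where(ranked[i] == i)[0][0]) for i in range(2)]  # [0, 1] (0-based)

mrr = np.mean([1.0 / (r + 1) for r in ranks])               # (1.0 + 0.5) / 2 = 0.75
ndcg_at_5 = np.mean([1.0 / np.log2(r + 2) for r in ranks])  # (1.0 + 0.631) / 2 ≈ 0.815
recall_at_1 = np.mean([r < 1 for r in ranks])               # 0.5
```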
550
  def evaluate_all_languages(
551
  self, max_queries_per_lang: int = 1000, languages: list[str] | None = None
552
  ) -> dict[str, Any]:
553
+ """Evaluate on all specified languages."""
554
+ eval_languages = languages or languages_config.all
 
 
 
555
 
556
+ logger.info(f"🚀 Starting evaluation on {len(eval_languages)} languages")
557
+ logger.info(f"📊 Model: {self.model_name}")
558
+ logger.info(f"🔢 Max queries per language: {max_queries_per_lang}")
 
 
 
559
 
560
  start_time = time.time()
561
+ results = {
 
562
  "model_name": self.model_name,
563
  "model_path": self.model_path,
 
564
  "languages": {},
565
  "overall": {},
566
+ "evaluation_time_seconds": 0,
567
  }
568
+ languages_dict: dict[str, Any] = {}
569
 
570
+ # Evaluate each language
571
+ for language in eval_languages:
572
+ logger.info(f"\n{'=' * 50}")
573
+ logger.info(f"🔍 Evaluating {language}")
574
+ logger.info(f"{'=' * 50}")
575
 
 
 
576
  lang_results = self.evaluate_language(language, max_queries_per_lang)
 
577
  if lang_results:
578
+ languages_dict[language] = lang_results
 
 
 
579
 
580
+ results["languages"] = languages_dict
581
+
582
+ # Compute overall metrics
583
+ if languages_dict:
584
  overall_metrics = {}
585
+ metric_names = list(next(iter(languages_dict.values()))["metrics"].keys())
586
+
587
+ for metric in metric_names:
588
+ values = [languages_dict[lang]["metrics"][metric] for lang in languages_dict]
589
+ overall_metrics[metric] = np.mean(values)
590
 
591
  results["overall"] = overall_metrics
592
 
593
  total_time = time.time() - start_time
594
  results["evaluation_time_seconds"] = total_time
595
 
 
 
 
 
 
596
  logger.info(f"Evaluation completed in {total_time:.2f} seconds")
597
  return results
598
 
599
 
600
+ class ComprehensiveModelEvaluator:
601
+ """Combined evaluator for both task performance and operational benchmarks."""
 
 
 
 
 
 
 
 
 
 
 
 
 
602
 
603
+ def __init__(self, model_path: str, model_name: str | None = None) -> None:
604
+ """Initialize the comprehensive evaluator with a model."""
605
+ self.model_path = model_path
606
+ self.model_name = model_name or Path(model_path).name
607
+
608
+ # Initialize sub-evaluators
609
+ self.codesearch_evaluator = CodeSearchNetEvaluator(model_path, model_name)
610
+ self.performance_benchmarker = PerformanceBenchmark(model_path, model_name)
611
+
612
+ self.results: dict[str, Any] = {}
613
+
614
+ def run_comprehensive_evaluation(
615
+ self,
616
+ max_queries_per_lang: int = 1000,
617
+ languages: list[str] | None = None,
618
+ skip_benchmark: bool = False,
619
+ ) -> dict[str, Any]:
620
+ """Run both CodeSearchNet evaluation and performance benchmarking."""
621
+ logger.info(f"🚀 Starting comprehensive evaluation for {self.model_name}")
622
+ start_time = time.time()
623
+
624
+ # Initialize results structure
625
+ self.results = {
626
+ "model_name": self.model_name,
627
+ "model_path": self.model_path,
628
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
629
+ "evaluation_time_seconds": 0,
630
+ }
631
 
 
 
 
 
 
 
 
 
 
 
 
632
  try:
633
+ # 1. Run CodeSearchNet evaluation
634
+ logger.info("🔍 Running CodeSearchNet task evaluation...")
635
+ codesearch_results = self.codesearch_evaluator.evaluate_all_languages(max_queries_per_lang, languages)
636
+
637
+ # Extract CodeSearchNet metrics
638
+ self.results.update(
639
+ {
640
+ "codesearch_languages": codesearch_results.get("languages", {}),
641
+ "codesearch_overall": codesearch_results.get("overall", {}),
642
+ }
643
+ )
644
+
645
+ # 2. Run performance benchmarking (unless skipped)
646
+ if not skip_benchmark:
647
+ logger.info("⚡ Running operational performance benchmarking...")
648
+ benchmark_results = self.performance_benchmarker.run_comprehensive_benchmark()
649
+
650
+ # Extract benchmark metrics
651
+ self.results.update(
652
+ {
653
+ "size_metrics": benchmark_results.get("size_metrics", {}),
654
+ "speed_benchmarks": benchmark_results.get("speed_benchmarks", {}),
655
+ "memory_benchmarks": benchmark_results.get("memory_benchmarks", {}),
656
+ "cpu_vs_gpu": benchmark_results.get("cpu_vs_gpu", {}),
657
+ }
658
+ )
659
+ else:
660
+ logger.info("⏭️ Skipping performance benchmarking")
661
+ self.results["benchmark_skipped"] = True
662
+
663
  except Exception as e:
664
+ logger.exception(f"❌ Comprehensive evaluation failed for {self.model_name}")
665
+ self.results["error"] = str(e)
666
 
667
+ # Calculate total time
668
+ total_time = time.time() - start_time
669
+ self.results["evaluation_time_seconds"] = total_time
 
 
 
 
670
 
671
+ logger.info(f"✅ Comprehensive evaluation completed in {total_time:.2f} seconds")
672
+ return self.results
 
673
 
674
+ def print_summary(self) -> None:
675
+ """Print a comprehensive summary of all results."""
676
+ logger.info(f"\n{'=' * 60}")
677
+ logger.info(f"📊 COMPREHENSIVE EVALUATION RESULTS: {self.model_name}")
678
+ logger.info(f"{'=' * 60}")
679
 
680
+ # CodeSearchNet results
681
+ overall = self.results.get("codesearch_overall", {})
682
+ if overall:
683
+ logger.info("🔍 CodeSearchNet Performance:")
684
+ for metric, value in overall.items():
685
+ logger.info(f" 🎯 {metric.upper()}: {value:.4f}")
686
+
687
+ # Benchmark results
688
+ if not self.results.get("benchmark_skipped", False):
689
+ size_metrics = self.results.get("size_metrics", {})
690
+ if size_metrics:
691
+ logger.info(f"\n📏 Model Size: {size_metrics.get('disk_size_mb', 0):.1f}MB")
692
+ if "parameters_millions" in size_metrics:
693
+ logger.info(f"🔢 Parameters: {size_metrics['parameters_millions']:.1f}M")
694
+
695
+ speed_benchmarks = self.results.get("speed_benchmarks", {})
696
+ if "medium" in speed_benchmarks and "batch_32" in speed_benchmarks["medium"]:
697
+ batch_32 = speed_benchmarks["medium"]["batch_32"]
698
+ logger.info(f"⚡ Throughput (batch 32): {batch_32.get('texts_per_second', 0):.1f} texts/sec")
699
+
700
+ cpu_vs_gpu = self.results.get("cpu_vs_gpu", {})
701
+ if "gpu_speedup" in cpu_vs_gpu:
702
+ speedup = cpu_vs_gpu["gpu_speedup"]
703
+ logger.info(f"🚀 GPU speedup: {speedup:.1f}x")
704
+
705
+ # Language breakdown
706
+ languages = self.results.get("codesearch_languages", {})
707
+ if languages:
708
+ logger.info("\n📋 Language Breakdown:")
709
+ for lang, lang_results in languages.items():
710
+ metrics = lang_results.get("metrics", {})
711
+ ndcg10 = metrics.get("ndcg@10", 0)
712
+ mrr = metrics.get("mrr", 0)
713
+ logger.info(f" {lang}: NDCG@10={ndcg10:.4f}, MRR={mrr:.4f}")
714
 
 
715
 
716
+ # =============================================================================
717
+ # UTILITY FUNCTIONS
718
+ # =============================================================================
719
 
 
 
 
 
720
 
721
+ def check_existing_results(model_name: str, local_dir: str = LOCAL_EVALUATION_DIR) -> dict[str, Any] | None:
722
+ """Check if comprehensive evaluation results already exist for a model."""
723
+ local_path = Path(local_dir)
724
+ safe_model_name = get_safe_model_name(model_name)
725
+
726
+ # Check for new comprehensive format first
727
+ comprehensive_file = local_path / f"comprehensive_eval_{safe_model_name}.json"
728
+ if comprehensive_file.exists():
729
+ try:
730
+ with comprehensive_file.open("r") as f:
731
+ results = json.load(f)
732
+ logger.info(f"✅ Found existing comprehensive results for {model_name}")
733
+ return results
734
+ except Exception as e:
735
+ logger.warning(f"⚠️ Could not load existing comprehensive results for {model_name}: {e}")
736
+
737
+ # Fallback to legacy codesearchnet format for backward compatibility
738
+ legacy_file = local_path / f"codesearchnet_eval_{safe_model_name}.json"
739
+ if legacy_file.exists():
740
+ try:
741
+ with legacy_file.open("r") as f:
742
+ results = json.load(f)
743
+ logger.info(f"✅ Found existing legacy results for {model_name}")
744
+ return results
745
+ except Exception as e:
746
+ logger.warning(f"⚠️ Could not load existing legacy results for {model_name}: {e}")
747
+
748
+ return None
749
+
750
+
751
+ def save_evaluation_results(results: dict[str, Any], local_dir: str = LOCAL_EVALUATION_DIR) -> bool:
752
+ """Save comprehensive evaluation results to local directory as a single JSON file."""
753
+ try:
754
+ local_path = Path(local_dir)
755
+ local_path.mkdir(parents=True, exist_ok=True)
756
+
757
+ model_name = results.get("model_name", "unknown")
758
+ safe_model_name = get_safe_model_name(model_name)
759
 
760
+ # Save single comprehensive results file (CodeSearchNet + Benchmark combined)
761
+ result_file = local_path / f"comprehensive_eval_{safe_model_name}.json"
762
+ with result_file.open("w") as f:
763
+ json.dump(results, f, indent=2, default=str)
764
+
765
+ logger.info(f"💾 Saved comprehensive evaluation results for {model_name}")
766
+ return True
767
+
768
+ except Exception:
769
+ logger.exception("❌ Error saving evaluation results")
770
+ return False
771
+
772
+
773
+ def discover_local_models(models_dir: str = LOCAL_MODELS_DIR) -> list[str]:
774
+ """Discover models in the local models directory."""
775
+ models_path = Path(models_dir)
776
+ discovered_models = []
777
+
778
+ if models_path.exists():
779
+ for model_dir in models_path.iterdir():
780
+ if model_dir.is_dir() and (
781
+ any(model_dir.glob("*.json")) or any(model_dir.glob("*.bin")) or any(model_dir.glob("*.safetensors"))
782
+ ):
783
+ discovered_models.append(str(model_dir))
784
+ logger.info(f"📁 Found local model: {model_dir.name}")
785
+
786
+ return discovered_models
787
+
788
+
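The discovery helper above only requires that each model directory contain at least one JSON, .bin, or .safetensors file; a layout like the following (directory and model names are placeholders) would be picked up, assuming `directories.final` resolves to `code_model2vec/final` as the CLI below expects:

```python
# code_model2vec/final/            <- LOCAL_MODELS_DIR (directories.final)
#     my_distilled_model/          <- returned by discover_local_models()
#         config.json
#         model.safetensors
#         tokenizer.json
```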
789
+ def print_results_summary(results: dict[str, Any]) -> None:
790
+ """Print a formatted summary of comprehensive evaluation results."""
791
+ logger.info(f"\n{'=' * 60}")
792
+ logger.info(f"📊 COMPREHENSIVE EVALUATION: {results.get('model_name', 'Unknown')}")
793
+ logger.info(f"{'=' * 60}")
794
+
795
+ # CodeSearchNet results
796
+ overall = results.get("codesearch_overall", {})
797
  if overall:
798
+ logger.info("🔍 CodeSearchNet Performance:")
799
+ for metric, value in overall.items():
800
+ logger.info(f" 🎯 {metric.upper()}: {value:.4f}")
801
+
802
+ # Benchmark results
803
+ if not results.get("benchmark_skipped", False):
804
+ size_metrics = results.get("size_metrics", {})
805
+ if size_metrics:
806
+ logger.info(f"\n📏 Model Size: {size_metrics.get('disk_size_mb', 0):.1f}MB")
807
+ if "parameters_millions" in size_metrics:
808
+ logger.info(f"🔢 Parameters: {size_metrics['parameters_millions']:.1f}M")
809
+
810
+ speed_benchmarks = results.get("speed_benchmarks", {})
811
+ if "medium" in speed_benchmarks and "batch_32" in speed_benchmarks["medium"]:
812
+ batch_32 = speed_benchmarks["medium"]["batch_32"]
813
+ logger.info(f"⚡ Throughput (batch 32): {batch_32.get('texts_per_second', 0):.1f} texts/sec")
814
+
815
+ # Language breakdown
816
+ languages = results.get("codesearch_languages", {})
817
+ if languages:
818
+ logger.info("\n📋 Language Breakdown:")
819
+ for lang, lang_results in languages.items():
820
+ metrics = lang_results.get("metrics", {})
821
+ ndcg10 = metrics.get("ndcg@10", 0)
822
+ mrr = metrics.get("mrr", 0)
823
+ logger.info(f" {lang}: NDCG@10={ndcg10:.4f}, MRR={mrr:.4f}")
824
+
825
+
826
+ def create_comparison_report(all_results: list[dict[str, Any]], output_dir: str = LOCAL_EVALUATION_DIR) -> None:
827
+ """Create a comprehensive comparison report with both CodeSearchNet and benchmark data."""
828
  if not all_results:
829
  return
830
 
831
+ logger.info("📊 Creating comprehensive comparison report...")
832
+
833
+ # Create evaluation comparison dataframe
834
+ evaluation_data = []
835
+ benchmark_data = []
836
+
837
+ for result in all_results:
838
+ model_name = result.get("model_name", "Unknown")
839
+
840
+ # CodeSearchNet data
841
+ overall = result.get("codesearch_overall", {})
842
+ eval_row = {"model_name": model_name}
843
+ eval_row.update(overall)
844
+ evaluation_data.append(eval_row)
845
+
846
+ # Benchmark data (if available)
847
+ if not result.get("benchmark_skipped", False):
848
+ benchmark_row = {"model_name": model_name}
849
+ size_metrics = result.get("size_metrics", {})
850
+ speed_benchmarks = result.get("speed_benchmarks", {})
851
+
852
+ benchmark_row.update(size_metrics)
853
+ if "medium" in speed_benchmarks and "batch_32" in speed_benchmarks["medium"]:
854
+ batch_32 = speed_benchmarks["medium"]["batch_32"]
855
+ benchmark_row["best_throughput"] = batch_32.get("texts_per_second", 0)
856
+ benchmark_data.append(benchmark_row)
857
+
858
+ # Save comparison results
859
  output_path = Path(output_dir)
860
+ output_path.mkdir(parents=True, exist_ok=True)
861
 
862
+ # Combined evaluation comparison CSV (includes both CodeSearchNet and key benchmark metrics)
863
+ if evaluation_data and benchmark_data:
864
+ # Merge evaluation and benchmark data
865
+ combined_data = []
866
+ benchmark_dict = {row["model_name"]: row for row in benchmark_data}
867
+
868
+ for eval_row in evaluation_data:
869
+ model_name = eval_row["model_name"]
870
+ combined_row = eval_row.copy()
871
+
872
+ # Add benchmark metrics if available
873
+ if model_name in benchmark_dict:
874
+ benchmark_row = benchmark_dict[model_name]
875
+ combined_row.update(
876
+ {
877
+ "disk_size_mb": benchmark_row.get("disk_size_mb", 0),
878
+ "parameters_millions": benchmark_row.get("parameters_millions", 0),
879
+ "best_throughput": benchmark_row.get("best_throughput", 0),
880
+ }
881
+ )
882
 
883
+ combined_data.append(combined_row)
 
884
 
885
+ combined_df = pd.DataFrame(combined_data)
886
+ combined_csv = output_path / "comprehensive_comparison.csv"
887
+ combined_df.to_csv(combined_csv, index=False)
888
+ logger.info(f"📄 Comprehensive comparison CSV saved: {combined_csv}")
889
 
890
+ # Detailed JSON export
891
+ json_path = output_path / "comprehensive_evaluation.json"
892
+ with json_path.open("w") as f:
893
+ json.dump(all_results, f, indent=2, default=str)
894
+ logger.info(f"📄 Comprehensive results JSON saved: {json_path}")
895
 
896
 
897
+ # =============================================================================
898
+ # MAIN EVALUATION FUNCTIONS
899
+ # =============================================================================
900
+
901
+
902
+ def run_evaluation(
903
  models: list[str],
904
  max_queries: int = 1000,
905
  languages: list[str] | None = None,
906
+ use_beam: bool = False,
907
+ skip_benchmark: bool = False,
 
908
  ) -> list[dict[str, Any]]:
909
+ """Main evaluation function that handles both local and Beam execution."""
910
+ logger.info(f"🚀 Starting comprehensive evaluation ({'Beam' if use_beam else 'Local'})")
911
+ logger.info(f"📊 Evaluating {len(models)} models on {len(languages or languages_config.all)} languages")
912
+ logger.info(f"⚡ Benchmarking: {'Disabled' if skip_benchmark else 'Enabled'}")
913
+
914
+ # Check for existing results and skip already evaluated models
915
+ models_to_evaluate = []
916
+ skipped_models = []
917
+ all_results = []
918
 
919
+ for model_path in models:
920
+ model_name = Path(model_path).name
921
+ existing_results = check_existing_results(model_name)
922
 
923
+ if existing_results:
924
+ logger.info(f"✅ Model {model_name} already evaluated, skipping")
925
+ all_results.append(existing_results)
926
+ skipped_models.append(model_name)
927
+ else:
928
+ models_to_evaluate.append(model_path)
929
 
930
+ if not models_to_evaluate:
931
+ logger.info("🎉 All models already evaluated!")
932
+ return all_results
933
 
934
+ logger.info(f"📊 Need to evaluate {len(models_to_evaluate)} models")
935
+
936
+ if use_beam:
937
+ # Run on Beam
938
+ new_results = _run_beam_evaluation(models_to_evaluate, max_queries, languages, skip_benchmark)
939
+ else:
940
+ # Run locally
941
+ new_results = _run_local_evaluation(models_to_evaluate, max_queries, languages, skip_benchmark)
942
+
943
+ all_results.extend(new_results)
944
 
945
+ # Create comparison report
946
+ if len(all_results) > 1:
947
+ create_comparison_report(all_results)
948
+
949
+ # Print summary
950
+ newly_evaluated = len(new_results)
951
+ logger.info(f"\n{'=' * 60}")
952
+ logger.info("📊 EVALUATION SUMMARY")
953
+ logger.info(f"{'=' * 60}")
954
+ logger.info(f"📊 Total models: {len(models)}")
955
+ logger.info(f"✅ Newly evaluated: {newly_evaluated}")
956
+ logger.info(f"⏭️ Skipped (already done): {len(skipped_models)}")
957
+ logger.info(f"🎯 Total results: {len(all_results)}")
958
+ logger.info(f"⚡ Benchmarking: {'Disabled' if skip_benchmark else 'Enabled'}")
959
+
960
+ return all_results
961
+
962
+
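A minimal programmatic sketch of the entry point above (illustrative; the model path is a placeholder and the language subset assumes those names exist in `languages_config.all`):

```python
from distiller.evaluate import run_evaluation

results = run_evaluation(
    models=["./code_model2vec/final/my_distilled_model"],
    max_queries=500,
    languages=["python", "java"],  # subset; None falls back to languages_config.all
    use_beam=False,                # True dispatches each model to Beam instead
    skip_benchmark=True,           # task metrics only, no speed/memory benchmarks
)
print(results[0].get("codesearch_overall", {}))
```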
963
+ def _run_local_evaluation(
964
+ models: list[str],
965
+ max_queries: int = 1000,
966
+ languages: list[str] | None = None,
967
+ skip_benchmark: bool = False,
968
+ ) -> list[dict[str, Any]]:
969
+ """Run comprehensive evaluation locally."""
970
+ logger.info("🖥️ Running local comprehensive evaluation")
971
+
972
+ results = []
973
  for model_path in models:
974
  model_name = Path(model_path).name
975
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
976
  logger.info(f"\n{'=' * 60}")
977
  logger.info(f"🔍 Evaluating model: {model_name}")
 
978
  logger.info(f"{'=' * 60}")
979
 
980
  try:
981
+ evaluator = ComprehensiveModelEvaluator(model_path, model_name)
982
+ result = evaluator.run_comprehensive_evaluation(max_queries, languages, skip_benchmark)
 
 
983
 
984
+ # Save results locally
985
+ save_evaluation_results(result)
986
+ print_results_summary(result)
987
+ results.append(result)
 
988
 
989
  except Exception:
990
  logger.exception(f"❌ Failed to evaluate {model_name}")
991
  continue
992
 
993
+ return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
994
 
995
 
996
  @function(
997
  gpu=GPU_NAME,
998
+ volumes=[Volume(name=VOLUME_CONFIG.name, mount_path=VOLUME_CONFIG.mount_path)],
999
  image=IMAGE,
1000
  secrets=["HF_ACCESS_TOKEN"],
1001
+ env=BEAM_ENV_SETTINGS,
1002
+ timeout=3600 * 8, # 8 hours for comprehensive evaluation
 
 
 
1003
  )
1004
+ def _beam_evaluate_single_model(
1005
+ model_path: str,
1006
+ max_queries: int = 1000,
1007
+ languages: list[str] | None = None,
1008
+ skip_benchmark: bool = False,
1009
+ ) -> dict[str, Any]:
1010
+ """Beam function to comprehensively evaluate a single model."""
1011
+ model_name = Path(model_path).name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1012
 
1013
+ logger.info(f"🚀 Beam comprehensive evaluation starting for {model_name}")
 
 
1014
 
1015
+ try:
1016
+ evaluator = ComprehensiveModelEvaluator(model_path, model_name)
1017
+ results = evaluator.run_comprehensive_evaluation(max_queries, languages, skip_benchmark)
1018
 
1019
+ # Save to Beam volume as single comprehensive file
1020
+ volume_results_dir = Path(VOLUME_CONFIG.mount_path) / "evaluation_results"
1021
+ volume_results_dir.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
 
1022
 
1023
+ safe_model_name = get_safe_model_name(model_name)
1024
+ result_file = volume_results_dir / f"comprehensive_eval_{safe_model_name}.json"
 
 
1025
 
1026
+ with result_file.open("w") as f:
1027
+ json.dump(results, f, indent=2, default=str)
1028
 
1029
+ logger.info(f"💾 Saved Beam comprehensive evaluation results for {model_name}")
1030
+ return results
1031
 
1032
+ except Exception:
1033
+ logger.exception(f"❌ Beam comprehensive evaluation failed for {model_name}")
1034
+ return {}
 
 
 
 
 
 
 
 
 
 
 
1035
 
1036
 
1037
+ def _run_beam_evaluation(
1038
+ models: list[str],
1039
  max_queries: int = 1000,
1040
  languages: list[str] | None = None,
1041
+ skip_benchmark: bool = False,
1042
  ) -> list[dict[str, Any]]:
1043
+ """Run comprehensive evaluation on Beam and download results."""
1044
+ logger.info("☁️ Running Beam comprehensive evaluation")
 
1045
 
1046
+ results = []
1047
  for model_path in models:
1048
  model_name = Path(model_path).name
1049
 
1050
+ logger.info(f"🚀 Starting Beam comprehensive evaluation for {model_name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1051
 
1052
  try:
1053
+ # Run evaluation on Beam
1054
+ result = _beam_evaluate_single_model.remote(model_path, max_queries, languages, skip_benchmark)
 
 
 
 
 
 
 
1055
 
1056
+ if result:
1057
+ # Download the comprehensive result file from Beam
1058
+ success = download_specific_evaluation_file(
1059
+ VOLUME_CONFIG.name,
1060
+ model_name,
1061
+ "evaluation_results",
1062
+ LOCAL_EVALUATION_DIR,
1063
+ file_prefix="comprehensive_eval",
1064
+ )
1065
 
1066
+ if success:
1067
+ logger.info(f"📥 Downloaded comprehensive results for {model_name}")
1068
+ print_results_summary(result)
1069
+ results.append(result)
1070
+ else:
1071
+ logger.warning(f"⚠️ Could not download results for {model_name}")
1072
 
1073
  except Exception:
1074
+ logger.exception(f"❌ Beam comprehensive evaluation failed for {model_name}")
1075
  continue
1076
 
1077
+ return results
 
 
 
1078
 
 
 
 
 
 
 
 
1079
 
1080
+ # =============================================================================
1081
+ # CLI INTERFACE
1082
+ # =============================================================================
1083
 
 
1084
 
1085
+ def main(
1086
+ use_beam: bool = typer.Option(default=False, help="Use Beam for evaluation"),
1087
+ skip_third_party: bool = typer.Option(default=False, help="Skip third-party models"),
1088
+ skip_benchmark: bool = typer.Option(default=False, help="Skip performance benchmarking"),
1089
+ max_queries: int = typer.Option(default=1000, help="Maximum queries per language"),
1090
+ ) -> None:
1091
+ """Main comprehensive evaluation function."""
1092
+ logger.info("🚀 Starting comprehensive model evaluation (CodeSearchNet + Performance)")
1093
 
1094
+ # Build model list
1095
+ models = []
 
 
 
 
 
1096
 
1097
+ # Add third-party models if not skipped
1098
+ if not skip_third_party:
1099
+ logger.info("📊 Including third-party peer models for comparison")
1100
+ models.extend(DEFAULT_EVALUATION_MODELS)
1101
+ else:
1102
+ logger.info("⏭️ Skipping third-party models")
1103
 
1104
+ # Discover local models from code_model2vec/final
1105
+ logger.info("🔍 Discovering local distillation models...")
1106
+ local_models = discover_local_models()
1107
 
1108
+ if local_models:
1109
+ logger.info(f"✅ Found {len(local_models)} local models:")
1110
+ for model_path in local_models:
1111
+ models.append(model_path)
1112
+ logger.info(f" 📁 {Path(model_path).name}")
1113
+ else:
1114
+ logger.warning("⚠️ No local distillation models found")
1115
+ if skip_third_party:
1116
+ logger.error("❌ No models to evaluate!")
1117
+ return
1118
+
1119
+ if not models:
1120
+ logger.error("❌ No models to evaluate!")
1121
+ return
1122
 
1123
+ logger.info(f"📊 Will evaluate {len(models)} models:")
1124
+ for i, model in enumerate(models, 1):
1125
+ logger.info(f" {i}. {Path(model).name}")
1126
+
1127
+ # Run evaluation
1128
+ results = run_evaluation(
1129
+ models=models,
1130
  max_queries=max_queries,
1131
+ languages=languages_config.all,
1132
+ use_beam=use_beam,
1133
+ skip_benchmark=skip_benchmark,
1134
  )
1135
 
1136
+ logger.info("🎉 Comprehensive evaluation workflow completed!")
1137
+ logger.info(f"📊 Successfully evaluated {len(results)} models")
1138
+ logger.info(f"💾 Results saved to: {LOCAL_EVALUATION_DIR}")
1139
+ logger.info("📄 Format: Single comprehensive JSON per model (CodeSearchNet + Benchmarks)")
1140
+
1141
 
1142
  if __name__ == "__main__":
1143
+ typer.run(main)
src/distiller/patch_utils.py CHANGED
@@ -67,6 +67,15 @@ def apply_patch_file(patch_file: Path, target_dir: Path) -> bool:
67
  try:
68
  logger.info(f"Applying patch: {patch_file.name}")
69
 
 
 
 
 
 
 
 
 
 
70
  # Use patch command with the following options:
71
  # -p1: strip 1 leading directory from paths
72
  # -d: change to directory before applying
@@ -121,6 +130,9 @@ def apply_all_patches() -> int:
121
  target_dir = get_site_packages_path()
122
  logger.info(f"Applying patches to: {target_dir}")
123
 
 
 
 
124
  success_count = 0
125
 
126
  # Sort patch files for consistent ordering
@@ -132,6 +144,113 @@ def apply_all_patches() -> int:
132
  return success_count
133
 
134
 
135
  def main() -> None:
136
  """Main function for standalone execution."""
137
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
67
  try:
68
  logger.info(f"Applying patch: {patch_file.name}")
69
 
70
+ # Check if patch is already applied
71
+ if is_patch_already_applied(patch_file, target_dir):
72
+ logger.info(f"Patch {patch_file.name} already applied")
73
+ return True
74
+
75
+ # Clean any duplicate validation code before applying
76
+ if "model2vec.patch" in patch_file.name:
77
+ clean_duplicate_validation_code(target_dir)
78
+
79
  # Use patch command with the following options:
80
  # -p1: strip 1 leading directory from paths
81
  # -d: change to directory before applying
 
130
  target_dir = get_site_packages_path()
131
  logger.info(f"Applying patches to: {target_dir}")
132
 
133
+ # Clean any existing duplicates first
134
+ clean_duplicate_validation_code(target_dir)
135
+
136
  success_count = 0
137
 
138
  # Sort patch files for consistent ordering
 
144
  return success_count
145
 
146
 
147
+ def is_patch_already_applied(patch_file: Path, target_dir: Path) -> bool:
148
+ """
149
+ Check if a patch has already been applied by looking for specific markers.
150
+
151
+ Args:
152
+ patch_file: Path to the .patch file
153
+ target_dir: Target directory (usually site-packages)
154
+
155
+ Returns:
156
+ True if patch appears to be already applied, False otherwise
157
+ """
158
+ try:
159
+ # For model2vec.patch, check if the validation code is already present
160
+ if "model2vec.patch" in patch_file.name:
161
+ inference_file = target_dir / "model2vec" / "distill" / "inference.py"
162
+ if inference_file.exists():
163
+ inference_content = inference_file.read_text()
164
+ # Check for the specific validation code we're adding
165
+ if (
166
+ "Token-vector mismatch:" in inference_content
167
+ and "Truncating to prevent failure" in inference_content
168
+ ):
169
+ # Also make sure it's in the right place (before return statement, not after)
170
+ lines = inference_content.split("\n")
171
+ for i, line in enumerate(lines):
172
+ if "return out_tokens, out_weights" in line:
173
+ # Check if validation code appears before this return
174
+ preceding_lines = lines[max(0, i - 10) : i]
175
+ if any("Token-vector mismatch:" in pline for pline in preceding_lines):
176
+ return True
177
+ break
178
+
179
+ return False
180
+
181
+ except Exception as e:
182
+ logger.warning(f"Error checking if patch {patch_file.name} is applied: {e}")
183
+ return False
184
+
185
+
186
+ def clean_duplicate_validation_code(target_dir: Path) -> bool:
187
+ """
188
+ Clean up duplicate validation code that might have been added by multiple patch applications.
189
+
190
+ Args:
191
+ target_dir: Target directory (usually site-packages)
192
+
193
+ Returns:
194
+ True if cleanup was successful, False otherwise
195
+ """
196
+ try:
197
+ inference_file = target_dir / "model2vec" / "distill" / "inference.py"
198
+ if not inference_file.exists():
199
+ return True
200
+
201
+ content = inference_file.read_text()
202
+ lines = content.split("\n")
203
+
204
+ # Find all instances of the validation code
205
+ validation_indices = []
206
+ for i, line in enumerate(lines):
207
+ if "Token-vector mismatch:" in line:
208
+ validation_indices.append(i)
209
+
210
+ if len(validation_indices) <= 1:
211
+ return True # No duplicates or no validation code
212
+
213
+ # Keep only the validation code that appears before a return statement
214
+ lines_to_keep = []
215
+ skip_until = -1
216
+
217
+ for i, line in enumerate(lines):
218
+ if i <= skip_until:
219
+ continue
220
+
221
+ # If this is validation code
222
+ if "Token-vector mismatch:" in line:
223
+ # Look ahead to see if there's a return statement nearby
224
+ has_return_after = False
225
+ for j in range(i, min(len(lines), i + 20)):
226
+ if "return out_tokens, out_weights" in lines[j]:
227
+ has_return_after = True
228
+ break
229
+
230
+ # Keep this validation block only if it's followed by a return
231
+ if has_return_after:
232
+ lines_to_keep.append(line)
233
+ else:
234
+ # Skip this validation block (it's a duplicate)
235
+ # Find the end of this validation block
236
+ for j in range(i + 1, len(lines)):
237
+ if lines[j].strip() == "" or not lines[j].startswith(" "):
238
+ skip_until = j - 1
239
+ break
240
+ else:
241
+ lines_to_keep.append(line)
242
+
243
+ # Write back the cleaned content
244
+ cleaned_content = "\n".join(lines_to_keep)
245
+ inference_file.write_text(cleaned_content)
246
+ logger.info("Cleaned duplicate validation code from inference.py")
247
+ return True
248
+
249
+ except Exception as e:
250
+ logger.warning(f"Error cleaning duplicate validation code: {e}")
251
+ return False
252
+
253
+
254
  def main() -> None:
255
  """Main function for standalone execution."""
256
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
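A minimal usage sketch of the two helpers above, assuming it runs inside this patches module (the patch filename and location are illustrative, not part of this commit):

    from pathlib import Path

    target_dir = get_site_packages_path()          # site-packages of the active environment
    patch_file = Path("patches/model2vec.patch")   # illustrative path
    if not is_patch_already_applied(patch_file, target_dir):
        clean_duplicate_validation_code(target_dir)  # drop stale duplicates before re-applying
        # ...then apply the patch, e.g. via the `patch -p1 -d` invocation described earlier.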
src/distiller/sync.py DELETED
@@ -1,262 +0,0 @@
1
- """
2
- Sync utility for downloading files from Beam volume to local directory.
3
-
4
- This module provides functionality to download generated files from the Beam volume
5
- back to the local filesystem, including:
6
- - Final distilled model files (model.safetensors, tokenizer.json, etc.)
7
- - Analysis reports and charts (README.md, comparison charts, etc.)
8
- """
9
-
10
- import logging
11
- import shutil
12
- from pathlib import Path
13
-
14
- from .beam_utils import create_beam_utilities
15
-
16
- # Configure logging
17
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
18
- logger = logging.getLogger(__name__)
19
-
20
- # Beam volume configuration (must match distill.py)
21
- VOLUME_NAME = "gte_qwen2_m2v_code"
22
- VOLUME_PATH = "./gte_qwen2_m2v_code"
23
-
24
- # Model files to sync
25
- MODEL_FILES = [
26
- "model.safetensors",
27
- "tokenizer.json",
28
- "modules.json",
29
- "config.json",
30
- "pytorch_model.bin", # Backup format
31
- "vocab.txt", # If present
32
- ]
33
-
34
- # Analysis directories and files
35
- ANALYSIS_DIRS = [
36
- "analysis_results/reports",
37
- "analysis_results/charts",
38
- "evaluation_results",
39
- ]
40
-
41
- ANALYSIS_FILES = [
42
- "analysis_results/reports/analysis_report.md",
43
- "analysis_results/reports/README.md",
44
- "analysis_results/charts/*.png",
45
- "analysis_results/charts/*.html",
46
- "evaluation_results/*.json",
47
- "evaluation_results/comparisons/*.csv",
48
- ]
49
-
50
-
51
- def sync_model_files(output_dir: str) -> bool:
52
- """Download final model files from Beam volume."""
53
- logger.info("🔄 Syncing model files from Beam volume...")
54
-
55
- output_path = Path(output_dir)
56
- output_path.mkdir(parents=True, exist_ok=True)
57
-
58
- # First, let's debug what's actually in the volume
59
- volume_root = Path(VOLUME_PATH)
60
- logger.info(f"🔍 Debugging volume contents at: {volume_root}")
61
-
62
- if volume_root.exists():
63
- logger.info("📁 Volume root directory contents:")
64
- for item in volume_root.iterdir():
65
- if item.is_file():
66
- logger.info(f" 📄 {item.name} ({item.stat().st_size} bytes)")
67
- elif item.is_dir():
68
- logger.info(f" 📁 {item.name}/ (directory)")
69
- # List files in important subdirectories
70
- if item.name in ["models", "checkpoints", "gte_qwen2_m2v_code"]:
71
- try:
72
- logger.info(f" Contents of {item.name}/:")
73
- for subitem in item.iterdir():
74
- if subitem.is_file():
75
- logger.info(f" 📄 {subitem.name} ({subitem.stat().st_size} bytes)")
76
- else:
77
- logger.info(f" 📁 {subitem.name}/")
78
- # Check one level deeper for model files
79
- if subitem.is_dir():
80
- for subsubitem in subitem.iterdir():
81
- if subsubitem.is_file() and subsubitem.name in MODEL_FILES:
82
- logger.info(f" 🎯 FOUND MODEL FILE: {subsubitem}")
83
- except Exception as e:
84
- logger.warning(f" Error exploring {item.name}: {e}")
85
-
86
- # Also check for model files directly in root
87
- logger.info("🔍 Checking for model files directly in volume root:")
88
- for model_file in MODEL_FILES:
89
- root_file = volume_root / model_file
90
- if root_file.exists():
91
- logger.info(f" 🎯 FOUND: {model_file} in root ({root_file.stat().st_size} bytes)")
92
- else:
93
- logger.error(f"❌ Volume root does not exist: {volume_root}")
94
- return False
95
-
96
- # Since training completed successfully, look for model files in all possible locations
97
- model_locations = [
98
- Path(VOLUME_PATH), # Root of volume (where final model was saved)
99
- Path(VOLUME_PATH) / "models" / "refined_model", # Refined model directory
100
- ]
101
-
102
- synced_files = []
103
-
104
- for location in model_locations:
105
- logger.info(f"📂 Checking model location: {location}")
106
-
107
- if not location.exists():
108
- logger.info(f" ⚠️ Location does not exist: {location}")
109
- continue
110
-
111
- # Try to download each model file directly
112
- for model_file in MODEL_FILES:
113
- source_path = location / model_file
114
- dest_path = output_path / model_file
115
-
116
- if source_path.exists():
117
- try:
118
- shutil.copy2(source_path, dest_path)
119
- synced_files.append(model_file)
120
- logger.info(f"✅ Downloaded: {model_file}")
121
- except Exception as e:
122
- logger.warning(f"⚠️ Failed to copy {model_file}: {e}")
123
-
124
- if synced_files:
125
- logger.info(f"🎉 Successfully synced {len(synced_files)} model files:")
126
- for file in synced_files:
127
- logger.info(f" ✓ {file}")
128
- return True
129
- logger.error("❌ No model files found to sync")
130
- return False
131
-
132
-
133
- def sync_analysis_files(output_dir: str) -> bool:
134
- """Download analysis reports and charts from Beam volume."""
135
- logger.info("🔄 Syncing analysis files from Beam volume...")
136
-
137
- output_path = Path(output_dir)
138
- output_path.mkdir(parents=True, exist_ok=True)
139
-
140
- synced_files = []
141
-
142
- # Sync analysis reports (including README.md)
143
- analysis_reports_dir = Path(VOLUME_PATH) / "analysis_results" / "reports"
144
- if analysis_reports_dir.exists():
145
- for report_file in analysis_reports_dir.glob("*.md"):
146
- dest_path = output_path / report_file.name
147
- try:
148
- shutil.copy2(report_file, dest_path)
149
- synced_files.append(str(report_file.name))
150
- logger.info(f"✅ Downloaded report: {report_file.name}")
151
-
152
- # Special handling for README.md - copy to root
153
- if report_file.name in {"analysis_report.md", "README.md"}:
154
- root_readme = Path(output_dir) / "README.md"
155
- shutil.copy2(report_file, root_readme)
156
- logger.info("✅ Updated root README.md")
157
-
158
- except Exception as e:
159
- logger.warning(f"⚠️ Failed to copy {report_file.name}: {e}")
160
-
161
- # Sync charts
162
- charts_dir = Path(VOLUME_PATH) / "analysis_results" / "charts"
163
- local_charts_dir = output_path / "charts"
164
- if charts_dir.exists():
165
- local_charts_dir.mkdir(exist_ok=True)
166
-
167
- for chart_file in charts_dir.glob("*"):
168
- if chart_file.is_file():
169
- dest_path = local_charts_dir / chart_file.name
170
- try:
171
- shutil.copy2(chart_file, dest_path)
172
- synced_files.append(f"charts/{chart_file.name}")
173
- logger.info(f"✅ Downloaded chart: {chart_file.name}")
174
- except Exception as e:
175
- logger.warning(f"⚠️ Failed to copy chart {chart_file.name}: {e}")
176
-
177
- # Sync evaluation results
178
- eval_dir = Path(VOLUME_PATH) / "evaluation_results"
179
- local_eval_dir = output_path / "evaluation_results"
180
- if eval_dir.exists():
181
- local_eval_dir.mkdir(exist_ok=True)
182
-
183
- for eval_file in eval_dir.glob("*.json"):
184
- dest_path = local_eval_dir / eval_file.name
185
- try:
186
- shutil.copy2(eval_file, dest_path)
187
- synced_files.append(f"evaluation_results/{eval_file.name}")
188
- logger.info(f"✅ Downloaded evaluation: {eval_file.name}")
189
- except Exception as e:
190
- logger.warning(f"⚠️ Failed to copy evaluation {eval_file.name}: {e}")
191
-
192
- if synced_files:
193
- logger.info(f"🎉 Successfully synced {len(synced_files)} analysis files:")
194
- for file in synced_files[:10]: # Show first 10
195
- logger.info(f" ✓ {file}")
196
- if len(synced_files) > 10:
197
- logger.info(f" ... and {len(synced_files) - 10} more files")
198
- return True
199
- logger.error("❌ No analysis files found to sync")
200
- return False
201
-
202
-
203
- def sync_files(
204
- model_files: bool = False,
205
- analysis_files: bool = False,
206
- all_files: bool = False,
207
- output_dir: str = ".",
208
- ) -> None:
209
- """Main sync function to download files from Beam volume."""
210
- logger.info("🚀 Starting file sync from Beam volume")
211
- logger.info(f"📁 Local output directory: {output_dir}")
212
-
213
- # Initialize Beam utilities (read-only)
214
- try:
215
- volume_mgr, checkpoint_mgr, model_mgr, eval_mgr = create_beam_utilities(VOLUME_NAME, VOLUME_PATH)
216
- logger.info(f"✅ Connected to Beam volume: {VOLUME_NAME}")
217
- except Exception:
218
- logger.exception("❌ Failed to connect to Beam volume")
219
- logger.info("Make sure you have run the distillation/evaluation on Beam first")
220
- return
221
-
222
- # Check what files to sync
223
- sync_model = model_files or all_files
224
- sync_analysis = analysis_files or all_files
225
-
226
- if not (sync_model or sync_analysis):
227
- logger.error("❌ No file types specified. Use --model-files, --analysis-files, or --all")
228
- return
229
-
230
- success_count = 0
231
-
232
- # Sync model files
233
- if sync_model:
234
- logger.info("\n" + "=" * 60) # noqa: G003
235
- logger.info("MODEL FILES SYNC")
236
- logger.info("=" * 60)
237
- if sync_model_files(output_dir):
238
- success_count += 1
239
-
240
- # Sync analysis files
241
- if sync_analysis:
242
- logger.info("\n" + "=" * 60) # noqa: G003
243
- logger.info("ANALYSIS FILES SYNC")
244
- logger.info("=" * 60)
245
- if sync_analysis_files(output_dir):
246
- success_count += 1
247
-
248
- # Summary
249
- logger.info("\n" + "=" * 60) # noqa: G003
250
- logger.info("SYNC SUMMARY")
251
- logger.info("=" * 60)
252
-
253
- total_requested = sum([sync_model, sync_analysis])
254
-
255
- if success_count == total_requested:
256
- logger.info("🎉 All requested files synced successfully!")
257
- elif success_count > 0:
258
- logger.info(f"⚠️ Partial sync: {success_count}/{total_requested} file types synced")
259
- else:
260
- logger.error("❌ No files were synced")
261
-
262
- logger.info(f"📂 Files saved to: {Path(output_dir).absolute()}")

src/distiller/utils.py ADDED
@@ -0,0 +1,373 @@
1
+ """
2
+ Common utilities for the distiller package.
3
+
4
+ This module provides shared functionality used across multiple components
5
+ including model discovery, result management, and initialization helpers.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ from pathlib import Path
11
+ from types import TracebackType
12
+ from typing import Any
13
+
14
+ from .beam_utils import (
15
+ BeamCheckpointManager,
16
+ BeamEvaluationManager,
17
+ BeamModelManager,
18
+ BeamVolumeManager,
19
+ create_beam_utilities,
20
+ )
21
+ from .config import VolumeConfig, get_safe_model_name, get_volume_config, setup_logging
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # =============================================================================
26
+ # BEAM UTILITIES MANAGEMENT
27
+ # =============================================================================
28
+
29
+
30
+ class BeamContext:
31
+ """Context manager for Beam utilities with consistent initialization."""
32
+
33
+ def __init__(self, workflow: str, volume_config: VolumeConfig | None = None) -> None:
34
+ """
35
+ Initialize Beam context.
36
+
37
+ Args:
38
+ workflow: Workflow type (distill, evaluate, benchmark, etc.)
39
+ volume_config: Optional custom volume config; defaults to get_volume_config() when omitted
40
+ """
41
+ self.workflow = workflow
42
+ self.volume_config = volume_config or get_volume_config()
43
+ self.volume_manager: BeamVolumeManager | None = None
44
+ self.checkpoint_manager: BeamCheckpointManager | None = None
45
+ self.model_manager: BeamModelManager | None = None
46
+ self.evaluation_manager: BeamEvaluationManager | None = None
47
+
48
+ def __enter__(self) -> tuple[BeamVolumeManager, BeamCheckpointManager, BeamModelManager, BeamEvaluationManager]:
49
+ """Enter context and initialize utilities."""
50
+ logger.info(f"🚀 Initializing Beam utilities for {self.workflow}")
51
+ logger.info(f"📁 Volume: {self.volume_config.name} at {self.volume_config.mount_path}")
52
+
53
+ self.volume_manager, self.checkpoint_manager, self.model_manager, self.evaluation_manager = (
54
+ create_beam_utilities(self.volume_config.name, self.volume_config.mount_path)
55
+ )
56
+
57
+ return self.volume_manager, self.checkpoint_manager, self.model_manager, self.evaluation_manager
58
+
59
+ def __exit__(
60
+ self,
61
+ exc_type: type[BaseException] | None,
62
+ exc_val: BaseException | None,
63
+ exc_tb: TracebackType | None,
64
+ ) -> None:
65
+ """Exit context with cleanup if needed."""
66
+ if exc_type:
67
+ logger.error(f"❌ Error in Beam context for {self.workflow}: {exc_val}")
68
+ else:
69
+ logger.info(f"✅ Beam context for {self.workflow} completed successfully")
70
+
71
+
72
+ def get_beam_utilities() -> tuple[BeamVolumeManager, BeamCheckpointManager, BeamModelManager, BeamEvaluationManager]:
73
+ """
74
+ Get Beam utilities using the default volume configuration.
75
+
76
+ Returns:
77
+ Tuple of (volume_manager, checkpoint_manager, model_manager, evaluation_manager)
78
+ """
79
+ volume_config = get_volume_config()
80
+ return create_beam_utilities(volume_config.name, volume_config.mount_path)
81
+
82
+
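A minimal sketch of BeamContext used as a context manager (the workflow name is illustrative):

    from distiller.utils import BeamContext

    with BeamContext(workflow="evaluate") as (volume_mgr, checkpoint_mgr, model_mgr, eval_mgr):
        # All four managers are bound to the same Beam volume for the duration of the block;
        # success or failure is logged on exit.
        ...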
83
+ # =============================================================================
84
+ # MODEL DISCOVERY
85
+ # =============================================================================
86
+
87
+
88
+ def discover_simplified_models(base_path: str | Path = ".") -> list[str]:
89
+ """
90
+ Discover simplified distillation models in the specified directory.
91
+
92
+ Args:
93
+ base_path: Base path to search for models
94
+
95
+ Returns:
96
+ List of model paths sorted alphabetically
97
+ """
98
+ base = Path(base_path)
99
+
100
+ # Look for models in common locations
101
+ search_patterns = [
102
+ "code_model2vec/final/**/",
103
+ "final/**/",
104
+ "code_model2vec_*/",
105
+ "*/config.json",
106
+ "*.safetensors",
107
+ ]
108
+
109
+ discovered_models = []
110
+
111
+ for pattern in search_patterns:
112
+ matches = list(base.glob(pattern))
113
+ for match in matches:
114
+ if match.is_dir():
115
+ # Check if it's a valid model directory
116
+ if (match / "config.json").exists() or (match / "model.safetensors").exists():
117
+ discovered_models.append(str(match))
118
+ elif match.name == "config.json":
119
+ # Add parent directory if config.json found
120
+ discovered_models.append(str(match.parent))
121
+
122
+ # Remove duplicates and sort
123
+ unique_models = sorted(set(discovered_models))
124
+
125
+ logger.info(f"🔍 Discovered {len(unique_models)} models in {base_path}")
126
+ for model in unique_models:
127
+ logger.info(f" 📁 {model}")
128
+
129
+ return unique_models
130
+
131
+
132
+ def validate_model_path(model_path: str | Path, volume_manager: BeamVolumeManager | None = None) -> str | None:
133
+ """
134
+ Validate and resolve model path, checking local filesystem and Beam volumes.
135
+
136
+ Args:
137
+ model_path: Path to model (can be local path or HuggingFace model name)
138
+ volume_manager: Optional volume manager for Beam volume checks
139
+
140
+ Returns:
141
+ Resolved model path or None if not found
142
+ """
143
+ path = Path(model_path)
144
+
145
+ # Check if it's a HuggingFace model name
146
+ if "/" in str(model_path) and not path.exists() and not str(model_path).startswith("/"):
147
+ logger.info(f"📥 Treating as HuggingFace model: {model_path}")
148
+ return str(model_path)
149
+
150
+ # Check local filesystem
151
+ if path.exists():
152
+ logger.info(f"✅ Found local model: {model_path}")
153
+ return str(path)
154
+
155
+ # Check Beam volume if available
156
+ if volume_manager:
157
+ volume_path = Path(volume_manager.mount_path) / path.name
158
+ if volume_path.exists():
159
+ logger.info(f"✅ Found model in Beam volume: {volume_path}")
160
+ return str(volume_path)
161
+
162
+ # Check volume root
163
+ root_path = Path(volume_manager.mount_path)
164
+ if (root_path / "config.json").exists():
165
+ logger.info(f"✅ Found model in Beam volume root: {root_path}")
166
+ return str(root_path)
167
+
168
+ logger.warning(f"⚠️ Model not found: {model_path}")
169
+ return None
170
+
171
+
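A minimal sketch combining discovery and validation (the search directory is illustrative):

    from distiller.utils import discover_simplified_models, validate_model_path

    for candidate in discover_simplified_models("code_model2vec"):
        resolved = validate_model_path(candidate)
        if resolved is not None:
            print(f"Ready to evaluate: {resolved}")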
172
+ # =============================================================================
173
+ # RESULT MANAGEMENT
174
+ # =============================================================================
175
+
176
+
177
+ def save_results_with_backup(
178
+ results: dict[str, Any],
179
+ primary_path: str | Path,
180
+ model_name: str,
181
+ result_type: str = "evaluation",
182
+ volume_manager: BeamVolumeManager | None = None,
183
+ evaluation_manager: BeamEvaluationManager | None = None,
184
+ ) -> bool:
185
+ """
186
+ Save results with multiple backup strategies.
187
+
188
+ Args:
189
+ results: Results dictionary to save
190
+ primary_path: Primary save location
191
+ model_name: Model name for filename generation
192
+ result_type: Type of results (evaluation, benchmark, etc.)
193
+ volume_manager: Optional volume manager for Beam storage
194
+ evaluation_manager: Optional evaluation manager for specialized storage
195
+
196
+ Returns:
197
+ True if saved successfully to at least one location
198
+ """
199
+ success_count = 0
200
+ safe_name = get_safe_model_name(model_name)
201
+
202
+ # Save to primary location
203
+ try:
204
+ primary = Path(primary_path)
205
+ primary.mkdir(parents=True, exist_ok=True)
206
+ filename = f"{result_type}_{safe_name}.json"
207
+ filepath = primary / filename
208
+
209
+ with filepath.open("w") as f:
210
+ json.dump(results, f, indent=2, default=str)
211
+
212
+ logger.info(f"💾 Saved {result_type} results to: {filepath}")
213
+ success_count += 1
214
+ except Exception as e:
215
+ logger.warning(f"⚠️ Failed to save to primary location: {e}")
216
+
217
+ # Save to Beam volume if available
218
+ if volume_manager:
219
+ try:
220
+ volume_path = Path(volume_manager.mount_path) / f"{result_type}_results"
221
+ volume_path.mkdir(parents=True, exist_ok=True)
222
+ filename = f"{result_type}_{safe_name}.json"
223
+ filepath = volume_path / filename
224
+
225
+ with filepath.open("w") as f:
226
+ json.dump(results, f, indent=2, default=str)
227
+
228
+ logger.info(f"💾 Saved {result_type} results to Beam volume: {filepath}")
229
+ success_count += 1
230
+ except Exception as e:
231
+ logger.warning(f"⚠️ Failed to save to Beam volume: {e}")
232
+
233
+ # Save via evaluation manager if available and appropriate
234
+ if evaluation_manager and result_type == "evaluation":
235
+ try:
236
+ success = evaluation_manager.save_evaluation_results(model_name, results)
237
+ if success:
238
+ logger.info(f"💾 Saved via evaluation manager for {model_name}")
239
+ success_count += 1
240
+ except Exception as e:
241
+ logger.warning(f"⚠️ Failed to save via evaluation manager: {e}")
242
+
243
+ return success_count > 0
244
+
245
+
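A minimal sketch of saving results to the primary directory plus any configured backups (the model name and payload are placeholders):

    from distiller.utils import save_results_with_backup

    dummy_results = {"example_metric": 0.0}  # placeholder payload, not real numbers
    save_results_with_backup(
        dummy_results,
        primary_path="evaluation_results",
        model_name="code_model2vec_example",
        result_type="evaluation",
    )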
246
+ def load_existing_results(
247
+ model_name: str,
248
+ result_type: str = "evaluation",
249
+ search_paths: list[str | Path] | None = None,
250
+ volume_manager: BeamVolumeManager | None = None,
251
+ evaluation_manager: BeamEvaluationManager | None = None,
252
+ ) -> dict[str, Any] | None:
253
+ """
254
+ Load existing results from multiple possible locations.
255
+
256
+ Args:
257
+ model_name: Model name to search for
258
+ result_type: Type of results to load
259
+ search_paths: Additional paths to search
260
+ volume_manager: Optional volume manager
261
+ evaluation_manager: Optional evaluation manager
262
+
263
+ Returns:
264
+ Results dictionary if found, None otherwise
265
+ """
266
+ safe_name = get_safe_model_name(model_name)
267
+ filename = f"{result_type}_{safe_name}.json"
268
+
269
+ # Search in provided paths
270
+ if search_paths:
271
+ for search_path in search_paths:
272
+ filepath = Path(search_path) / filename
273
+ if filepath.exists():
274
+ try:
275
+ with filepath.open("r") as f:
276
+ results = json.load(f)
277
+ logger.info(f"📂 Loaded existing {result_type} results from: {filepath}")
278
+ return results
279
+ except Exception as e:
280
+ logger.warning(f"⚠️ Failed to load from {filepath}: {e}")
281
+
282
+ # Search in Beam volume
283
+ if volume_manager:
284
+ volume_path = Path(volume_manager.mount_path) / f"{result_type}_results" / filename
285
+ if volume_path.exists():
286
+ try:
287
+ with volume_path.open("r") as f:
288
+ results = json.load(f)
289
+ logger.info(f"📂 Loaded existing {result_type} results from Beam volume: {volume_path}")
290
+ return results
291
+ except Exception as e:
292
+ logger.warning(f"⚠️ Failed to load from Beam volume: {e}")
293
+
294
+ # Try evaluation manager
295
+ if evaluation_manager and result_type == "evaluation":
296
+ try:
297
+ results = evaluation_manager.load_evaluation_results(model_name)
298
+ if results:
299
+ logger.info(f"📂 Loaded existing {result_type} results via evaluation manager")
300
+ return results
301
+ except Exception as e:
302
+ logger.warning(f"⚠️ Failed to load via evaluation manager: {e}")
303
+
304
+ logger.info(f"ℹ️ No existing {result_type} results found for {model_name}")
305
+ return None
306
+
307
+
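A minimal sketch of reusing cached results before re-running an evaluation (the model name and search path are placeholders):

    from distiller.utils import load_existing_results

    cached = load_existing_results(
        "code_model2vec_example",
        result_type="evaluation",
        search_paths=["evaluation_results"],
    )
    if cached is None:
        ...  # run the evaluation here, then persist it with save_results_with_backup(...)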
308
+ # =============================================================================
309
+ # WORKFLOW HELPERS
310
+ # =============================================================================
311
+
312
+
313
+ def print_workflow_summary(
314
+ workflow_name: str,
315
+ total_items: int,
316
+ processed_items: int,
317
+ skipped_items: int,
318
+ execution_time: float | None = None,
319
+ ) -> None:
320
+ """Print a standardized workflow summary."""
321
+ logger.info(f"\n✅ {workflow_name} complete!")
322
+ logger.info(f"📊 Total items: {total_items}")
323
+ logger.info(f"✨ Newly processed: {processed_items}")
324
+ logger.info(f"⏭️ Skipped (already done): {skipped_items}")
325
+
326
+ if execution_time:
327
+ logger.info(f"⏱️ Execution time: {execution_time:.2f} seconds")
328
+
329
+
330
+ def check_existing_results(
331
+ items: list[str],
332
+ result_type: str,
333
+ search_paths: list[str | Path] | None = None,
334
+ volume_manager: BeamVolumeManager | None = None,
335
+ ) -> tuple[list[str], list[str]]:
336
+ """
337
+ Check which items already have results and which need processing.
338
+
339
+ Args:
340
+ items: List of items (model names, etc.) to check
341
+ result_type: Type of results to check for
342
+ search_paths: Paths to search for existing results
343
+ volume_manager: Optional volume manager
344
+
345
+ Returns:
346
+ Tuple of (items_to_process, items_to_skip)
347
+ """
348
+ to_process = []
349
+ to_skip = []
350
+
351
+ for item in items:
352
+ existing = load_existing_results(item, result_type, search_paths, volume_manager)
353
+ if existing:
354
+ to_skip.append(item)
355
+ else:
356
+ to_process.append(item)
357
+
358
+ return to_process, to_skip
359
+
360
+
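A minimal sketch of the skip/process split followed by the standard summary (the model names are placeholders):

    from distiller.utils import check_existing_results, print_workflow_summary

    models = ["model_a", "model_b"]
    to_process, to_skip = check_existing_results(models, result_type="evaluation")
    print_workflow_summary(
        "Evaluation",
        total_items=len(models),
        processed_items=len(to_process),
        skipped_items=len(to_skip),
    )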
361
+ # =============================================================================
362
+ # INITIALIZATION
363
+ # =============================================================================
364
+
365
+
366
+ def initialize_distiller_logging(level: int = logging.INFO) -> None:
367
+ """Initialize logging for distiller package."""
368
+ setup_logging(level)
369
+ logger.info("🚀 Distiller package initialized")
370
+
371
+
372
+ # Ensure logging is set up when module is imported
373
+ initialize_distiller_logging()
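Because logging is configured at import time (initialize_distiller_logging above), downstream modules only need a plain import to pick it up; a minimal sketch, with an illustrative selection of names:

    from distiller.utils import BeamContext, discover_simplified_models, save_results_with_backup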