Sarthak committed · Commit 7837959 · Parent(s): 37196da

chore: update dependencies and configuration for improved training

This commit updates the model configuration in `.codemap.yml` to use a lighter version of the model. It also adds new dependencies such as `jinja2`, `joblib`, `rich`, and `safetensors` to `pyproject.toml` and `uv.lock`, and replaces `tokenlearn` with `tokenizers`. The report has been regenerated to reflect updated model performance metrics, and the dataset configuration now supports an optimized dataset during training.
- .codemap.yml +1 -1
- REPORT.md +18 -90
- patches/model2vec.patch +0 -39
- patches/tokenlearn.patch +0 -25
- pyproject.toml +14 -3
- src/distiller/__main__.py +52 -2
- src/distiller/analyze.py +1 -1
- src/distiller/config.py +7 -1
- src/distiller/dataset.py +659 -0
- src/distiller/distill.py +345 -194
- src/distiller/patch_utils.py +0 -276
- uv.lock +21 -55
.codemap.yml
CHANGED
@@ -5,7 +5,7 @@
 # LLM Configuration - Controls which model is used for AI operations
 llm:
   # Format: "provider:model-name", e.g., "openai:gpt-4o", "anthropic:claude-3-opus"
-  model: "google-gla:gemini-2.0-flash"
+  model: "google-gla:gemini-2.0-flash-lite"
   temperature: 0.5 # Lower for more deterministic outputs, higher for creativity
   max_input_tokens: 1000000 # Maximum tokens in input
   max_output_tokens: 10000 # Maximum tokens in responses
REPORT.md
CHANGED
@@ -28,8 +28,8 @@ This report presents a comprehensive analysis of Model2Vec distillation experime
 | code_model2vec_all_MiniLM_L6_v2 | [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 0.7385 | 0.7049 | 0.7910 | 🥈 2nd |
 | code_model2vec_jina_embeddings_v2_base_code | [jina-embeddings-v2-base-code](https://huggingface.co/jina-embeddings-v2-base-code) | 0.7381 | 0.6996 | 0.8130 | 🥉 3rd |
 | code_model2vec_paraphrase_MiniLM_L6_v2 | [sentence-transformers/paraphrase-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2) | 0.7013 | 0.6638 | 0.7665 | #4 |
-
-
+| code_model2vec_Reason_ModernColBERT | [lightonai/Reason-ModernColBERT](https://huggingface.co/lightonai/Reason-ModernColBERT) | 0.6598 | 0.6228 | 0.7260 | #5 |
+| code_model2vec_all_mpnet_base_v2_fine_tuned | [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) | 0.5347 | 0.4875 | 0.6200 | #6 |
 | code_model2vec_bge_m3 | [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) | 0.4863 | 0.4439 | 0.5514 | #7 |
 | code_model2vec_jina_embeddings_v3 | [jinaai/jina-embeddings-v3](https://huggingface.co/jinaai/jina-embeddings-v3) | 0.4755 | 0.4416 | 0.5456 | #8 |
 | code_model2vec_nomic_embed_text_v2_moe | [nomic-ai/nomic-embed-text-v2-moe](https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe) | 0.4532 | 0.4275 | 0.5094 | #9 |
@@ -50,8 +50,8 @@ Our distilled models exhibit consistent architectural characteristics across dif
 | all_MiniLM_L6_v2 | 29,525 | 7.6M | 256 | 14.4MB |
 | jina_embeddings_v2_base_code | 61,053 | 15.6M | 256 | 29.8MB |
 | paraphrase_MiniLM_L6_v2 | 29,525 | 7.6M | 256 | 14.4MB |
-| all_mpnet_base_v2_fine_tuned | 77,316 | 19.8M | 256 | 75.5MB |
 | Reason_ModernColBERT | 50,254 | 12.9M | 256 | 24.5MB |
+| all_mpnet_base_v2_fine_tuned | 29,528 | 7.6M | 256 | 28.8MB |
 | bge_m3 | 249,999 | 64.0M | 256 | 122.1MB |
 | jina_embeddings_v3 | 249,999 | 64.0M | 256 | 122.1MB |
 | nomic_embed_text_v2_moe | 249,999 | 64.0M | 256 | 122.1MB |
@@ -69,9 +69,9 @@ Our distilled models exhibit consistent architectural characteristics across dif
 #### Key Insights from Model Specifications:
 
-- **Vocabulary Consistency**: All models use vocabulary sizes ranging from 29,525 to 249,999 tokens (avg:
-- **Parameter Efficiency**: Models range from 7.6M to 64.0M parameters (avg:
-- **Storage Efficiency**: Disk usage ranges from 14.4MB to 122.1MB (avg:
+- **Vocabulary Consistency**: All models use vocabulary sizes ranging from 29,525 to 249,999 tokens (avg: 101,087)
+- **Parameter Efficiency**: Models range from 7.6M to 64.0M parameters (avg: 25.9M)
+- **Storage Efficiency**: Disk usage ranges from 14.4MB to 122.1MB (avg: 50.4MB)
 - **Embedding Dimensions**: Consistent 256 dimensions across all models (optimized for efficiency)
 
@@ -81,85 +81,13 @@ Our distilled models exhibit consistent architectural characteristics across dif
 - **Best Teacher Model**: code_model2vec_all_mpnet_base_v2 (NDCG@10: 0.7387)
 - **Least Effective Teacher**: code_model2vec_codebert_base (NDCG@10: 0.2779)
 - **Performance Range**: 62.4% difference between best and worst
-- **Average Performance**: 0.
+- **Average Performance**: 0.5190 NDCG@10
 
 ## 🎯 Language Performance Radar Charts
 
 ### Best Model vs Peer Models Comparison
 
-
-*Comparative view showing how the best simplified distillation model performs against top peer models across programming languages.*
-
-### Individual Model Performance by Language
-
-#### code_model2vec_all_mpnet_base_v2 (Teacher: [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)) - NDCG@10: 0.7387
-
-#### code_model2vec_all_MiniLM_L6_v2 (Teacher: [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)) - NDCG@10: 0.7385
-
-#### code_model2vec_jina_embeddings_v2_base_code (Teacher: [jina-embeddings-v2-base-code](https://huggingface.co/jina-embeddings-v2-base-code)) - NDCG@10: 0.7381
-
-#### code_model2vec_paraphrase_MiniLM_L6_v2 (Teacher: [sentence-transformers/paraphrase-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2)) - NDCG@10: 0.7013
-
-#### code_model2vec_all_mpnet_base_v2_fine_tuned (Teacher: [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)) - NDCG@10: 0.6906
-
-#### code_model2vec_Reason_ModernColBERT (Teacher: [lightonai/Reason-ModernColBERT](https://huggingface.co/lightonai/Reason-ModernColBERT)) - NDCG@10: 0.6598
-
-#### code_model2vec_bge_m3 (Teacher: [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3)) - NDCG@10: 0.4863
-
-#### code_model2vec_jina_embeddings_v3 (Teacher: [jinaai/jina-embeddings-v3](https://huggingface.co/jinaai/jina-embeddings-v3)) - NDCG@10: 0.4755
-
-#### code_model2vec_nomic_embed_text_v2_moe (Teacher: [nomic-ai/nomic-embed-text-v2-moe](https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe)) - NDCG@10: 0.4532
-
-#### code_model2vec_gte_Qwen2_1.5B_instruct (Teacher: [Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct)) - NDCG@10: 0.4238
-
-#### code_model2vec_Qodo_Embed_1_1.5B (Teacher: [Qodo/Qodo-Embed-1-1.5B](https://huggingface.co/Qodo/Qodo-Embed-1-1.5B)) - NDCG@10: 0.4101
-
-#### code_model2vec_graphcodebert_base (Teacher: [microsoft/codebert-base](https://huggingface.co/microsoft/codebert-base)) - NDCG@10: 0.3420
-
-#### code_model2vec_Linq_Embed_Mistral (Teacher: [Linq-AI-Research/Linq-Embed-Mistral](https://huggingface.co/Linq-AI-Research/Linq-Embed-Mistral)) - NDCG@10: 0.2868
-
-#### code_model2vec_codebert_base (Teacher: [microsoft/codebert-base](https://huggingface.co/microsoft/codebert-base)) - NDCG@10: 0.2779
-
-
-## 🏆 Peer Model Comparison
-
-
-*Comparison with established code-specialized embedding models using actual evaluation results.*
-
-### Complete Model Ranking
-
 | Rank | Model | Type | NDCG@10 | MRR | Recall@5 |
 |------|-------|------|---------|-----|----------|
 | 1 | Alibaba-NLP/gte-Qwen2-1.5B-instruct | General | 0.9729 | 0.9676 | 0.9825 |
@@ -180,10 +108,10 @@ Our distilled models exhibit consistent architectural characteristics across dif
 | 16 | code_model2vec_all_MiniLM_L6_v2 | **🔥 Simplified Distillation** | 0.7385 | 0.7049 | 0.7910 |
 | 17 | code_model2vec_jina_embeddings_v2_base_code | **🔥 Simplified Distillation** | 0.7381 | 0.6996 | 0.8130 |
 | 18 | code_model2vec_paraphrase_MiniLM_L6_v2 | **🔥 Simplified Distillation** | 0.7013 | 0.6638 | 0.7665 |
-| 19 |
-| 20 |
-| 21 |
-| 22 |
+| 19 | code_model2vec_Reason_ModernColBERT | **🔥 Simplified Distillation** | 0.6598 | 0.6228 | 0.7260 |
+| 20 | potion-multilingual-128M | Model2Vec | 0.6124 | 0.5683 | 0.7017 |
+| 21 | huggingface/CodeBERTa-small-v1 | Code-Specific | 0.5903 | 0.5350 | 0.6779 |
+| 22 | code_model2vec_all_mpnet_base_v2_fine_tuned | **🎓 Fine-tuned Distillation** | 0.5347 | 0.4875 | 0.6200 |
 | 23 | Salesforce/codet5-base | Code-Specific | 0.4872 | 0.4500 | 0.5742 |
 | 24 | code_model2vec_bge_m3 | **🔥 Simplified Distillation** | 0.4863 | 0.4439 | 0.5514 |
 | 25 | code_model2vec_jina_embeddings_v3 | **🔥 Simplified Distillation** | 0.4755 | 0.4416 | 0.5456 |
@@ -243,12 +171,12 @@ Our distilled models exhibit consistent architectural characteristics across dif
 
 | Language | Best Model Performance | Average Performance | Language Difficulty |
 |----------|------------------------|--------------------|--------------------|
-| Go | 0.9780 | 0.
-| Java | 0.9921 | 0.
-| Javascript | 0.9550 | 0.
-| Php | 1.0000 | 0.
-| Python | 1.0000 | 0.
-| Ruby | 0.9493 | 0.
+| Go | 0.9780 | 0.6923 | Easy |
+| Java | 0.9921 | 0.6545 | Easy |
+| Javascript | 0.9550 | 0.5831 | Easy |
+| Php | 1.0000 | 0.6325 | Easy |
+| Python | 1.0000 | 0.8599 | Easy |
+| Ruby | 0.9493 | 0.6333 | Easy |
 
 
 ## 🎯 Conclusions and Recommendations
@@ -302,5 +230,5 @@ Based on the evaluation results across all simplified distillation models:
 
 ---
 
-*Report generated on 2025-05-31
+*Report generated on 2025-05-31 21:07:06 using automated analysis pipeline.*
 *For questions about methodology or results, please refer to the CodeSearchNet documentation.*
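The REPORT.md tables above rank models by NDCG@10, MRR, and Recall@5. As context for the headline metric, here is a minimal sketch of NDCG@10 under a binary-relevance assumption (one gold code snippet per query); it is not the repository's evaluation code.

```python
# Minimal NDCG@10 sketch for binary relevance; illustrative only.
import math


def ndcg_at_10(ranked_relevance: list[int]) -> float:
    """ranked_relevance: relevance (0/1) of each retrieved item, best rank first."""
    gains = ranked_relevance[:10]
    dcg = sum(rel / math.log2(rank + 2) for rank, rel in enumerate(gains))
    ideal = sorted(ranked_relevance, reverse=True)[:10]
    idcg = sum(rel / math.log2(rank + 2) for rank, rel in enumerate(ideal))
    return dcg / idcg if idcg > 0 else 0.0


# Example: the gold snippet is retrieved at rank 3.
print(ndcg_at_10([0, 0, 1, 0, 0]))  # ≈ 0.5
```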
patches/model2vec.patch
DELETED
@@ -1,39 +0,0 @@
---- a/model2vec/train/base.py
-+++ b/model2vec/train/base.py
-@@ -35,7 +35,7 @@ class FinetunableStaticModel(nn.Module):
-         )
-         self.vectors = vectors.float()
-
--        self.embeddings = nn.Embedding.from_pretrained(vectors.clone(), freeze=False, padding_idx=pad_id)
-+        self.embeddings = nn.Embedding.from_pretrained(self.vectors.clone(), freeze=False, padding_idx=pad_id)
-         self.head = self.construct_head()
-         self.w = self.construct_weights()
-         self.tokenizer = tokenizer
---- a/model2vec/distill/distillation.py
-+++ b/model2vec/distill/distillation.py
-@@ -137,7 +137,10 @@ def distill_from_model(
-     # Get the language from the model card.
-     try:
-         info = model_info(model_name)
--        language = info.cardData.get("language", None)
-+        if info is not None and hasattr(info, 'cardData') and info.cardData is not None:
-+            language = info.cardData.get("language", None)
-+        else:
-+            language = None
-     except RepositoryNotFoundError:
-         logger.info("No model info found for the model. Setting language to None.")
-         language = None
---- a/model2vec/distill/inference.py
-+++ b/model2vec/distill/inference.py
-@@ -109,5 +109,12 @@ def create_embeddings(
-         out_tokens.extend([Token(x, False) for x in tokens])
-     out_weights = np.stack(intermediate_weights)
-
-+    # Validate token-vector consistency to prevent failures
-+    if len(out_tokens) != out_weights.shape[0]:
-+        logger.warning(f"Token-vector mismatch: {len(out_tokens)} tokens vs {out_weights.shape[0]} vectors. Truncating to prevent failure.")
-+        min_count = min(len(out_tokens), out_weights.shape[0])
-+        out_tokens = out_tokens[:min_count]
-+        out_weights = out_weights[:min_count]
-+
-     return out_tokens, out_weights
patches/tokenlearn.patch
DELETED
@@ -1,25 +0,0 @@
---- a/tokenlearn/pretrain.py
-+++ b/tokenlearn/pretrain.py
-@@ -38,7 +38,10 @@ class FinetunableStaticModel(nn.Module):
-         """Run the model using input IDs."""
-         input_ids = input_ids.view(-1)
-         input_ids = input_ids[input_ids != self.pad_token_id]
--        w = self.w[input_ids]
-+        # Fix for index out of bounds issue
-+        # Clamp input_ids to valid range to prevent IndexError during training
-+        valid_input_ids = torch.clamp(input_ids, 0, self.w.shape[0] - 1)
-+        w = self.w[valid_input_ids]
-         return self.sub_forward(w)
-
-     def forward(self, x):
-@@ -46,7 +49,10 @@ class FinetunableStaticModel(nn.Module):
-         # Add a small epsilon to avoid division by zero
-         length = zeros.sum(1) + 1e-16
--        embedded = self.embeddings(input_ids)
-+        # Fix for embedding index out of bounds issue
-+        # Clamp input_ids to valid embedding range
-+        valid_input_ids = torch.clamp(input_ids, 0, self.embeddings.num_embeddings - 1)
-+        embedded = self.embeddings(valid_input_ids)
-         # Zero out the padding
-         embedded = torch.bmm(w[:, None, :], embedded).squeeze(1)
-         # Simulate actual mean
pyproject.toml
CHANGED
@@ -19,24 +19,31 @@ dependencies = [
     "flash-attn>=2.7.4.post1",
     "hatchling>=1.27.0",
     "iso639>=0.1.4",
+    "jinja2>=3.0.0",
+    "joblib>=1.0.0",
     "kaleido==1.0.0rc13",
     "lightning>=2.5.1.post0",
     "matplotlib>=3.10.3",
-    "
+    "more-itertools>=10.5.0",
     "mteb>=1.14.15",
     "numpy>=1.26.4",
     "plotly>=6.1.1",
     "psutil>=7.0.0",
     "pydantic>=2.11.5",
     "requests>=2.32.3",
+    "rich>=10.0.0",
+    "safetensors>=0.3.0",
     "scikit-learn>=1.6.1",
     "seaborn>=0.13.2",
     "sentence-transformers>=4.1.0",
     "setuptools>=80.8.0",
+    "skops>=0.11.0",
     "smart-open[s3]>=7.1.0",
     "statsmodels>=0.14.4",
-    "
+    "tokenizers>=0.20",
     "torch>=2.7.0",
+    "transformers<=4.52.1",
+    "tqdm>=4.65.0",
     "typer>=0.16.0",
 ]
 
@@ -78,7 +85,9 @@ exclude = [
     "__pycache__",
     "build",
     "dist",
-    "vendor"
+    "vendor",
+    "src/distiller/model2vec",
+    "src/distiller/tokenlearn"
 ]
 
 [tool.ruff.lint]
@@ -114,6 +123,8 @@ ignore = [
     "E501", # Line too long
     "PLR2004",
     "RUF001",
+    "D100", # Missing docstring in public module
+    "D101", # Missing docstring in public class
 ]
 
 [tool.ruff.lint.mccabe]
src/distiller/__main__.py
CHANGED
@@ -17,12 +17,41 @@ def distill(
     train: Annotated[bool, typer.Option(help="Enable advanced training (CodeSearchNet fine-tuning)")] = False,
     teacher_models: Annotated[list[str] | None, typer.Option(help="Specific teacher models to distill")] = None,
     pca_dims: Annotated[int | None, typer.Option(help="PCA dimensions (uses config default if not specified)")] = None,
+    clear_cache: Annotated[
+        bool, typer.Option(help="Clear HuggingFace cache for problematic models before distillation")
+    ] = False,
+    clear_checkpoints: Annotated[
+        bool, typer.Option(help="Clear tokenlearn checkpoints to force fresh featurization and training")
+    ] = False,
+    skip_ptr: Annotated[
+        bool, typer.Option("--skip-ptr", help="Skip post-training re-regularization (PCA + SIF weighting) step")
+    ] = False,
+    use_optimized_dataset: Annotated[
+        bool,
+        typer.Option(
+            "--use-optimized-dataset", help="Use the pre-created optimized dataset from code_model2vec/dataset"
+        ),
+    ] = False,
+    dataset_path: Annotated[
+        str | None,
+        typer.Option("--dataset-path", help="Path to custom dataset directory (defaults to code_model2vec/dataset)"),
+    ] = None,
 ) -> None:
     """Run unified Model2Vec distillation with optional training."""
     from .distill import main as distill_main
 
-    # Call the distill main function with arguments
-    distill_main(
+    # Call the distill main function with all arguments
+    distill_main(
+        use_beam,
+        train,
+        teacher_models,
+        pca_dims,
+        clear_cache,
+        clear_checkpoints,
+        skip_ptr,
+        use_optimized_dataset,
+        dataset_path,
+    )
 
 
 @app.command()
@@ -53,5 +82,26 @@ def analyze(
     analyze_main(results_dir or "code_model2vec/evaluation_results", model_name, output, export_csv)
 
 
+@app.command()
+def dataset(
+    max_samples_per_lang: Annotated[int, typer.Option(help="Maximum samples per language")] = 50000,
+    min_doc_words: Annotated[int, typer.Option(help="Minimum words in documentation")] = 3,
+    max_doc_words: Annotated[int, typer.Option(help="Maximum words in documentation")] = 100,
+    min_code_chars: Annotated[int, typer.Option(help="Minimum characters in code")] = 50,
+    max_code_chars: Annotated[int, typer.Option(help="Maximum characters in code")] = 2000,
+    output_dir: Annotated[str | None, typer.Option(help="Output directory for dataset")] = None,
+    simple_format: Annotated[
+        bool, typer.Option(help="Create only simple format (not multiple training formats)")
+    ] = False,
+) -> None:
+    """Create optimized training dataset from CodeSearchNet for code search tasks."""
+    from .dataset import main as dataset_main
+
+    # Call the dataset main function with arguments
+    dataset_main(
+        max_samples_per_lang, min_doc_words, max_doc_words, min_code_chars, max_code_chars, output_dir, simple_format
+    )
+
+
 if __name__ == "__main__":
     app()
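A quick way to exercise the new `dataset` sub-command is Typer's test runner. The snippet below is a hypothetical smoke test, not part of the commit; it assumes the package is installed so that the `app` object in `src/distiller/__main__.py` is importable as `distiller.__main__`.

```python
# Hypothetical smoke test for the `dataset` command added in this commit.
from typer.testing import CliRunner

from distiller.__main__ import app

runner = CliRunner()
# Small run: 1,000 samples per language, single-format output.
result = runner.invoke(app, ["dataset", "--max-samples-per-lang", "1000", "--simple-format"])
print(result.exit_code)  # 0 on success
```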
src/distiller/analyze.py
CHANGED
@@ -510,7 +510,7 @@ class CodeSearchNetAnalyzer:
 
         try:
             # Try to load the model and get specifications
-            from model2vec import StaticModel
+            from distiller.model2vec import StaticModel
 
             model = StaticModel.from_pretrained(str(model_dir))
 
src/distiller/config.py
CHANGED
@@ -212,13 +212,19 @@ class DistillationConfig(BaseModel):
     # Tokenlearn-specific parameters (POTION approach)
     tokenlearn_dataset: str = "sentence-transformers/codesearchnet"  # Dataset for tokenlearn featurization
     tokenlearn_dataset_name: str = "pair"  # Use 'pair' configuration (only available config)
-    tokenlearn_text_key: str =
+    tokenlearn_text_key: str = (
+        "combined_text"  # Text field to use from the dataset ('combined_text' for doc-code pairs)
+    )
     tokenlearn_timeout_featurize: int = 21600  # 6 hour timeout for featurization (dataset needs ~5 hours)
     tokenlearn_timeout_train: int = 7200  # 2 hour timeout for training
 
     # Post-training configuration
     skip_post_training_regularization: bool = False  # Skip PCA + SIF re-regularization step
 
+    # Dataset configuration
+    use_optimized_dataset: bool = True  # Use the pre-created optimized dataset from dataset.py
+    custom_dataset_path: str | None = "code_model2vec/dataset"  # Path to custom dataset directory
+
 
 distillation_config = DistillationConfig()
 
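The two new `DistillationConfig` fields are presumably what lets the distillation path switch between the raw CodeSearchNet stream and the pre-built parquet dataset. The sketch below is illustrative only; the helper name is hypothetical and not the actual code in `distill.py`.

```python
# Illustrative only: how the new config fields could steer dataset selection.
from pathlib import Path

import pandas as pd

from distiller.config import distillation_config


def load_training_texts() -> list[str] | None:
    """Return texts from the optimized parquet dataset, or None to fall back to CodeSearchNet loading."""
    if not distillation_config.use_optimized_dataset or not distillation_config.custom_dataset_path:
        return None
    train_path = Path(distillation_config.custom_dataset_path) / "train.parquet"
    if not train_path.exists():
        return None
    # The "text" column is the query + fenced code block produced by dataset.py.
    return pd.read_parquet(train_path)["text"].tolist()
```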
src/distiller/dataset.py
ADDED
@@ -0,0 +1,659 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Custom Dataset Generation for Code-Specialized Model Training.
|
3 |
+
|
4 |
+
This module creates optimized training datasets from CodeSearchNet that are specifically
|
5 |
+
designed to improve performance on code search evaluation tasks.
|
6 |
+
|
7 |
+
Features:
|
8 |
+
- High-quality doc-code pairs optimized for retrieval
|
9 |
+
- Balanced sampling across programming languages
|
10 |
+
- Multiple training formats (doc-only, code-only, combined)
|
11 |
+
- Quality filtering and data cleaning
|
12 |
+
- Train/test/eval splits with proper stratification
|
13 |
+
- Efficient parquet format output
|
14 |
+
"""
|
15 |
+
|
16 |
+
import json
|
17 |
+
import logging
|
18 |
+
import time
|
19 |
+
from pathlib import Path
|
20 |
+
from typing import Annotated, Any
|
21 |
+
|
22 |
+
import pandas as pd
|
23 |
+
import typer
|
24 |
+
from datasets import load_dataset
|
25 |
+
from tqdm import tqdm
|
26 |
+
|
27 |
+
from .config import languages_config
|
28 |
+
|
29 |
+
# Set up logging
|
30 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
31 |
+
logger = logging.getLogger(__name__)
|
32 |
+
|
33 |
+
# Dataset configuration
|
34 |
+
DATASET_OUTPUT_DIR = Path("code_model2vec/dataset")
|
35 |
+
DEFAULT_MAX_SAMPLES_PER_LANG = 50000
|
36 |
+
DEFAULT_MIN_DOC_WORDS = 3
|
37 |
+
DEFAULT_MAX_DOC_WORDS = 100
|
38 |
+
DEFAULT_MIN_CODE_CHARS = 50
|
39 |
+
DEFAULT_MAX_CODE_CHARS = 2000
|
40 |
+
|
41 |
+
|
42 |
+
def create_optimized_dataset(
|
43 |
+
max_samples_per_lang: int = DEFAULT_MAX_SAMPLES_PER_LANG,
|
44 |
+
min_doc_words: int = DEFAULT_MIN_DOC_WORDS,
|
45 |
+
max_doc_words: int = DEFAULT_MAX_DOC_WORDS,
|
46 |
+
min_code_chars: int = DEFAULT_MIN_CODE_CHARS,
|
47 |
+
max_code_chars: int = DEFAULT_MAX_CODE_CHARS,
|
48 |
+
output_dir: Path | None = None,
|
49 |
+
create_multiple_formats: bool = True,
|
50 |
+
) -> dict[str, Any]:
|
51 |
+
"""
|
52 |
+
Create optimized training dataset from CodeSearchNet for code search tasks.
|
53 |
+
|
54 |
+
Args:
|
55 |
+
max_samples_per_lang: Maximum samples per programming language
|
56 |
+
min_doc_words: Minimum words in documentation
|
57 |
+
max_doc_words: Maximum words in documentation
|
58 |
+
min_code_chars: Minimum characters in code
|
59 |
+
max_code_chars: Maximum characters in code
|
60 |
+
output_dir: Output directory for dataset
|
61 |
+
create_multiple_formats: Create multiple training formats
|
62 |
+
|
63 |
+
Returns:
|
64 |
+
Dictionary with dataset statistics and file paths
|
65 |
+
"""
|
66 |
+
output_dir = DATASET_OUTPUT_DIR if output_dir is None else Path(output_dir)
|
67 |
+
|
68 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
69 |
+
|
70 |
+
logger.info("🚀 Starting optimized CodeSearchNet dataset creation...")
|
71 |
+
logger.info(f"📁 Output directory: {output_dir}")
|
72 |
+
logger.info(f"📊 Target: {max_samples_per_lang} samples per language")
|
73 |
+
logger.info(f"🔍 Languages: {', '.join(languages_config.all)}")
|
74 |
+
|
75 |
+
start_time = time.time()
|
76 |
+
all_samples = []
|
77 |
+
language_stats = {}
|
78 |
+
|
79 |
+
# Process each programming language
|
80 |
+
for language in languages_config.all:
|
81 |
+
logger.info(f"\n🔄 Processing {language}...")
|
82 |
+
|
83 |
+
try:
|
84 |
+
# Load CodeSearchNet dataset for this language
|
85 |
+
dataset = load_dataset("code_search_net", language, split="train", trust_remote_code=True)
|
86 |
+
|
87 |
+
language_samples = []
|
88 |
+
processed_count = 0
|
89 |
+
quality_filtered = 0
|
90 |
+
|
91 |
+
# Process examples with quality filtering
|
92 |
+
for example in tqdm(dataset, desc=f"Processing {language}", unit="examples"):
|
93 |
+
processed_count += 1
|
94 |
+
|
95 |
+
# Extract documentation and code
|
96 |
+
doc_string = example.get("func_documentation_string", "").strip()
|
97 |
+
code_string = example.get("func_code_string", "").strip()
|
98 |
+
func_name = example.get("func_name", "").strip()
|
99 |
+
|
100 |
+
# Quality filters
|
101 |
+
if not _passes_quality_filters(
|
102 |
+
doc_string, code_string, func_name, min_doc_words, max_doc_words, min_code_chars, max_code_chars
|
103 |
+
):
|
104 |
+
continue
|
105 |
+
|
106 |
+
quality_filtered += 1
|
107 |
+
|
108 |
+
# Create optimized training samples
|
109 |
+
samples = _create_training_samples(
|
110 |
+
doc_string, code_string, func_name, language, create_multiple_formats
|
111 |
+
)
|
112 |
+
language_samples.extend(samples)
|
113 |
+
|
114 |
+
# Stop if we have enough samples
|
115 |
+
if len(language_samples) >= max_samples_per_lang:
|
116 |
+
break
|
117 |
+
|
118 |
+
# Truncate to exact target size
|
119 |
+
language_samples = language_samples[:max_samples_per_lang]
|
120 |
+
all_samples.extend(language_samples)
|
121 |
+
|
122 |
+
# Track statistics
|
123 |
+
language_stats[language] = {
|
124 |
+
"processed": processed_count,
|
125 |
+
"quality_filtered": quality_filtered,
|
126 |
+
"final_samples": len(language_samples),
|
127 |
+
"quality_rate": quality_filtered / processed_count if processed_count > 0 else 0,
|
128 |
+
}
|
129 |
+
|
130 |
+
logger.info(f"✅ {language}: {len(language_samples)} samples from {quality_filtered} quality examples")
|
131 |
+
|
132 |
+
except Exception:
|
133 |
+
logger.exception(f"❌ Failed to process {language}")
|
134 |
+
language_stats[language] = {
|
135 |
+
"processed": 0,
|
136 |
+
"quality_filtered": 0,
|
137 |
+
"final_samples": 0,
|
138 |
+
"quality_rate": 0.0,
|
139 |
+
}
|
140 |
+
|
141 |
+
# Create DataFrame
|
142 |
+
logger.info(f"\n📊 Creating dataset with {len(all_samples)} total samples...")
|
143 |
+
df = pd.DataFrame(all_samples)
|
144 |
+
|
145 |
+
# Create stratified splits
|
146 |
+
train_df, test_df = _create_stratified_splits(df)
|
147 |
+
|
148 |
+
# Save datasets
|
149 |
+
dataset_files = _save_datasets(output_dir, train_df, test_df)
|
150 |
+
|
151 |
+
# Save metadata
|
152 |
+
metadata = {
|
153 |
+
"creation_time": time.strftime("%Y-%m-%d %H:%M:%S"),
|
154 |
+
"total_samples": len(all_samples),
|
155 |
+
"train_samples": len(train_df),
|
156 |
+
"test_samples": len(test_df),
|
157 |
+
"languages": languages_config.all,
|
158 |
+
"language_stats": language_stats,
|
159 |
+
"quality_filters": {
|
160 |
+
"min_doc_words": min_doc_words,
|
161 |
+
"max_doc_words": max_doc_words,
|
162 |
+
"min_code_chars": min_code_chars,
|
163 |
+
"max_code_chars": max_code_chars,
|
164 |
+
},
|
165 |
+
"files": dataset_files,
|
166 |
+
"processing_time": time.time() - start_time,
|
167 |
+
}
|
168 |
+
|
169 |
+
metadata_file = output_dir / "metadata.json"
|
170 |
+
with metadata_file.open("w") as f:
|
171 |
+
json.dump(metadata, f, indent=2)
|
172 |
+
|
173 |
+
logger.info(f"\n🎉 Dataset creation completed in {metadata['processing_time']:.2f} seconds!")
|
174 |
+
logger.info("📊 Final statistics:")
|
175 |
+
logger.info(f" - Total samples: {metadata['total_samples']}")
|
176 |
+
logger.info(f" - Train: {metadata['train_samples']}")
|
177 |
+
logger.info(f" - Test: {metadata['test_samples']}")
|
178 |
+
logger.info(f"💾 Metadata saved to: {metadata_file}")
|
179 |
+
|
180 |
+
return metadata
|
181 |
+
|
182 |
+
|
183 |
+
def _passes_quality_filters(
|
184 |
+
doc_string: str,
|
185 |
+
code_string: str,
|
186 |
+
func_name: str,
|
187 |
+
min_doc_words: int,
|
188 |
+
max_doc_words: int,
|
189 |
+
min_code_chars: int,
|
190 |
+
max_code_chars: int,
|
191 |
+
) -> bool:
|
192 |
+
"""Apply quality filters optimized for code retrieval following RAG best practices."""
|
193 |
+
# Basic existence checks
|
194 |
+
if not doc_string or not code_string or not func_name:
|
195 |
+
return False
|
196 |
+
|
197 |
+
# Documentation quality filters for code retrieval
|
198 |
+
doc_words = len(doc_string.split())
|
199 |
+
if doc_words < min_doc_words or doc_words > max_doc_words:
|
200 |
+
return False
|
201 |
+
|
202 |
+
# Code quality filters
|
203 |
+
code_length = len(code_string)
|
204 |
+
if code_length < min_code_chars or code_length > max_code_chars:
|
205 |
+
return False
|
206 |
+
|
207 |
+
# Content quality filters for code retrieval
|
208 |
+
doc_lower = doc_string.lower()
|
209 |
+
code_string.lower()
|
210 |
+
|
211 |
+
# Skip low-quality documentation (expanded for code context)
|
212 |
+
skip_phrases = [
|
213 |
+
"todo",
|
214 |
+
"fixme",
|
215 |
+
"hack",
|
216 |
+
"temp",
|
217 |
+
"test",
|
218 |
+
"placeholder",
|
219 |
+
"not implemented",
|
220 |
+
"coming soon",
|
221 |
+
"tbd",
|
222 |
+
"xxx",
|
223 |
+
"broken",
|
224 |
+
"deprecated",
|
225 |
+
"legacy",
|
226 |
+
"old version",
|
227 |
+
"outdated",
|
228 |
+
]
|
229 |
+
if any(phrase in doc_lower for phrase in skip_phrases):
|
230 |
+
return False
|
231 |
+
|
232 |
+
# Ensure meaningful documentation for code retrieval
|
233 |
+
if func_name.lower() in doc_lower and doc_words < 5:
|
234 |
+
return False
|
235 |
+
|
236 |
+
# Code structure validation (more comprehensive for retrieval)
|
237 |
+
has_function = any(
|
238 |
+
pattern in code_string for pattern in ["def ", "function ", "class ", "public ", "private ", "static "]
|
239 |
+
)
|
240 |
+
if not has_function:
|
241 |
+
return False
|
242 |
+
|
243 |
+
# Skip trivial or incomplete code
|
244 |
+
trivial_code_patterns = [
|
245 |
+
"pass",
|
246 |
+
"return None",
|
247 |
+
"return;",
|
248 |
+
"throw new Error",
|
249 |
+
"# TODO",
|
250 |
+
"// TODO",
|
251 |
+
"print(",
|
252 |
+
"console.log(",
|
253 |
+
]
|
254 |
+
if any(pattern in code_string for pattern in trivial_code_patterns) and len(code_string) < 100:
|
255 |
+
return False
|
256 |
+
|
257 |
+
# Ensure documentation describes functionality (not just naming)
|
258 |
+
generic_docs = [
|
259 |
+
"returns a value",
|
260 |
+
"does something",
|
261 |
+
"helper function",
|
262 |
+
"utility method",
|
263 |
+
"this function",
|
264 |
+
"this method",
|
265 |
+
"returns the result",
|
266 |
+
"performs operation",
|
267 |
+
]
|
268 |
+
if any(generic in doc_lower for generic in generic_docs):
|
269 |
+
return False
|
270 |
+
|
271 |
+
# Ensure documentation has descriptive content for retrieval
|
272 |
+
descriptive_words = [
|
273 |
+
"parse",
|
274 |
+
"convert",
|
275 |
+
"transform",
|
276 |
+
"calculate",
|
277 |
+
"validate",
|
278 |
+
"format",
|
279 |
+
"filter",
|
280 |
+
"sort",
|
281 |
+
"search",
|
282 |
+
"find",
|
283 |
+
"create",
|
284 |
+
"generate",
|
285 |
+
"process",
|
286 |
+
"handle",
|
287 |
+
"manage",
|
288 |
+
"update",
|
289 |
+
"modify",
|
290 |
+
"remove",
|
291 |
+
"delete",
|
292 |
+
"add",
|
293 |
+
]
|
294 |
+
if not any(word in doc_lower for word in descriptive_words) and doc_words < 8:
|
295 |
+
return False
|
296 |
+
|
297 |
+
# Code-documentation alignment check (key for retrieval quality)
|
298 |
+
return _check_code_doc_alignment(doc_string, code_string, func_name)
|
299 |
+
|
300 |
+
|
301 |
+
def _check_code_doc_alignment(doc_string: str, code_string: str, func_name: str) -> bool:
|
302 |
+
"""Check if documentation and code are well-aligned for retrieval tasks."""
|
303 |
+
doc_lower = doc_string.lower()
|
304 |
+
code_lower = code_string.lower()
|
305 |
+
|
306 |
+
# Function name should relate to documentation
|
307 |
+
func_base = func_name.lower().replace("_", " ").replace("-", " ")
|
308 |
+
|
309 |
+
# Check for obvious mismatches
|
310 |
+
doc_has_return = any(word in doc_lower for word in ["return", "returns", "gives", "outputs"])
|
311 |
+
code_has_return = "return " in code_lower
|
312 |
+
|
313 |
+
# If doc mentions returning something, code should have returns
|
314 |
+
if doc_has_return and not code_has_return and len(code_string.split("\n")) > 3:
|
315 |
+
return False
|
316 |
+
|
317 |
+
# Check for parameter mentions alignment
|
318 |
+
any(word in doc_lower for word in ["parameter", "param", "argument", "input"])
|
319 |
+
"(" in func_name and func_name.count("(") == 1
|
320 |
+
|
321 |
+
# Basic semantic alignment
|
322 |
+
action_words = ["sort", "parse", "convert", "validate", "format", "filter", "search", "calculate"]
|
323 |
+
doc_actions = [word for word in action_words if word in doc_lower]
|
324 |
+
[word for word in action_words if word in code_lower or word in func_base]
|
325 |
+
|
326 |
+
# If documentation mentions specific actions, code or function name should reflect them
|
327 |
+
return not (doc_actions and not any(action in code_lower or action in func_base for action in doc_actions))
|
328 |
+
|
329 |
+
|
330 |
+
def _create_training_samples(
|
331 |
+
doc_string: str,
|
332 |
+
code_string: str,
|
333 |
+
func_name: str,
|
334 |
+
language: str,
|
335 |
+
create_multiple_formats: bool,
|
336 |
+
) -> list[dict[str, Any]]:
|
337 |
+
"""Create optimized training samples for code retrieval with proper training schema."""
|
338 |
+
samples = []
|
339 |
+
|
340 |
+
if create_multiple_formats:
|
341 |
+
# Format 1: Documentation query → Code (direct evaluation format)
|
342 |
+
query_1 = doc_string
|
343 |
+
text_1 = _format_training_text(query_1, code_string, language)
|
344 |
+
samples.append(
|
345 |
+
{
|
346 |
+
"language": language,
|
347 |
+
"query": query_1,
|
348 |
+
"code": code_string,
|
349 |
+
"text": text_1,
|
350 |
+
}
|
351 |
+
)
|
352 |
+
|
353 |
+
# Format 2: How-to query (realistic developer search)
|
354 |
+
query_2 = _generate_how_to_query(doc_string, func_name, language)
|
355 |
+
text_2 = _format_training_text(query_2, code_string, language)
|
356 |
+
samples.append(
|
357 |
+
{
|
358 |
+
"language": language,
|
359 |
+
"query": query_2,
|
360 |
+
"code": code_string,
|
361 |
+
"text": text_2,
|
362 |
+
}
|
363 |
+
)
|
364 |
+
|
365 |
+
# Format 3: Functional requirement query
|
366 |
+
query_3 = _generate_functional_query(doc_string, func_name)
|
367 |
+
text_3 = _format_training_text(query_3, code_string, language)
|
368 |
+
samples.append(
|
369 |
+
{
|
370 |
+
"language": language,
|
371 |
+
"query": query_3,
|
372 |
+
"code": code_string,
|
373 |
+
"text": text_3,
|
374 |
+
}
|
375 |
+
)
|
376 |
+
|
377 |
+
# Format 4: Implementation-specific query
|
378 |
+
query_4 = _generate_implementation_query(doc_string, func_name, language)
|
379 |
+
text_4 = _format_training_text(query_4, code_string, language)
|
380 |
+
samples.append(
|
381 |
+
{
|
382 |
+
"language": language,
|
383 |
+
"query": query_4,
|
384 |
+
"code": code_string,
|
385 |
+
"text": text_4,
|
386 |
+
}
|
387 |
+
)
|
388 |
+
|
389 |
+
else:
|
390 |
+
# Simple format - direct documentation to code
|
391 |
+
query = doc_string
|
392 |
+
text = _format_training_text(query, code_string, language)
|
393 |
+
samples.append(
|
394 |
+
{
|
395 |
+
"language": language,
|
396 |
+
"query": query,
|
397 |
+
"code": code_string,
|
398 |
+
"text": text,
|
399 |
+
}
|
400 |
+
)
|
401 |
+
|
402 |
+
return samples
|
403 |
+
|
404 |
+
|
405 |
+
def _format_training_text(query: str, code: str, language: str) -> str:
|
406 |
+
"""Format query and code into a single training text chunk with markdown-style code blocks."""
|
407 |
+
# Clean up query but preserve internal code formatting
|
408 |
+
query_clean = query.strip()
|
409 |
+
code_clean = code.strip()
|
410 |
+
|
411 |
+
# Create training text with proper markdown format and newline separation
|
412 |
+
# Structure: query + empty line + markdown code block with language
|
413 |
+
return f"{query_clean}\n\n```{language}\n{code_clean}\n```"
|
414 |
+
|
415 |
+
|
416 |
+
def _generate_how_to_query(doc_string: str, func_name: str, language: str) -> str:
|
417 |
+
"""Generate realistic 'how to' queries that developers might actually search for."""
|
418 |
+
# Extract key action words from documentation
|
419 |
+
doc_lower = doc_string.lower()
|
420 |
+
func_lower = func_name.lower()
|
421 |
+
|
422 |
+
# Common developer query patterns
|
423 |
+
if "sort" in doc_lower or "sort" in func_lower:
|
424 |
+
return f"How to sort data in {language}"
|
425 |
+
if "parse" in doc_lower or "parse" in func_lower:
|
426 |
+
return f"How to parse data in {language}"
|
427 |
+
if "convert" in doc_lower or "transform" in doc_lower or "convert" in func_lower:
|
428 |
+
return f"How to convert data in {language}"
|
429 |
+
if "validate" in doc_lower or "check" in doc_lower or "validate" in func_lower:
|
430 |
+
return f"How to validate input in {language}"
|
431 |
+
if "calculate" in doc_lower or "compute" in doc_lower or "calc" in func_lower:
|
432 |
+
return f"How to calculate values in {language}"
|
433 |
+
if "format" in doc_lower or "format" in func_lower:
|
434 |
+
return f"How to format output in {language}"
|
435 |
+
if "filter" in doc_lower or "filter" in func_lower:
|
436 |
+
return f"How to filter data in {language}"
|
437 |
+
if "search" in doc_lower or "find" in doc_lower or "search" in func_lower or "find" in func_lower:
|
438 |
+
return f"How to search through data in {language}"
|
439 |
+
# Use function name for more specific queries
|
440 |
+
if func_name and len(func_name) > 2:
|
441 |
+
# Extract meaningful words from function name
|
442 |
+
func_words = func_name.replace("_", " ").replace("-", " ").strip()
|
443 |
+
if func_words:
|
444 |
+
return f"How to {func_words.lower()} in {language}"
|
445 |
+
# Fallback to more generic query
|
446 |
+
action = doc_string.split()[0] if doc_string.split() else "implement"
|
447 |
+
return f"How to {action.lower()} in {language}"
|
448 |
+
|
449 |
+
|
450 |
+
def _generate_functional_query(doc_string: str, func_name: str) -> str:
|
451 |
+
"""Generate functional requirement queries focusing on what the code accomplishes."""
|
452 |
+
# Clean up documentation to create natural query
|
453 |
+
doc_clean = doc_string.strip().rstrip(".")
|
454 |
+
|
455 |
+
# Transform to question format
|
456 |
+
if doc_clean.startswith(("Returns", "Return")):
|
457 |
+
return f"Function that {doc_clean.lower()}"
|
458 |
+
if doc_clean.startswith(("Creates", "Create")):
|
459 |
+
return f"Code to {doc_clean.lower()}"
|
460 |
+
if doc_clean.startswith(("Checks", "Check")):
|
461 |
+
return f"Function to {doc_clean.lower()}"
|
462 |
+
|
463 |
+
# Use function name to enhance the query if available
|
464 |
+
if func_name and len(func_name) > 2:
|
465 |
+
func_words = func_name.replace("_", " ").replace("-", " ").strip()
|
466 |
+
if func_words and len(doc_clean) < 30: # Only for short docs
|
467 |
+
return f"Function named '{func_name}' that {doc_clean.lower()}"
|
468 |
+
|
469 |
+
return f"Implementation that {doc_clean.lower()}"
|
470 |
+
|
471 |
+
|
472 |
+
def _generate_implementation_query(doc_string: str, func_name: str, language: str) -> str:
|
473 |
+
"""Generate implementation-specific queries with technical details."""
|
474 |
+
doc_lower = doc_string.lower()
|
475 |
+
func_lower = func_name.lower() if func_name else ""
|
476 |
+
|
477 |
+
# Add language-specific implementation details
|
478 |
+
if language == "python":
|
479 |
+
if "list" in doc_lower or "array" in doc_lower or "list" in func_lower:
|
480 |
+
return f"Python function to {doc_string.lower()} using lists"
|
481 |
+
if "dict" in doc_lower or "hash" in doc_lower or "dict" in func_lower:
|
482 |
+
return f"Python function to {doc_string.lower()} using dictionaries"
|
483 |
+
# Include function name for context if available
|
484 |
+
if func_name and len(func_name) > 2:
|
485 |
+
return f"Python implementation of {func_name}: {doc_string.lower()}"
|
486 |
+
return f"Python implementation: {doc_string.lower()}"
|
487 |
+
if language == "java":
|
488 |
+
func_suffix = f" ({func_name})" if func_name and len(func_name) > 2 else ""
|
489 |
+
return f"Java method to {doc_string.lower()}{func_suffix}"
|
490 |
+
if language == "javascript":
|
491 |
+
func_suffix = f" ({func_name})" if func_name and len(func_name) > 2 else ""
|
492 |
+
return f"JavaScript function to {doc_string.lower()}{func_suffix}"
|
493 |
+
if language == "php":
|
494 |
+
func_suffix = f" ({func_name})" if func_name and len(func_name) > 2 else ""
|
495 |
+
return f"PHP function to {doc_string.lower()}{func_suffix}"
|
496 |
+
if language == "ruby":
|
497 |
+
func_suffix = f" ({func_name})" if func_name and len(func_name) > 2 else ""
|
498 |
+
return f"Ruby method to {doc_string.lower()}{func_suffix}"
|
499 |
+
if language == "go":
|
500 |
+
func_suffix = f" ({func_name})" if func_name and len(func_name) > 2 else ""
|
501 |
+
return f"Go function to {doc_string.lower()}{func_suffix}"
|
502 |
+
return f"{language} code to {doc_string.lower()}"
|
503 |
+
|
504 |
+
|
505 |
+
def _create_stratified_splits(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
|
506 |
+
"""Create stratified train/test splits preserving language distribution."""
|
507 |
+
# Define split ratios
|
508 |
+
train_ratio = 0.9
|
509 |
+
# test_ratio = 0.1 (remainder)
|
510 |
+
|
511 |
+
train_dfs = []
|
512 |
+
test_dfs = []
|
513 |
+
|
514 |
+
# Split by language to ensure balanced representation
|
515 |
+
for language in df["language"].unique():
|
516 |
+
lang_df = df[df["language"] == language].copy()
|
517 |
+
n_samples = len(lang_df)
|
518 |
+
|
519 |
+
# Calculate split sizes
|
520 |
+
n_train = int(n_samples * train_ratio)
|
521 |
+
# Remainder goes to test
|
522 |
+
|
523 |
+
# Shuffle and split
|
524 |
+
lang_df = lang_df.sample(frac=1, random_state=42).reset_index(drop=True)
|
525 |
+
|
526 |
+
train_dfs.append(lang_df[:n_train])
|
527 |
+
test_dfs.append(lang_df[n_train:])
|
528 |
+
|
529 |
+
# Combine and shuffle again
|
530 |
+
train_df = pd.concat(train_dfs, ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
|
531 |
+
test_df = pd.concat(test_dfs, ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
|
532 |
+
|
533 |
+
logger.info("📊 Created stratified splits:")
|
534 |
+
logger.info(f" - Train: {len(train_df)} samples")
|
535 |
+
logger.info(f" - Test: {len(test_df)} samples")
|
536 |
+
|
537 |
+
return train_df, test_df
|
538 |
+
|
539 |
+
|
540 |
+
def _save_datasets(
|
541 |
+
output_dir: Path,
|
542 |
+
train_df: pd.DataFrame,
|
543 |
+
test_df: pd.DataFrame,
|
544 |
+
) -> dict[str, str]:
|
545 |
+
"""Save datasets in parquet format with compression."""
|
546 |
+
dataset_files = {}
|
547 |
+
|
548 |
+
# Save each split
|
549 |
+
for split_name, df in [("train", train_df), ("test", test_df)]:
|
550 |
+
filepath = output_dir / f"{split_name}.parquet"
|
551 |
+
df.to_parquet(
|
552 |
+
filepath,
|
553 |
+
compression="snappy",
|
554 |
+
index=False,
|
555 |
+
)
|
556 |
+
dataset_files[split_name] = str(filepath)
|
557 |
+
logger.info(f"💾 Saved {split_name}: {len(df)} samples → {filepath}")
|
558 |
+
|
559 |
+
# Also save a combined dataset for convenience
|
560 |
+
combined_df = pd.concat([train_df, test_df], ignore_index=True)
|
561 |
+
combined_filepath = output_dir / "combined.parquet"
|
562 |
+
combined_df.to_parquet(combined_filepath, compression="snappy", index=False)
|
563 |
+
dataset_files["combined"] = str(combined_filepath)
|
564 |
+
logger.info(f"💾 Saved combined: {len(combined_df)} samples → {combined_filepath}")
|
565 |
+
|
566 |
+
return dataset_files
|
567 |
+
|
568 |
+
|
569 |
+
def load_optimized_dataset(
|
570 |
+
output_dir: Path | None = None,
|
571 |
+
split: str = "train",
|
572 |
+
) -> pd.DataFrame:
|
573 |
+
"""
|
574 |
+
Load a previously created optimized dataset.
|
575 |
+
|
576 |
+
Args:
|
577 |
+
output_dir: Directory containing the dataset files
|
578 |
+
        split: Which split to load ('train', 'test', 'combined')

    Returns:
        DataFrame with the requested dataset split
    """
    if output_dir is None:
        output_dir = DATASET_OUTPUT_DIR

    filepath = output_dir / f"{split}.parquet"

    if not filepath.exists():
        available_files = list(output_dir.glob("*.parquet"))
        available_splits = [f.stem for f in available_files]
        msg = f"Dataset split '{split}' not found at {filepath}. Available splits: {available_splits}"
        raise FileNotFoundError(msg)

    logger.info(f"📂 Loading {split} dataset from {filepath}")
    df = pd.read_parquet(filepath)
    logger.info(f"✅ Loaded {len(df)} samples")

    return df


def main(
    max_samples_per_lang: Annotated[
        int, typer.Option(help="Maximum samples per language")
    ] = DEFAULT_MAX_SAMPLES_PER_LANG,
    min_doc_words: Annotated[int, typer.Option(help="Minimum words in documentation")] = DEFAULT_MIN_DOC_WORDS,
    max_doc_words: Annotated[int, typer.Option(help="Maximum words in documentation")] = DEFAULT_MAX_DOC_WORDS,
    min_code_chars: Annotated[int, typer.Option(help="Minimum characters in code")] = DEFAULT_MIN_CODE_CHARS,
    max_code_chars: Annotated[int, typer.Option(help="Maximum characters in code")] = DEFAULT_MAX_CODE_CHARS,
    output_dir: Annotated[str | None, typer.Option(help="Output directory for dataset")] = None,
    simple_format: Annotated[
        bool, typer.Option(help="Create only simple format (not multiple training formats)")
    ] = False,
) -> None:
    """Create optimized training dataset from CodeSearchNet for code search tasks."""
    logger.info("🚀 Starting optimized dataset creation command...")

    # Convert output_dir to Path if provided
    output_path = Path(output_dir) if output_dir else None

    # Create the dataset
    try:
        metadata = create_optimized_dataset(
            max_samples_per_lang=max_samples_per_lang,
            min_doc_words=min_doc_words,
            max_doc_words=max_doc_words,
            min_code_chars=min_code_chars,
            max_code_chars=max_code_chars,
            output_dir=output_path,
            create_multiple_formats=not simple_format,
        )

        logger.info("✅ Dataset creation completed successfully!")
        logger.info(f"📁 Output directory: {metadata['files']['train']}")

        # Print summary statistics
        print("\n" + "=" * 60)
        print("📊 DATASET CREATION SUMMARY")
        print("=" * 60)
        print(f"Total samples created: {metadata['total_samples']:,}")
        print(f"Processing time: {metadata['processing_time']:.2f} seconds")
        print("\nSplit distribution:")
        print(f"  • Train: {metadata['train_samples']:,} samples")
        print(f"  • Test: {metadata['test_samples']:,} samples")

        print("\nLanguage distribution:")
        for lang, stats in metadata["language_stats"].items():
            if "error" not in stats:
                print(f"  • {lang}: {stats['final_samples']:,} samples ({stats['quality_rate']:.1%} quality rate)")

        print(f"\nDataset files saved to: {output_path or DATASET_OUTPUT_DIR}")
        print("=" * 60)

    except Exception as e:
        logger.exception("❌ Dataset creation failed")
        raise typer.Exit(1) from e


if __name__ == "__main__":
    typer.run(main)
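The splits written by this command are plain Parquet files, so they can be sanity-checked outside the distiller package as well. A minimal sketch, assuming the default `code_model2vec/dataset` output directory and the `text`/`language` columns produced by `create_optimized_dataset`:

```python
from pathlib import Path

import pandas as pd

# Default output location of the dataset command above (adjust if --output-dir was used).
dataset_dir = Path("code_model2vec/dataset")

# Each split is saved as <split>.parquet, mirroring load_optimized_dataset().
train_df = pd.read_parquet(dataset_dir / "train.parquet")

print(f"{len(train_df)} training samples")
print(train_df["language"].value_counts())  # per-language sample counts
print(train_df["text"].iloc[0][:200])       # formatted query + code text used for training
```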
src/distiller/distill.py
CHANGED
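The hunks below replace the runtime patching and subprocess calls with direct calls into vendored `distiller.model2vec` / `distiller.tokenlearn` modules, and they keep the POTION-style post-training step (PCA followed by SIF re-weighting of the token embeddings). As a reminder of what that last step computes, SIF weighting in its usual form scales each token vector by a/(a + p(token)); a minimal sketch, where the smoothing constant 1e-3 and the toy counts are illustrative assumptions rather than values taken from this repository:

```python
import numpy as np

# Toy token frequencies (corpus counts) and token embeddings; both are illustrative.
token_counts = np.array([5000, 120, 3, 40], dtype=np.float64)
embeddings = np.random.default_rng(0).normal(size=(4, 256)).astype(np.float32)

# SIF weight per token: a / (a + p(token)), with p the relative frequency.
a = 1e-3
p = token_counts / token_counts.sum()
sif_weights = a / (a + p)

# Re-weight the embedding rows; frequent tokens are down-weighted.
reweighted = embeddings * sif_weights[:, None].astype(np.float32)
print(reweighted.shape)
```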
@@ -28,13 +28,14 @@ import time
from pathlib import Path
from typing import Annotated, Any

+import numpy as np
import torch
import typer
from beam import function
-from datasets import load_dataset
-from model2vec.distill import distill
from sentence_transformers import SentenceTransformer

+from distiller.model2vec.distill import distill
+
# Try to import flash_attn to check if it's available
from .beam_utils import (
    BeamCheckpointManager,

@@ -145,25 +146,6 @@ def load_model_with_flash_attention(model_path: str, device: str = "auto") -> Se
# =============================================================================


-def apply_local_patches() -> bool:
-    """Apply patches locally without requiring Beam utilities."""
-    try:
-        try:
-            from .patch_utils import apply_all_patches
-
-            patches_applied = apply_all_patches()
-            logger.info(f"Successfully applied {patches_applied} patches via patch_utils")
-            return True
-        except ImportError:
-            logger.warning("patch_utils not available, trying direct patching")
-
-            return False
-
-    except Exception as e:
-        logger.warning(f"Failed to apply patches: {e}")
-        return False
-
-
def get_current_config_hash(enable_training: bool) -> str:
    """Generate a hash of current configuration parameters for checkpoint validation."""
    import hashlib

@@ -217,22 +199,22 @@ def check_existing_final_model(teacher_name: str, enable_training: bool = False)
    model_name = f"code_model2vec_{teacher_name}"
    if enable_training:
        model_name += "_fine_tuned"
+    final_path = final_dir / model_name

-    if
+    if final_path.exists():
        # Check for essential model files
-        has_config = (
+        has_config = (final_path / "config.json").exists()
        has_model_file = any(
            [
-                (
-                (
-                (
+                (final_path / "model.safetensors").exists(),
+                (final_path / "model.bin").exists(),
+                (final_path / "pytorch_model.bin").exists(),
            ]
        )

        if has_config and has_model_file:
            logger.info(f"✅ Found existing final model: {teacher_name}{'_fine_tuned' if enable_training else ''}")
-            return str(
+            return str(final_path)

    return None

@@ -427,11 +409,65 @@ def simple_distillation(
    return None


+def load_optimized_dataset(
+    max_samples: int = 50000,
+    checkpoint_manager: BeamCheckpointManager | None = None,
+    dataset_path: str | None = None,
+) -> list[str]:
+    """Load our pre-created optimized dataset for tokenlearn training."""
+    from .dataset import DATASET_OUTPUT_DIR
+    from .dataset import load_optimized_dataset as load_dataset_func
+
+    # Use configuration if not provided as parameter
+    if dataset_path is None:
+        dataset_path = distillation_config.custom_dataset_path
+
+    dataset_dir = Path(dataset_path) if dataset_path else DATASET_OUTPUT_DIR
+
+    logger.info(f"🎯 Loading optimized dataset from {dataset_dir}")
+    logger.info(f"📊 Target samples: {max_samples}")
+
+    try:
+        # Load the training split of our optimized dataset
+        df = load_dataset_func(output_dir=dataset_dir, split="train")
+
+        # Extract the text column (which contains our formatted query + code)
+        texts = df["text"].tolist()
+
+        # Shuffle for better training distribution
+        import random
+
+        random.seed(42)
+        random.shuffle(texts)
+
+        # Limit to max_samples
+        if len(texts) > max_samples:
+            texts = texts[:max_samples]
+
+        logger.info(f"✅ Loaded {len(texts)} optimized training samples")
+
+        # Log language distribution
+        languages = df["language"].value_counts()
+        logger.info("📊 Language distribution:")
+        for lang, count in languages.items():
+            percentage = (count / len(df)) * 100
+            logger.info(f"  {lang}: {count} samples ({percentage:.1f}%)")
+
+        return texts
+
+    except Exception as e:
+        logger.warning(f"⚠️ Failed to load optimized dataset: {e}")
+        logger.info("🔄 Falling back to original CodeSearchNet loading...")
+        return load_codesearchnet_dataset(max_samples, checkpoint_manager)
+
+
def load_codesearchnet_dataset(
    max_samples: int = 50000,
    checkpoint_manager: BeamCheckpointManager | None = None,
) -> list[str]:
    """Load and format the CodeSearchNet dataset for token frequency computation."""
+    from datasets import load_dataset
+
    logger.info(f"Loading CodeSearchNet dataset from {codesearchnet_config.dataset_name}")
    logger.info(f"Limiting to {max_samples} samples for training efficiency")
    logger.info(f"Languages: {', '.join(languages_config.all)}")

@@ -482,6 +518,8 @@ def load_codesearchnet_dataset(

        try:
            # Load training split for the specific language (same format as evaluate.py)
+            from datasets import load_dataset
+
            dataset = load_dataset(
                codesearchnet_config.dataset_name,
                language,

@@ -709,8 +747,33 @@ def compute_token_frequencies_for_sif(
    logger.info("📊 Computing token frequencies for SIF weighting...")

    try:
-        # Load
+        # Load dataset to compute frequencies (limited sample for efficiency)
+        if distillation_config.use_optimized_dataset:
+            # Use the custom optimized dataset
+            from .dataset import load_optimized_dataset as load_custom_dataset
+
+            custom_dataset_dir = (
+                Path(distillation_config.custom_dataset_path)
+                if distillation_config.custom_dataset_path
+                else Path("code_model2vec/dataset")
+            )
+
+            if custom_dataset_dir.exists() and (custom_dataset_dir / "train.parquet").exists():
+                train_df = load_custom_dataset(output_dir=custom_dataset_dir, split="train")
+                # Sample a subset for frequency computation
+                sample_size = min(10000, len(train_df))
+                train_df_sample = train_df.sample(n=sample_size, random_state=42)
+                dataset_texts = train_df_sample["text"].tolist()
+                logger.info(f"📊 Using {len(dataset_texts)} samples from custom optimized dataset")
+            else:
+                # Fallback to original dataset loading
+                dataset_texts = load_codesearchnet_dataset(max_samples=10000)
+                logger.info(
+                    f"📊 Custom dataset not found, using original CodeSearchNet with {len(dataset_texts)} texts"
+                )
+        else:
+            dataset_texts = load_codesearchnet_dataset(max_samples=10000)
+            logger.info(f"📊 Using original CodeSearchNet with {len(dataset_texts)} texts")

        logger.info(f"📊 Computing frequencies on {len(dataset_texts)} texts...")

@@ -763,7 +826,6 @@ def apply_post_training_regularization(
    """
    import json

-    import numpy as np
    from sklearn.decomposition import PCA

    logger.info("🔧 Starting post-training re-regularization (POTION Step 4)")

@@ -836,7 +898,7 @@
    final_embeddings = embeddings_pca.astype(np.float32)

    # Create new model with updated embeddings
-    from model2vec.model import StaticModel
+    from distiller.model2vec.model import StaticModel

    # Save tokenizer and config from original model
    tokenizer = model.tokenizer

@@ -866,7 +928,6 @@ def tokenlearn_training(
    3. Tokenlearn training
    4. Post-training re-regularization (PCA + SIF weighting)
    """
-    import subprocess
    from pathlib import Path

    logger.info("🧪 Starting tokenlearn training (POTION approach)...")

@@ -914,6 +975,9 @@

    logger.info(f"📊 Using teacher model: {teacher_model_name}")

+    # Prepare dataset for tokenlearn featurization
+    dataset_path, dataset_name, text_key = _prepare_tokenlearn_dataset(persistent_tokenlearn_dir)
+
    # Check if featurization already completed (checkpoint detection)
    featurization_complete_marker = features_dir / ".featurization_complete"
    if featurization_complete_marker.exists() and verify_featurization_output(features_dir):

@@ -936,47 +1000,42 @@
    logger.info(f"📊 Using teacher model: {teacher_model_name}")

    try:
-        # Use
-            "tokenlearn.featurize",
-            "--model-name",
-            str(teacher_model_name),
-            "--output-dir",
-            str(features_dir),
-            "--dataset-path",
-            str(distillation_config.tokenlearn_dataset),
-            "--dataset-name",
-            str(distillation_config.tokenlearn_dataset_name),
-            "--dataset-split",
-            "train",
-            "--key",
-            str(distillation_config.tokenlearn_text_key),  # Use configured text field
-            "--batch-size",
-            "1024",  # Optimized batch size for A100-40G
-        ]
+        # Use direct function call instead of subprocess
+        from datasets import load_dataset
+
+        from distiller.tokenlearn.featurize import featurize

        logger.info("🔄 Running tokenlearn featurization...")
-        logger.info(
-        )
-        logger.info(f"📝 Text field: {distillation_config.tokenlearn_text_key}")
-        logger.info(f"Command: {' '.join(featurize_cmd)}")
-        print(f"\n🔄 Executing: {' '.join(featurize_cmd)}\n")
-        result = subprocess.run(  # noqa: S603
-            featurize_cmd,
-            text=True,
-            timeout=distillation_config.tokenlearn_timeout_featurize,
-            check=False,
-        )
+        logger.info(f"📊 Dataset: {dataset_path} (config: {dataset_name})")
+        logger.info(f"📝 Text field: {text_key}")

+        # Load the dataset
+        if dataset_name is None:
+            # For local JSON files, don't pass name parameter
+            dataset = load_dataset(
+                "json",
+                data_files=dataset_path,
+                split="train",
+                streaming=True,
+            )
+        else:
+            # For remote datasets with specific configurations
+            dataset = load_dataset(
+                dataset_path,
+                name=dataset_name,
+                split="train",
+                streaming=True,
+            )
+
+        # Call featurization function directly
+        featurize(
+            dataset=iter(dataset),
+            model=teacher_model,
+            output_dir=str(features_dir),
+            max_means=50000,  # IMPROVEMENT: Limit means to prevent overfitting
+            batch_size=512,  # IMPROVEMENT: Smaller batch for better gradients
+            text_key=text_key,
+        )

        logger.info("✅ Featurization completed successfully")

@@ -1025,65 +1084,74 @@
    logger.info("🔄 No valid training checkpoint found - starting training...")

    try:
-            "tokenlearn.train",
-            "--model-name",
-            str(teacher_model_name),
-            "--data-path",
-            str(features_dir),
-            "--save-path",
-            str(trained_dir),
-        ]
+        # Use direct function call instead of subprocess
+        from distiller.tokenlearn.train import train_model
+        from distiller.tokenlearn.utils import collect_means_and_texts

-        logger.info(
+        # IMPROVED APPROACH: Try optimized parameters first
+        logger.info("🚀 Attempting IMPROVED tokenlearn training with optimized parameters...")
+        logger.info("📊 Using smaller vocabulary and conservative PCA to prevent overfitting")

+        # Collect training data from features directory
+        paths = sorted(features_dir.glob("*.json"))
+        train_txt, train_vec = collect_means_and_texts(paths)
+
+        logger.info(f"📊 Collected {len(train_txt)} texts and {train_vec.shape[0]} vectors for training")
+
+        try:
+            # Try improved parameters first
+            trained_model = train_model(
+                model_name=str(teacher_model_name),
+                train_txt=train_txt,
+                train_vec=train_vec,
+                device="cuda" if torch.cuda.is_available() else "cpu",
+                vocab_size=25000,  # IMPROVEMENT: Smaller vocabulary to prevent overfitting
+                pca_dims=256,  # IMPROVEMENT: Conservative PCA dimensions
+            )

+            # Save the trained model
+            trained_model.save_pretrained(str(trained_dir))
+            logger.info("✅ IMPROVED tokenlearn training completed successfully")
+            training_complete_marker.touch()
+            logger.info(f"💾 Created improved training checkpoint: {training_complete_marker}")

-        if result.stdout:
-            logger.info(f"stdout: {result.stdout}")
+        except Exception as e:
+            logger.warning(f"⚠️ Improved training failed: {e}")
+            logger.info("🔄 Falling back to CONSERVATIVE tokenlearn training...")

-        #
+            # FALLBACK: Ultra-conservative training approach
+            try:
+                trained_model = train_model(
+                    model_name=str(teacher_model_name),
+                    train_txt=train_txt,
+                    train_vec=train_vec,
+                    device="cuda" if torch.cuda.is_available() else "cpu",
+                    vocab_size=15000,  # FALLBACK: Even smaller vocabulary
+                    pca_dims=128,  # FALLBACK: Smaller PCA dimensions
+                )
+
+                # Save the trained model
+                trained_model.save_pretrained(str(trained_dir))
+                logger.info("✅ Conservative tokenlearn training completed successfully")
+                training_complete_marker.touch()
+                logger.info(f"💾 Created conservative training checkpoint: {training_complete_marker}")
+
+            except Exception as e2:
+                logger.exception("❌ Conservative tokenlearn training also failed")
+                logger.exception("💥 All training approaches failed - check output above for details")

                # Create training marker to indicate we tried but failed
                training_fallback_marker = trained_dir / ".training_fallback"
                training_fallback_marker.touch()

-            logger.
-            msg = f"
-            raise RuntimeError(msg)
-        logger.error("💥 Tokenlearn training failed with different error")
-        msg = f"Tokenlearn training failed with return code: {result.returncode}"
-        raise RuntimeError(msg)
-        logger.info("✅ Tokenlearn training completed successfully")
-        # Create checkpoint marker to indicate training is complete
-        training_complete_marker.touch()
-        logger.info(f"💾 Created training checkpoint: {training_complete_marker}")
+                logger.exception("💥 Tokenlearn training failed completely")
+                msg = f"All tokenlearn training approaches failed: {e2}"
+                raise RuntimeError(msg) from e2

    except Exception as e:
-        logger.
-        logger.exception("💥
-        msg = f"
+        logger.warning("💥 All tokenlearn training approaches failed")
+        logger.exception("💥 All training approaches failed completely - cannot proceed")
+        msg = f"All training approaches failed: {e}"
        raise RuntimeError(msg) from e

    # Step 4: Load the trained model and apply post-training re-regularization

@@ -1098,7 +1166,7 @@
    raise RuntimeError(msg)

    try:
-        from model2vec.model import StaticModel
+        from distiller.model2vec.model import StaticModel

        # Load the trained model from tokenlearn
        trained_model_path = trained_dir / "model"

@@ -1213,12 +1281,13 @@ def distill_single_teacher(
    existing_final = check_existing_final_model(teacher_name, enable_training)
    if existing_final:
        logger.info(f"✅ Final model already exists: {teacher_name}{'_fine_tuned' if enable_training else ''}")
+        total_time = time.time() - start_time
        return {
            "teacher_model": teacher_model,
            "teacher_name": teacher_name,
            "status": "skipped_existing_final",
            "final_path": existing_final,
-            "distillation_time":
+            "distillation_time": total_time,
        }

    # Step 1.5: Sync existing checkpoints from Beam if using Beam utilities

@@ -1236,7 +1305,7 @@ def distill_single_teacher(
        logger.info(f"✅ Found existing base model: {teacher_name}")
        if enable_training:
            # Load base model for training
-            from model2vec.model import StaticModel
+            from distiller.model2vec.model import StaticModel

            base_model = StaticModel.from_pretrained(existing_base)
    elif use_beam_utilities:

@@ -1244,7 +1313,7 @@
        if synced:
            existing_base = str(base_dir)
            if enable_training:
-                from model2vec.model import StaticModel
+                from distiller.model2vec.model import StaticModel

                base_model = StaticModel.from_pretrained(existing_base)

@@ -1263,11 +1332,13 @@ def distill_single_teacher(
        base_model = simple_distillation(teacher_model, str(base_dir), pca_dims)

        if base_model is None:
+            total_time = time.time() - start_time
            return {
                "teacher_model": teacher_model,
                "teacher_name": teacher_name,
                "status": "failed_base_distillation",
                "error": "Simple distillation failed",
+                "distillation_time": total_time,
            }

    # Sync base model and checkpoints to Beam

@@ -1280,71 +1351,74 @@ def distill_single_teacher(

        existing_base = str(base_dir)

+    # Step 3: Handle final model creation
+    if enable_training and base_model is not None:
+        # Perform tokenlearn training (POTION approach)
+        logger.info(f"🧪 Starting tokenlearn training for {teacher_name}")
+
+        try:
+            # Load teacher model for training
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            teacher_st_model = load_model_with_flash_attention(teacher_model, device)
+
            # Perform tokenlearn training (POTION approach)
+            final_model = tokenlearn_training(
+                base_model,
+                teacher_st_model,
+                checkpoint_mgr,
+                skip_post_training_regularization=distillation_config.skip_post_training_regularization,
+            )

-        teacher_st_model = load_model_with_flash_attention(teacher_model, device)
-        # Perform tokenlearn training (POTION approach)
-        final_model = tokenlearn_training(
-            base_model,
-            teacher_st_model,
-            checkpoint_mgr,
-            skip_post_training_regularization=distillation_config.skip_post_training_regularization,
-        )
+            # Save final model
+            final_dir.mkdir(parents=True, exist_ok=True)
+            final_model.save_pretrained(str(final_dir))

+            # Sync final model and training checkpoints to Beam
+            if use_beam_utilities:
+                sync_model_to_beam(f"{teacher_name}_final", str(final_dir), use_beam_utilities)
+                if checkpoint_mgr:
+                    sync_checkpoints_to_beam(
+                        VOLUME_CONFIG.name, f"training_{teacher_name}", directories.checkpoints
+                    )

-        if checkpoint_mgr:
-            sync_checkpoints_to_beam(
-                VOLUME_CONFIG.name, f"training_{teacher_name}", directories.checkpoints
-            )
+            del teacher_st_model
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()

+        except RuntimeError as e:
+            # Training failed - clean up and return failure
+            logger.exception(f"❌ Training failed for {teacher_name}")
+
+            # Clean up teacher model if it was loaded
+            if "teacher_st_model" in locals():
                del teacher_st_model
-    except RuntimeError as e:
-        # Training failed - clean up and return failure
-        logger.exception(f"❌ Training failed for {teacher_name}")
-        # Clean up teacher model if it was loaded
-        if "teacher_st_model" in locals():
-            del teacher_st_model
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        return {
-            "teacher_model": teacher_model,
-            "teacher_name": teacher_name,
-            "status": "failed_training",
-            "error": f"Training failed: {e!s}",
-            "base_path": existing_base,  # Base model was created successfully
-        }
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()

-        }
+            total_time = time.time() - start_time
+            return {
+                "teacher_model": teacher_model,
+                "teacher_name": teacher_name,
+                "status": "failed_training",
+                "error": f"Training failed: {e!s}",
+                "base_path": existing_base,  # Base model was created successfully
+                "distillation_time": total_time,
+            }

+    else:
+        # Copy base to final (no training)
+        logger.info(f"📁 Copying base to final for {teacher_name}")
+        if not copy_base_to_final(teacher_name, enable_training):
+            total_time = time.time() - start_time
+            return {
+                "teacher_model": teacher_model,
+                "teacher_name": teacher_name,
+                "status": "failed_copy_to_final",
+                "error": "Failed to copy base to final",
+                "distillation_time": total_time,
+            }

+    total_time = time.time() - start_time
    return {
        "teacher_model": teacher_model,
        "teacher_name": teacher_name,

@@ -1357,11 +1431,13 @@ def distill_single_teacher(

    except Exception as e:
        logger.exception(f"❌ Failed to process {teacher_model}")
+        total_time = time.time() - start_time
        return {
            "teacher_model": teacher_model,
            "teacher_name": teacher_name,
            "status": "failed",
            "error": str(e),
+            "distillation_time": total_time,
        }


@@ -1382,13 +1458,6 @@ def run_local_distillation(
    if teacher_models is None:
        teacher_models = DEFAULT_TEACHER_MODELS

-    # Apply patches
-    patch_success = apply_local_patches()
-    if patch_success:
-        logger.info("✅ Successfully applied patches")
-    else:
-        logger.warning("⚠️ Failed to apply patches - some models may fail")
-
    results = {}
    successful_models = []

@@ -1468,13 +1537,6 @@ def _beam_distill_internal(
    clear_cache: bool = False,
) -> dict[str, Any]:
    """Shared internal implementation for beam distillation."""
-    # Apply patches
-    patch_success = apply_local_patches()
-    if patch_success:
-        logger.info("✅ Successfully applied patches")
-    else:
-        logger.warning("⚠️ Failed to apply patches - some models may fail")
-
    if teacher_models is None:
        teacher_models = DEFAULT_TEACHER_MODELS

@@ -1647,6 +1709,16 @@ def main(
    skip_ptr: Annotated[
        bool, typer.Option("--skip-ptr", help="Skip post-training re-regularization (PCA + SIF weighting) step")
    ] = False,
+    use_optimized_dataset: Annotated[
+        bool,
+        typer.Option(
+            "--use-optimized-dataset", help="Use the pre-created optimized dataset from code_model2vec/dataset"
+        ),
+    ] = False,
+    dataset_path: Annotated[
+        str | None,
+        typer.Option("--dataset-path", help="Path to custom dataset directory (defaults to code_model2vec/dataset)"),
+    ] = None,
) -> None:
    """Unified distillation command with optional training."""
    logger.info("🚀 Starting unified Model2Vec distillation workflow")

@@ -1656,6 +1728,13 @@ def main(
    if skip_ptr and train:
        logger.info("⏭️ Post-training re-regularization will be skipped (PCA + SIF weighting disabled)")

+    # Set dataset configuration
+    distillation_config.use_optimized_dataset = use_optimized_dataset
+    distillation_config.custom_dataset_path = dataset_path
+    if use_optimized_dataset and train:
+        dataset_source = dataset_path or "code_model2vec/dataset"
+        logger.info(f"🎯 Using optimized dataset from: {dataset_source}")
+
    logger.info(f"🎓 Training mode: {'Tokenlearn (POTION) training' if train else 'Basic distillation only'}")
    logger.info(f"☁️ Execution: {'Beam' if use_beam else 'Local'}")

@@ -1894,7 +1973,7 @@ def salesforce_model_distillation(
    logger.info("✅ Successfully loaded with SentenceTransformer method")

    # Now use Model2Vec's distill_from_model function directly
-    from model2vec.distill.distillation import distill_from_model
+    from distiller.model2vec.distill.distillation import distill_from_model

    distilled_model = distill_from_model(
        model=model,

@@ -2004,7 +2083,7 @@ def baai_bge_model_distillation(
    return None

    # Now use Model2Vec's distill_from_model function directly
-    from model2vec.distill.distillation import distill_from_model
+    from distiller.model2vec.distill.distillation import distill_from_model

    distilled_model = distill_from_model(
        model=model,

@@ -2090,5 +2169,77 @@ def verify_training_output(trained_dir: Path) -> bool:
    return False


+def _prepare_tokenlearn_dataset(tokenlearn_dir: Path) -> tuple[str, str | None, str]:
+    """
+    Prepare dataset for tokenlearn featurization.
+
+    Returns:
+        Tuple of (dataset_path, dataset_name, text_key) for tokenlearn
+    """
+    if distillation_config.use_optimized_dataset:
+        return _prepare_custom_dataset_for_tokenlearn(tokenlearn_dir)
+    return _prepare_original_dataset_for_tokenlearn()
+
+
+def _prepare_custom_dataset_for_tokenlearn(tokenlearn_dir: Path) -> tuple[str, str | None, str]:
+    """Prepare custom optimized dataset for tokenlearn featurization."""
+    logger.info("🎯 Preparing custom optimized dataset for tokenlearn...")
+
+    # Import the dataset module
+    from .dataset import create_optimized_dataset, load_optimized_dataset
+
+    # Define paths
+    custom_dataset_dir = (
+        Path(distillation_config.custom_dataset_path)
+        if distillation_config.custom_dataset_path
+        else Path("code_model2vec/dataset")
+    )
+    tokenlearn_dataset_dir = tokenlearn_dir / "custom_dataset"
+
+    # Check if we need to create the custom dataset
+    if not custom_dataset_dir.exists() or not (custom_dataset_dir / "train.parquet").exists():
+        logger.info("📊 Custom dataset not found - creating optimized dataset...")
+        create_optimized_dataset(
+            max_samples_per_lang=10000,  # Reasonable size for tokenlearn
+            output_dir=custom_dataset_dir,
+            create_multiple_formats=False,  # Use simple format for tokenlearn
+        )
+
+    # Load the custom dataset
+    logger.info(f"📂 Loading custom dataset from {custom_dataset_dir}")
+    train_df = load_optimized_dataset(output_dir=custom_dataset_dir, split="train")
+
+    # Prepare dataset for tokenlearn (save as JSON files that load_dataset can read)
+    tokenlearn_dataset_dir.mkdir(parents=True, exist_ok=True)
+
+    # Save as JSON file that tokenlearn can load with load_dataset()
+    train_json_path = tokenlearn_dataset_dir / "train.json"
+
+    # Create JSON lines format
+    import json
+
+    with train_json_path.open("w") as f:
+        for text in train_df["text"]:
+            json.dump({"text": text}, f)
+            f.write("\n")
+
+    logger.info(f"✅ Prepared custom dataset with {len(train_df)} samples for tokenlearn")
+    logger.info(f"💾 Saved JSON dataset to {train_json_path}")
+
+    # Return the JSON file path directly (not directory) and no config name for JSON loading
+    return str(train_json_path), None, "text"
+
+
+def _prepare_original_dataset_for_tokenlearn() -> tuple[str, str, str]:
+    """Prepare original CodeSearchNet dataset for tokenlearn featurization."""
+    logger.info("📊 Using original CodeSearchNet dataset for tokenlearn...")
+
+    return (
+        str(distillation_config.tokenlearn_dataset),  # "sentence-transformers/codesearchnet"
+        str(distillation_config.tokenlearn_dataset_name),  # "pair"
+        str(distillation_config.tokenlearn_text_key),  # "combined_text"
+    )
+
+
if __name__ == "__main__":
    typer.run(main)
|
src/distiller/patch_utils.py
DELETED
@@ -1,276 +0,0 @@
"""
Patch utilities for applying fixes to installed packages.

This module provides functionality to automatically apply all patches
from the patches directory to fix bugs in third-party libraries.
"""

import logging
import subprocess
import sys
from pathlib import Path

logger = logging.getLogger(__name__)


def find_patches_directory() -> Path:
    """Find the patches directory relative to the current script location."""
    # Go up from src/distiller/ to project root, then to patches/
    current_file = Path(__file__)
    project_root = current_file.parent.parent.parent  # Go up 3 levels: distiller -> src -> project_root
    patches_dir = project_root / "patches"

    if not patches_dir.exists():
        # Alternative: try relative to current working directory
        patches_dir = Path("patches")

    return patches_dir


def get_site_packages_path() -> Path:
    """Get the site-packages directory path."""
    import site

    # Try to get the site-packages from the current environment
    site_packages_dirs = site.getsitepackages()

    # Prefer the first site-packages directory
    if site_packages_dirs:
        return Path(site_packages_dirs[0])

    # Fallback: try to find it relative to Python executable
    python_path = Path(sys.executable)
    if python_path.name == "python" or python_path.name.startswith("python"):
        # Standard virtual environment structure
        venv_lib = python_path.parent.parent / "lib"
        for item in venv_lib.iterdir():
            if item.name.startswith("python"):
                site_packages = item / "site-packages"
                if site_packages.exists():
                    return site_packages

    # Last resort: use current directory
    return Path()


def apply_patch_file(patch_file: Path, target_dir: Path) -> bool:
    """
    Apply a single patch file to the target directory.

    Args:
        patch_file: Path to the .patch file
        target_dir: Target directory (usually site-packages)

    Returns:
        True if patch was applied successfully, False otherwise
    """
    try:
        logger.info(f"Applying patch: {patch_file.name}")

        # Check if patch is already applied
        if is_patch_already_applied(patch_file, target_dir):
            logger.info(f"Patch {patch_file.name} already applied")
            return True

        # Clean any duplicate validation code before applying
        if "model2vec.patch" in patch_file.name:
            clean_duplicate_validation_code(target_dir)

        # Use patch command with the following options:
        # -p1: strip 1 leading directory from paths
        # -d: change to directory before applying
        # -f: force (don't ask questions)
        # -N: don't reverse patches that appear to be already applied
        result = subprocess.run(  # noqa: S603
            ["patch", "-p1", "-d", str(target_dir), "-f", "-N"],  # noqa: S607
            input=patch_file.read_text(),
            text=True,
            capture_output=True,
            check=False,  # Don't raise exception on non-zero exit
        )

        if result.returncode == 0:
            logger.info(f"Successfully applied patch: {patch_file.name}")
            return True
        if "already applied" in result.stderr.lower() or "reversed" in result.stderr.lower():
            logger.info(f"Patch {patch_file.name} already applied")
            return True
        logger.warning(f"Failed to apply patch {patch_file.name}: {result.stderr}")
        return False

    except FileNotFoundError:
        logger.exception("'patch' command not found. Please install patch utility.")
        return False
    except Exception:
        logger.exception(f"Error applying patch {patch_file.name}")
        return False


def apply_all_patches() -> int:
    """
    Apply all patches from the patches directory.

    Returns:
        Number of patches successfully applied
    """
    patches_dir = find_patches_directory()

    if not patches_dir.exists():
        logger.warning(f"Patches directory not found: {patches_dir}")
        return 0

    # Find all .patch files
    patch_files = list(patches_dir.glob("*.patch"))

    if not patch_files:
        logger.info("No patch files found")
        return 0

    # Get target directory (site-packages)
    target_dir = get_site_packages_path()
    logger.info(f"Applying patches to: {target_dir}")

    # Clean any existing duplicates first
    clean_duplicate_validation_code(target_dir)

    success_count = 0

    # Sort patch files for consistent ordering
    for patch_file in sorted(patch_files):
        if apply_patch_file(patch_file, target_dir):
            success_count += 1

    logger.info(f"Applied {success_count}/{len(patch_files)} patches successfully")
    return success_count


def is_patch_already_applied(patch_file: Path, target_dir: Path) -> bool:
    """
    Check if a patch has already been applied by looking for specific markers.

    Args:
        patch_file: Path to the .patch file
        target_dir: Target directory (usually site-packages)

    Returns:
        True if patch appears to be already applied, False otherwise
    """
    try:
        # For model2vec.patch, check if the validation code is already present
        if "model2vec.patch" in patch_file.name:
            inference_file = target_dir / "model2vec" / "distill" / "inference.py"
            if inference_file.exists():
                inference_content = inference_file.read_text()
                # Check for the specific validation code we're adding
                if (
                    "Token-vector mismatch:" in inference_content
                    and "Truncating to prevent failure" in inference_content
                ):
                    # Also make sure it's in the right place (before return statement, not after)
                    lines = inference_content.split("\n")
                    for i, line in enumerate(lines):
                        if "return out_tokens, out_weights" in line:
                            # Check if validation code appears before this return
                            preceding_lines = lines[max(0, i - 10) : i]
                            if any("Token-vector mismatch:" in pline for pline in preceding_lines):
                                return True
                            break

        # For tokenlearn.patch, check if the indexing fix is already present
        if "tokenlearn.patch" in patch_file.name:
            pretrain_file = target_dir / "tokenlearn" / "pretrain.py"
            if pretrain_file.exists():
                pretrain_content = pretrain_file.read_text()
                # Check for the specific fix we're adding
                if (
                    "Fix for index out of bounds issue" in pretrain_content
                    and "torch.clamp(input_ids, 0, self.w.shape[0] - 1)" in pretrain_content
                ):
                    return True

        return False

    except Exception as e:
        logger.warning(f"Error checking if patch {patch_file.name} is applied: {e}")
        return False


def clean_duplicate_validation_code(target_dir: Path) -> bool:
    """
    Clean up duplicate validation code that might have been added by multiple patch applications.

    Args:
        target_dir: Target directory (usually site-packages)

    Returns:
        True if cleanup was successful, False otherwise
    """
    try:
        inference_file = target_dir / "model2vec" / "distill" / "inference.py"
        if not inference_file.exists():
            return True

        content = inference_file.read_text()
        lines = content.split("\n")

        # Find all instances of the validation code
        validation_indices = []
        for i, line in enumerate(lines):
            if "Token-vector mismatch:" in line:
                validation_indices.append(i)

        if len(validation_indices) <= 1:
            return True  # No duplicates or no validation code

        # Keep only the validation code that appears before a return statement
        lines_to_keep = []
        skip_until = -1

        for i, line in enumerate(lines):
            if i <= skip_until:
                continue

            # If this is validation code
            if "Token-vector mismatch:" in line:
                # Look ahead to see if there's a return statement nearby
                has_return_after = False
                for j in range(i, min(len(lines), i + 20)):
                    if "return out_tokens, out_weights" in lines[j]:
                        has_return_after = True
                        break

                # Keep this validation block only if it's followed by a return
                if has_return_after:
                    lines_to_keep.append(line)
                else:
                    # Skip this validation block (it's a duplicate)
                    # Find the end of this validation block
                    for j in range(i + 1, len(lines)):
                        if lines[j].strip() == "" or not lines[j].startswith(" "):
                            skip_until = j - 1
                            break
            else:
                lines_to_keep.append(line)

        # Write back the cleaned content
        cleaned_content = "\n".join(lines_to_keep)
        inference_file.write_text(cleaned_content)
        logger.info("Cleaned duplicate validation code from inference.py")
        return True

    except Exception as e:
        logger.warning(f"Error cleaning duplicate validation code: {e}")
        return False


def main() -> None:
    """Main function for standalone execution."""
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    print("Applying all patches...")
    success_count = apply_all_patches()
    print(f"Done. Applied {success_count} patches.")


if __name__ == "__main__":
    main()
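One note on the featurization hand-off introduced in `distill.py` above: `_prepare_custom_dataset_for_tokenlearn` writes one `{"text": ...}` object per line, and the featurization step streams it back with 🤗 `datasets`. A minimal sketch of that round-trip, where the path and sample texts are illustrative rather than the exact values used by the workflow:

```python
import json
from pathlib import Path

from datasets import load_dataset

# Illustrative path; the real code writes <tokenlearn_dir>/custom_dataset/train.json.
train_json_path = Path("custom_dataset/train.json")
train_json_path.parent.mkdir(parents=True, exist_ok=True)

# One JSON object per line, matching the format written above.
with train_json_path.open("w") as f:
    for text in ["find the maximum value in a list", "def find_max(xs): return max(xs)"]:
        json.dump({"text": text}, f)
        f.write("\n")

# Streaming load mirrors the featurization call: no config name for local JSON files.
dataset = load_dataset("json", data_files=str(train_json_path), split="train", streaming=True)
for record in dataset:
    print(record["text"])
```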
|
uv.lock
CHANGED
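The lockfile below drops `model2vec` and `tokenlearn` as external packages and instead lists their former runtime dependencies directly, now that the code imports those libraries from the vendored `distiller.model2vec` / `distiller.tokenlearn` modules. Loading a distilled model is unchanged apart from the import path; a minimal sketch, where the model path is illustrative and `encode` follows the upstream model2vec API that the vendored copy mirrors:

```python
from distiller.model2vec.model import StaticModel

# Illustrative path; final models are written as code_model2vec_<teacher_name> by the workflow above.
model = StaticModel.from_pretrained("code_model2vec/final/code_model2vec_all_mpnet_base_v2")

embeddings = model.encode([
    "def binary_search(arr, target): ...",
    "how do I search a sorted list?",
])
print(embeddings.shape)  # (2, embedding_dim)
```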
@@ -774,24 +774,31 @@ dependencies = [
    { name = "flash-attn" },
    { name = "hatchling" },
    { name = "iso639" },
+    { name = "jinja2" },
+    { name = "joblib" },
    { name = "kaleido" },
    { name = "lightning" },
    { name = "matplotlib" },
-    { name = "
+    { name = "more-itertools" },
    { name = "mteb" },
    { name = "numpy" },
    { name = "plotly" },
    { name = "psutil" },
    { name = "pydantic" },
    { name = "requests" },
+    { name = "rich" },
+    { name = "safetensors" },
    { name = "scikit-learn" },
    { name = "seaborn" },
    { name = "sentence-transformers" },
    { name = "setuptools" },
+    { name = "skops" },
    { name = "smart-open", extra = ["s3"] },
    { name = "statsmodels" },
-    { name = "
+    { name = "tokenizers" },
    { name = "torch" },
+    { name = "tqdm" },
+    { name = "transformers" },
    { name = "typer" },
]

@@ -813,24 +820,31 @@ requires-dist = [
    { name = "flash-attn", specifier = ">=2.7.4.post1" },
    { name = "hatchling", specifier = ">=1.27.0" },
    { name = "iso639", specifier = ">=0.1.4" },
+    { name = "jinja2", specifier = ">=3.0.0" },
+    { name = "joblib", specifier = ">=1.0.0" },
    { name = "kaleido", specifier = "==1.0.0rc13" },
    { name = "lightning", specifier = ">=2.5.1.post0" },
    { name = "matplotlib", specifier = ">=3.10.3" },
-    { name = "
+    { name = "more-itertools", specifier = ">=10.5.0" },
    { name = "mteb", specifier = ">=1.14.15" },
    { name = "numpy", specifier = ">=1.26.4" },
    { name = "plotly", specifier = ">=6.1.1" },
    { name = "psutil", specifier = ">=7.0.0" },
    { name = "pydantic", specifier = ">=2.11.5" },
    { name = "requests", specifier = ">=2.32.3" },
+    { name = "rich", specifier = ">=10.0.0" },
+    { name = "safetensors", specifier = ">=0.3.0" },
    { name = "scikit-learn", specifier = ">=1.6.1" },
    { name = "seaborn", specifier = ">=0.13.2" },
    { name = "sentence-transformers", specifier = ">=4.1.0" },
    { name = "setuptools", specifier = ">=80.8.0" },
+    { name = "skops", specifier = ">=0.11.0" },
    { name = "smart-open", extras = ["s3"], specifier = ">=7.1.0" },
    { name = "statsmodels", specifier = ">=0.14.4" },
-    { name = "
+    { name = "tokenizers", specifier = ">=0.20" },
    { name = "torch", specifier = ">=2.7.0" },
+    { name = "tqdm", specifier = ">=4.65.0" },
+    { name = "transformers", specifier = "<=4.52.1" },
    { name = "typer", specifier = ">=0.16.0" },
]

@@ -1187,38 +1201,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 },
]

-[[package]]
-name = "model2vec"
-version = "0.5.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "jinja2" },
-    { name = "joblib" },
-    { name = "numpy" },
-    { name = "rich" },
-    { name = "safetensors" },
-    { name = "setuptools" },
-    { name = "tokenizers" },
-    { name = "tqdm" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/93/18/c546916657e47e52b6e25b231803903bcf4e7ef2497fe41e9869236d7dee/model2vec-0.5.0.tar.gz", hash = "sha256:0771fd99d5c58fac631a2faa233759a8cec7a3be6e9aeeeeeca2d5e7048d1c7b", size = 2665840 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/66/ab/5263bc4605e9960fece76b710c01fef33859dc6ae72832d5987db75eed63/model2vec-0.5.0-py3-none-any.whl", hash = "sha256:12f14a18556975c037961a836a702388876bfec1ff76176f056884d219735271", size = 44578 },
-]
-
-[package.optional-dependencies]
-distill = [
-    { name = "scikit-learn" },
-    { name = "torch" },
-    { name = "transformers" },
-]
-train = [
-    { name = "lightning" },
-    { name = "scikit-learn" },
-    { name = "skops" },
-    { name = "torch" },
-]
-
[[package]]
name = "more-itertools"
version = "10.7.0"

@@ -2492,22 +2474,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/e6/b6/072a8e053ae600dcc2ac0da81a23548e3b523301a442a6ca900e92ac35be/tokenizers-0.21.1-cp39-abi3-win_amd64.whl", hash = "sha256:0f0dcbcc9f6e13e675a66d7a5f2f225a736745ce484c1a4e07476a89ccdad382", size = 2435481 },
]

-[[package]]
-name = "tokenlearn"
-version = "0.2.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "datasets" },
-    { name = "model2vec", extra = ["distill"] },
-    { name = "more-itertools" },
-    { name = "sentence-transformers" },
-    { name = "torch" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/58/b6/f9587ea271a9a7464cd25025b65f471d49bbceb48cc90742a89ac085edfd/tokenlearn-0.2.0.tar.gz", hash = "sha256:7a8faa0f51a510d185a40bef197a88116464adb8ce85ffd12c1d6905369c2375", size = 149042 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/40/3d/1c2b2e80ffd929bb8e7930d6a48e3b4252676cdc6c0c38f13a6f0f374b9c/tokenlearn-0.2.0-py3-none-any.whl", hash = "sha256:7a05e2800420eb2914c30e7377adeb14822c63585a0b9ed018bc82735dae1f29", size = 11970 },
-]
-
[[package]]
name = "torch"
version = "2.7.0"

@@ -2580,7 +2546,7 @@ wheels = [

[[package]]
name = "transformers"
-version = "4.52.
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "filelock" },

@@ -2594,9 +2560,9 @@ dependencies = [
    { name = "tokenizers" },
    { name = "tqdm" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/
wheels = [
-    { url = "https://files.pythonhosted.org/packages/
]

[[package]]
{ url = "https://files.pythonhosted.org/packages/e6/b6/072a8e053ae600dcc2ac0da81a23548e3b523301a442a6ca900e92ac35be/tokenizers-0.21.1-cp39-abi3-win_amd64.whl", hash = "sha256:0f0dcbcc9f6e13e675a66d7a5f2f225a736745ce484c1a4e07476a89ccdad382", size = 2435481 },
|
2475 |
]
|
2476 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2477 |
[[package]]
|
2478 |
name = "torch"
|
2479 |
version = "2.7.0"
|
|
|
2546 |
|
2547 |
[[package]]
|
2548 |
name = "transformers"
|
2549 |
+
version = "4.52.1"
|
2550 |
source = { registry = "https://pypi.org/simple" }
|
2551 |
dependencies = [
|
2552 |
{ name = "filelock" },
|
|
|
2560 |
{ name = "tokenizers" },
|
2561 |
{ name = "tqdm" },
|
2562 |
]
|
2563 |
+
sdist = { url = "https://files.pythonhosted.org/packages/4a/de/f3f3a0649dc522aeff55a5739e06e132c875c53701307a2ddd7ce7528ec5/transformers-4.52.1.tar.gz", hash = "sha256:c380d583ed9c7ebe3e30ca5e55ec1249db39eb9ee277f8e74dab1abc6a03c938", size = 8944009 }
|
2564 |
wheels = [
|
2565 |
+
{ url = "https://files.pythonhosted.org/packages/b8/1e/2b00e5021c3545d4a0ae32f3d332ae29e62a6259092f1468976e7b9d4adb/transformers-4.52.1-py3-none-any.whl", hash = "sha256:604b2bb357c480dc5883b7944e8562c967f6b06f63dfb6a1c4665d13d067148f", size = 10459023 },
|
2566 |
]
|
2567 |
|
2568 |
[[package]]
|