Sarthak committed
Commit ecfceb8 · 1 Parent(s): 7bb46ce

initial commit
.gitattributes CHANGED
@@ -1,37 +1,6 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
  *.png filter=lfs diff=lfs merge=lfs -text
+ evaluation/** filter=lfs diff=lfs merge=lfs -text
+ *.skops* filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,14 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
+
+ # Cache
+ .ruff_cache
+ .mypy_cache
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
MTEB_evaluate.py ADDED
@@ -0,0 +1,350 @@
+ #!/usr/bin/env python
+ """
+ MTEB Evaluation Script for Distilled Model - Code-Focused Tasks.
+
+ This script evaluates the distilled gte-Qwen2-7B-instruct model using MTEB
+ (Massive Text Embedding Benchmark) with a focus on tasks relevant for code:
+
+ - Classification: Tests ability to distinguish between different categories (e.g., programming languages)
+ - Clustering: Tests ability to group similar code by functionality
+ - STS: Tests semantic similarity understanding between code snippets
+ - Retrieval: Tests code search and duplicate detection capabilities
+
+ Features:
+ - Incremental evaluation: Skips tasks that already have results in mteb_results/
+ - Combines existing and new results automatically
+ - Saves results in multiple formats for analysis
+
+ Usage:
+     python MTEB_evaluate.py
+
+ Configuration:
+ - Set EVAL_ALL_TASKS = False to use only CODE_SPECIFIC_TASKS
+ - Modify CODE_SPECIFIC_TASKS for granular task selection
+ """
+
+ import json
+ import logging
+ import sys
+ import time
+ from pathlib import Path
+
+ import mteb
+ from model2vec import StaticModel
+ from mteb import ModelMeta
+
+ from evaluation import (
+     CustomMTEB,
+     get_tasks,
+     make_leaderboard,
+     parse_mteb_results,
+     summarize_results,
+ )
+
+ # =============================================================================
+ # CONFIGURATION CONSTANTS
+ # =============================================================================
+
+ # Model Configuration
+ MODEL_PATH = "."  # Path to the distilled model directory
+ MODEL_NAME = "gte-Qwen2-7B-instruct-M2V-Distilled"  # Name for the model in results
+
+ # Evaluation Configuration
+ OUTPUT_DIR = "mteb_results"  # Directory to save evaluation results
+
+ EVAL_ALL_TASKS = True
+
+ # Specific tasks most relevant for code evaluation (focused selection)
+ CODE_SPECIFIC_TASKS = [
+     # Classification - Programming language/category classification
+     "Banking77Classification",  # Fine-grained classification (77 classes)
+     # Clustering - Code grouping by functionality
+     "StackExchangeClustering.v2",  # Technical Q&A clustering (most relevant)
+     # STS - Code similarity understanding
+     "STSBenchmark",  # Standard semantic similarity benchmark
+     # Retrieval - Code search capabilities
+     "CQADupstackProgrammersRetrieval",  # Programming Q&A retrieval
+     # PairClassification - Duplicate/similar code detection
+     "SprintDuplicateQuestions",  # Duplicate question detection
+ ]
+
+ # Evaluation settings
+ EVAL_SPLITS = ["test"]  # Dataset splits to evaluate on
+ VERBOSITY = 2  # MTEB verbosity level
+
+ # =============================================================================
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+
+
+ def check_existing_results(output_path: Path, tasks: list) -> list:
+     """Check for existing task results and filter out completed tasks."""
+     remaining_tasks = []
+     completed_tasks = []
+
+     for task in tasks:
+         task_name = task.metadata.name
+         # MTEB saves results under {model_name}/{revision}/{task_name}.json ("distilled" revision per ModelMeta below)
+         result_file = output_path / MODEL_NAME / "distilled" / f"{task_name}.json"
+         if result_file.exists():
+             completed_tasks.append(task_name)
+             logger.info(f"Skipping {task_name} - results already exist")
+         else:
+             remaining_tasks.append(task)
+
+     if completed_tasks:
+         logger.info(f"Found existing results for {len(completed_tasks)} tasks: {completed_tasks}")
+
+     return remaining_tasks
+
+
+ def load_existing_parsed_results(output_path: Path) -> dict:
+     """Load existing parsed results if they exist."""
+     parsed_results_file = output_path / "mteb_parsed_results.json"
+     if parsed_results_file.exists():
+         try:
+             with parsed_results_file.open("r") as f:
+                 return json.load(f)
+         except (json.JSONDecodeError, OSError) as e:
+             logger.warning(f"Could not load existing parsed results: {e}")
+     return {}
+
+
+ def load_and_display_existing_results(output_path: Path) -> None:
+     """Load and display existing MTEB results."""
+     summary_file = output_path / "mteb_summary.json"
+     if summary_file.exists():
+         with summary_file.open("r") as f:
+             summary = json.load(f)
+
+         logger.info("=" * 80)
+         logger.info("EXISTING MTEB EVALUATION RESULTS:")
+         logger.info("=" * 80)
+
+         stats = summary.get("summary_stats")
+         if stats:
+             logger.info(f"Total Datasets: {stats.get('total_datasets', 'N/A')}")
+             logger.info(f"Average Score: {stats.get('average_score', 0):.4f}")
+             logger.info(f"Median Score: {stats.get('median_score', 0):.4f}")
+
+         logger.info("=" * 80)
+     else:
+         logger.info("No existing summary found. Individual task results may still exist.")
+
+
+ def run_mteb_evaluation() -> None:
+     """Run MTEB evaluation using the evaluation package."""
+     output_path = Path(OUTPUT_DIR)
+     output_path.mkdir(parents=True, exist_ok=True)
+
+     logger.info(f"Loading model from {MODEL_PATH}")
+     model = StaticModel.from_pretrained(MODEL_PATH)
+     logger.info("Model loaded successfully")
+
+     # Set up model metadata for MTEB
+     model.mteb_model_meta = ModelMeta(  # type: ignore[attr-defined]
+         name=MODEL_NAME, revision="distilled", release_date=None, languages=["eng"]
+     )
+
+     # Get specific code-relevant tasks (focused selection)
+     logger.info("Getting focused code-relevant MTEB tasks")
+     logger.info(f"Selected specific tasks: {CODE_SPECIFIC_TASKS}")
+
+     if EVAL_ALL_TASKS:
+         all_tasks = get_tasks()
+     else:
+         all_tasks = [mteb.get_task(task_name, languages=["eng"]) for task_name in CODE_SPECIFIC_TASKS]
+
+     logger.info(f"Found {len(all_tasks)} total tasks")
+
+     # Check for existing results and filter out completed tasks
+     tasks = check_existing_results(output_path, all_tasks)
+     logger.info(f"Will evaluate {len(tasks)} remaining tasks")
+
+     if not tasks:
+         logger.info("No new tasks to evaluate - all tasks already completed!")
+
+         # Load and display existing results
+         logger.info("Loading existing results...")
+         try:
+             load_and_display_existing_results(output_path)
+         except (json.JSONDecodeError, OSError, KeyError) as e:
+             logger.warning(f"Could not load existing results: {e}")
+         return
+
+     # Define the CustomMTEB object with the specified tasks
+     evaluation = CustomMTEB(tasks=tasks)
+
+     # Run the evaluation
+     logger.info("Starting MTEB evaluation...")
+     start_time = time.time()
+
+     results = evaluation.run(model, eval_splits=EVAL_SPLITS, output_folder=str(output_path), verbosity=VERBOSITY)
+
+     end_time = time.time()
+     evaluation_time = end_time - start_time
+     logger.info(f"Evaluation completed in {evaluation_time:.2f} seconds")
+
+     # Parse the results and summarize them
+     logger.info("Parsing and summarizing results...")
+     parsed_results = parse_mteb_results(mteb_results=results, model_name=MODEL_NAME)
+
+     # Load existing results if any and combine them
+     existing_results = load_existing_parsed_results(output_path)
+     if existing_results:
+         logger.info("Combining with existing results...")
+         # Convert to dict for merging
+         parsed_dict = dict(parsed_results) if hasattr(parsed_results, "items") else {}
+         # Simple merge - existing results take precedence to avoid overwriting
+         for key, value in existing_results.items():
+             if key not in parsed_dict:
+                 parsed_dict[key] = value
+         parsed_results = parsed_dict
+
+     task_scores = summarize_results(parsed_results)
+
+     # Save results in different formats
+     save_results(output_path, results, parsed_results, task_scores, evaluation_time)
+
+     # Print the results in a leaderboard format
+     logger.info("MTEB Evaluation Results:")
+     logger.info("=" * 80)
+     leaderboard = make_leaderboard(task_scores)  # type: ignore[arg-type]
+     logger.info(leaderboard.to_string(index=False))
+     logger.info("=" * 80)
+
+     logger.info(f"Evaluation completed successfully. Results saved to {OUTPUT_DIR}")
+
+
+ def save_results(
+     output_path: Path, raw_results: list, parsed_results: dict, task_scores: dict, evaluation_time: float
+ ) -> None:
+     """Save evaluation results in multiple formats."""
+     # Save raw results
+     raw_results_file = output_path / "mteb_raw_results.json"
+     with raw_results_file.open("w") as f:
+         json.dump(raw_results, f, indent=2, default=str)
+     logger.info(f"Raw results saved to {raw_results_file}")
+
+     # Save parsed results
+     parsed_results_file = output_path / "mteb_parsed_results.json"
+     with parsed_results_file.open("w") as f:
+         json.dump(parsed_results, f, indent=2, default=str)
+     logger.info(f"Parsed results saved to {parsed_results_file}")
+
+     # Generate summary statistics
+     summary_stats = generate_summary_stats(task_scores)
+
+     # Save task scores summary
+     summary = {
+         "model_name": MODEL_NAME,
+         "evaluation_time_seconds": evaluation_time,
+         "task_scores": task_scores,
+         "summary_stats": summary_stats,
+     }
+
+     summary_file = output_path / "mteb_summary.json"
+     with summary_file.open("w") as f:
+         json.dump(summary, f, indent=2, default=str)
+     logger.info(f"Summary saved to {summary_file}")
+
+     # Save human-readable report
+     report_file = output_path / "mteb_report.txt"
+     generate_report(output_path, task_scores, summary_stats, evaluation_time)
+     logger.info(f"Report saved to {report_file}")
+
+
+ def generate_summary_stats(task_scores: dict) -> dict:
+     """Generate summary statistics from task scores."""
+     if not task_scores:
+         return {}
+
+     # Extract all individual dataset scores
+     all_scores = []
+     for model_data in task_scores.values():
+         if isinstance(model_data, dict) and "dataset_scores" in model_data:
+             dataset_scores = model_data["dataset_scores"]
+             if isinstance(dataset_scores, dict):
+                 all_scores.extend(
+                     [
+                         float(score)
+                         for score in dataset_scores.values()
+                         if isinstance(score, int | float) and str(score).lower() != "nan"
+                     ]
+                 )
+
+     if not all_scores:
+         return {}
+
+     import numpy as np
+
+     return {
+         "total_datasets": len(all_scores),
+         "average_score": float(np.mean(all_scores)),
+         "median_score": float(np.median(all_scores)),
+         "std_dev": float(np.std(all_scores)),
+         "min_score": float(np.min(all_scores)),
+         "max_score": float(np.max(all_scores)),
+     }
+
+
+ def generate_report(output_path: Path, task_scores: dict, summary_stats: dict, evaluation_time: float) -> None:
+     """Generate human-readable evaluation report."""
+     report_file = output_path / "mteb_report.txt"
+
+     with report_file.open("w") as f:
+         f.write("=" * 80 + "\n")
+         f.write("MTEB Evaluation Report\n")
+         f.write("=" * 80 + "\n\n")
+         f.write(f"Model: {MODEL_NAME}\n")
+         f.write(f"Model Path: {MODEL_PATH}\n")
+         f.write(f"Evaluation Time: {evaluation_time:.2f} seconds\n")
+
+         # Write summary stats
+         if summary_stats:
+             f.write(f"Total Datasets: {summary_stats['total_datasets']}\n\n")
+             f.write("Summary Statistics:\n")
+             f.write(f"  Average Score: {summary_stats['average_score']:.4f}\n")
+             f.write(f"  Median Score: {summary_stats['median_score']:.4f}\n")
+             f.write(f"  Standard Deviation: {summary_stats['std_dev']:.4f}\n")
+             f.write(f"  Score Range: {summary_stats['min_score']:.4f} - {summary_stats['max_score']:.4f}\n\n")
+         else:
+             f.write("Summary Statistics: No valid results found\n\n")
+
+         # Write leaderboard
+         f.write("Detailed Results:\n")
+         f.write("-" * 50 + "\n")
+         if task_scores:
+             leaderboard = make_leaderboard(task_scores)  # type: ignore[arg-type]
+             f.write(leaderboard.to_string(index=False))
+         else:
+             f.write("No results available\n")
+
+         f.write("\n\n" + "=" * 80 + "\n")
+
+
+ def main() -> None:
+     """Main evaluation function."""
+     logger.info(f"Starting MTEB evaluation for {MODEL_NAME}")
+     logger.info(f"Model path: {MODEL_PATH}")
+     logger.info(f"Output directory: {OUTPUT_DIR}")
+     logger.info("Running focused MTEB evaluation on code-relevant tasks:")
+     logger.info("  - Classification: Programming language classification")
+     logger.info("  - Clustering: Code clustering by functionality")
+     logger.info("  - STS: Semantic similarity between code snippets")
+     logger.info("  - Retrieval: Code search and retrieval")
+
+     try:
+         run_mteb_evaluation()
+         logger.info("Evaluation pipeline completed successfully!")
+
+     except Exception:
+         logger.exception("Evaluation failed")
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
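
The incremental skip in check_existing_results keys off JSON files already on disk. A hedged sketch for listing which tasks would be skipped, using the mteb_results layout committed later in this diff:

from pathlib import Path

# One JSON file per completed task, under {model_name}/{revision}/.
results_dir = Path("mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled")
done = sorted(p.stem for p in results_dir.glob("*.json"))
print(done)  # e.g. ['AmazonCounterfactualClassification', 'Banking77Classification', ...]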
README.md CHANGED
@@ -1,3 +1,3 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ ---
config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "model_type": "model2vec",
+   "architectures": [
+     "StaticModel"
+   ],
+   "tokenizer_name": "Alibaba-NLP/gte-Qwen2-7B-instruct",
+   "apply_pca": 256,
+   "apply_zipf": null,
+   "sif_coefficient": 0.0001,
+   "hidden_dim": 256,
+   "seq_length": 1000000,
+   "normalize": true
+ }
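
This config is what model2vec's StaticModel reads back at load time: apply_pca/hidden_dim fix the embedding width at 256, and "normalize": true makes every vector unit-length. A minimal sanity-check sketch (the repo-root path "." and the sample snippet are illustrative):

import numpy as np
from model2vec import StaticModel

# Load the distilled model from the repo root (config.json + model.safetensors + tokenizer.json).
model = StaticModel.from_pretrained(".")

emb = model.encode(["def binary_search(arr, target):"])
print(emb.shape)               # expect (1, 256), per "hidden_dim": 256
print(np.linalg.norm(emb[0]))  # expect ~1.0, per "normalize": true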
distill.py ADDED
@@ -0,0 +1,116 @@
+ #!/usr/bin/env python
+ """
+ Script to distill Alibaba-NLP/gte-Qwen2-7B-instruct using Model2Vec.
+
+ This script performs the following operations:
+ 1. Downloads the Alibaba-NLP/gte-Qwen2-7B-instruct model
+ 2. Distills it using Model2Vec to create a smaller, faster static model
+ 3. Saves the distilled model for further use
+ """
+
+ import logging
+ import shutil
+ import time
+ from pathlib import Path
+
+ from model2vec.distill import distill
+
+ # =============================================================================
+ # CONFIGURATION CONSTANTS
+ # =============================================================================
+
+ # Model Configuration
+ MODEL_NAME = "Alibaba-NLP/gte-Qwen2-7B-instruct"  # Model name or path for the source model
+ OUTPUT_DIR = "."  # Directory to save the distilled model (current directory)
+ PCA_DIMS = 256  # Dimensions for PCA reduction (smaller = faster but less accurate)
+
+ # Hub Configuration
+ SAVE_TO_HUB = False  # Whether to push the model to HuggingFace Hub
+ HUB_MODEL_ID = None  # Model ID for HuggingFace Hub (if saving to hub)
+
+ # Generation Configuration
+ SKIP_README = True  # Skip generating the README file
+
+ # =============================================================================
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+
+
+ def main() -> None:
+     """Run the distillation process for Alibaba-NLP/gte-Qwen2-7B-instruct."""
+     # Create output directory if it doesn't exist
+     output_dir = Path(OUTPUT_DIR)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     logger.info(f"Starting distillation of {MODEL_NAME}")
+     logger.info(f"Distilled model will be saved to {output_dir}")
+     logger.info(f"Using PCA dimensions: {PCA_DIMS}")
+     logger.info(f"Skipping README generation: {SKIP_README}")
+
+     # Record start time for benchmarking
+     start_time = time.time()
+
+     # Run the distillation
+     try:
+         logger.info("Starting Model2Vec distillation...")
+         m2v_model = distill(
+             model_name=MODEL_NAME,
+             pca_dims=PCA_DIMS,
+         )
+
+         distill_time = time.time() - start_time
+         logger.info(f"Distillation completed in {distill_time:.2f} seconds")
+
+         # Save the distilled model
+         m2v_model.save_pretrained(OUTPUT_DIR)
+         logger.info(f"Model saved to {OUTPUT_DIR}")
+
+         # Remove README.md if it was created and we want to skip it
+         if SKIP_README and (output_dir / "README.md").exists():
+             (output_dir / "README.md").unlink()
+             logger.info("Removed auto-generated README.md")
+
+         # Get model size information
+         model_size_mb = sum(
+             f.stat().st_size for f in output_dir.glob("**/*") if f.is_file() and f.name != "README.md"
+         ) / (1024 * 1024)
+         logger.info(f"Distilled model size: {model_size_mb:.2f} MB")
+
+         # Push to hub if requested
+         if SAVE_TO_HUB:
+             if HUB_MODEL_ID:
+                 logger.info(f"Pushing model to HuggingFace Hub as {HUB_MODEL_ID}")
+
+                 # Create a temporary README for Hub upload if needed
+                 readme_path = output_dir / "README.md"
+                 had_readme = readme_path.exists()
+
+                 if SKIP_README and had_readme:
+                     # Backup the README
+                     shutil.move(readme_path, output_dir / "README.md.bak")
+
+                 # Push to Hub
+                 m2v_model.push_to_hub(HUB_MODEL_ID)
+
+                 # Restore state
+                 if SKIP_README:
+                     if had_readme:
+                         # Restore the backup
+                         shutil.move(output_dir / "README.md.bak", readme_path)
+                     elif (output_dir / "README.md").exists():
+                         # Remove README created during push_to_hub
+                         (output_dir / "README.md").unlink()
+             else:
+                 logger.error("HUB_MODEL_ID must be specified when SAVE_TO_HUB is True")
+
+         logger.info("Distillation process completed successfully!")
+
+     except Exception:
+         logger.exception("Error during distillation")
+         raise
+
+
+ if __name__ == "__main__":
+     main()
evaluate.py ADDED
@@ -0,0 +1,422 @@
+ #!/usr/bin/env python
+ """
+ Script to evaluate the performance of the distilled gte-Qwen2-7B-instruct model.
+
+ This script performs the following:
+ 1. Loads both the original gte-Qwen2-7B-instruct model and the distilled version
+ 2. Compares them on:
+    - Embedding similarity
+    - Inference speed
+    - Memory usage
+ 3. Outputs a comprehensive evaluation report
+ """
+
+ import argparse
+ import gc
+ import logging
+ import os
+ import time
+ from pathlib import Path
+ from typing import Any, cast
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import psutil  # type: ignore [import]
+ import torch
+ from model2vec import StaticModel
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity  # type: ignore [import]
+
+ # For transformer models
+ from transformers import AutoModel, AutoTokenizer
+ from transformers.modeling_utils import PreTrainedModel
+
+ # =============================================================================
+ # CONFIGURATION CONSTANTS
+ # =============================================================================
+
+ # Model Configuration
+ ORIGINAL_MODEL = "Alibaba-NLP/gte-Qwen2-7B-instruct"  # Original model name or path
+ DISTILLED_MODEL = "."  # Path to the distilled model (current directory)
+ OUTPUT_DIR = "evaluation"  # Directory to save evaluation results
+
+ # =============================================================================
+
+ # Constants
+ BYTES_PER_KB = 1024.0
+ TEXT_TRUNCATE_LENGTH = 20
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+
+ # Sample texts for evaluation
+ SAMPLE_TEXTS = [
+     "def process_data_stream(source_iterator):",
+     "implement binary search tree",
+     "how to handle memory efficient data streaming",
+     """class LazyLoader:
+     def __init__(self, source):
+         self.generator = iter(source)
+         self._cache = []""",
+     """def dfs_traversal(root):
+     if not root:
+         return []
+     visited = []
+     stack = [root]
+     while stack:
+         node = stack.pop()
+         visited.append(node.val)
+         if node.right:
+             stack.append(node.right)
+         if node.left:
+             stack.append(node.left)
+     return visited""",
+ ]
+
+
+ def load_models(
+     original_model_name: str, distilled_model_path: str
+ ) -> tuple[tuple[SentenceTransformer | PreTrainedModel, str], StaticModel]:
+     """Load both the original and distilled models."""
+     logger.info(f"Loading original model: {original_model_name}")
+
+     try:
+         # Try to load as a sentence transformer first
+         original_model = SentenceTransformer(original_model_name)
+         model_type = "sentence_transformer"
+     except (ValueError, OSError, ImportError) as e:
+         # If that fails, try loading as a Hugging Face transformer
+         logger.info(f"Failed to load as SentenceTransformer: {e}")
+         AutoTokenizer.from_pretrained(original_model_name)
+         original_model = AutoModel.from_pretrained(original_model_name)
+         model_type = "huggingface"
+
+     logger.info(f"Loading distilled model from: {distilled_model_path}")
+     distilled_model = StaticModel.from_pretrained(distilled_model_path)
+
+     return (original_model, model_type), distilled_model
+
+
+ def measure_memory_usage(model: SentenceTransformer | PreTrainedModel | StaticModel) -> float:
+     """Measure memory usage of a model in MB."""
+     gc.collect()
+     torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+     process = psutil.Process(os.getpid())
+     memory_before = process.memory_info().rss / (1024 * 1024)  # MB
+
+     # Force model to allocate memory if it hasn't already
+     if isinstance(model, StaticModel | SentenceTransformer):
+         _ = model.encode(["Test"])
+     else:
+         # For HF models, we need to handle differently
+         pass
+
+     gc.collect()
+     torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+     process = psutil.Process(os.getpid())
+     memory_after = process.memory_info().rss / (1024 * 1024)  # MB
+
+     return memory_after - memory_before
+
+
+ def compute_embeddings(
+     original_model: SentenceTransformer | PreTrainedModel,
+     original_model_type: str,
+     distilled_model: StaticModel,
+     texts: list[str],
+     original_model_name: str = "unknown",
+ ) -> tuple[np.ndarray, np.ndarray]:
+     """Compute embeddings using both models."""
+     # Original model embeddings
+     if original_model_type == "sentence_transformer":
+         # Type narrowing: we know it's a SentenceTransformer here
+         sentence_model = cast("SentenceTransformer", original_model)
+         original_embeddings = sentence_model.encode(texts)
+     else:
+         # Type narrowing: we know it's a PreTrainedModel here
+         auto_model = original_model  # AutoModel.from_pretrained returns a PreTrainedModel instance
+
+         # For HF models, we need more custom code
+         # Simple mean pooling function for HF models
+         def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+             token_embeddings = model_output
+             input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+             return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+                 input_mask_expanded.sum(1), min=1e-9
+             )
+
+         # Get model name for tokenizer
+         model_name = getattr(auto_model.config, "name_or_path", original_model_name)
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
+
+         with torch.no_grad():
+             model_output = auto_model(**encoded_input)
+         original_embeddings = mean_pooling(model_output.last_hidden_state, encoded_input["attention_mask"]).numpy()
+
+     # Distilled model embeddings
+     distilled_embeddings = distilled_model.encode(texts)
+
+     return original_embeddings, distilled_embeddings
+
+
+ def measure_inference_speed(
+     model: SentenceTransformer | PreTrainedModel | StaticModel, model_type: str, texts: list[str], n_runs: int = 5
+ ) -> float:
+     """Measure inference speed in texts/second."""
+     # Warmup
+     if model_type in {"sentence_transformer", "static_model"}:
+         # Type narrowing: we know it has encode method here
+         encode_model = cast("SentenceTransformer | StaticModel", model)
+         _ = encode_model.encode(texts[:1])
+     else:
+         # Type narrowing: we know it's a PreTrainedModel here
+         auto_model = cast("PreTrainedModel", model)
+         # Warmup for HF models
+         model_name = getattr(auto_model.config, "name_or_path", "unknown")
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         encoded_input = tokenizer(texts[:1], padding=True, truncation=True, return_tensors="pt")
+         with torch.no_grad():
+             _ = auto_model(**encoded_input)
+
+     # Measure speed
+     start_time = time.time()
+
+     if model_type in {"sentence_transformer", "static_model"}:
+         # Type narrowing: we know it has encode method here
+         encode_model = cast("SentenceTransformer | StaticModel", model)
+         for _ in range(n_runs):
+             _ = encode_model.encode(texts)
+     else:
+         # Type narrowing: we know it's a PreTrainedModel here
+         auto_model = cast("PreTrainedModel", model)
+         # For HF models
+         model_name = getattr(auto_model.config, "name_or_path", "unknown")
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         for _ in range(n_runs):
+             encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
+             with torch.no_grad():
+                 _ = auto_model(**encoded_input)
+
+     total_time = time.time() - start_time
+     return (len(texts) * n_runs) / total_time
+
+
+ def compute_cosine_similarity(embeddings1: np.ndarray, embeddings2: np.ndarray) -> np.ndarray:
+     """Compute cosine similarity between embeddings, handling different dimensions.
+
+     For embeddings with different dimensions, we compute similarity by comparing
+     how they rank the same texts (semantically equivalent).
+     """
+     # Ensure embeddings1 and embeddings2 are 2D arrays with shapes (n_samples, n_features)
+     if embeddings1.ndim == 1:
+         embeddings1 = embeddings1.reshape(1, -1)
+     if embeddings2.ndim == 1:
+         embeddings2 = embeddings2.reshape(1, -1)
+
+     # Check and transpose if needed to ensure samples are in rows
+     if embeddings2.shape[0] != len(SAMPLE_TEXTS) and embeddings2.shape[1] == len(SAMPLE_TEXTS):
+         embeddings2 = embeddings2.T
+
+     logger.info(f"Embeddings shapes: original={embeddings1.shape}, distilled={embeddings2.shape}")
+
+     # If dimensions differ, we compute similarity matrix based on how each model ranks text pairs
+     # This is a form of semantic similarity evaluation rather than direct vector comparison
+     similarity_matrix = np.zeros((len(SAMPLE_TEXTS), len(SAMPLE_TEXTS)))
+
+     # Compute similarity matrices within each embedding space
+     sim1 = cosine_similarity(embeddings1)
+     sim2 = cosine_similarity(embeddings2)
+
+     # The similarity between samples i and j is the correlation between how they rank other samples
+     for i in range(len(SAMPLE_TEXTS)):
+         for j in range(len(SAMPLE_TEXTS)):
+             # For diagonal elements (same sample), use a direct measure of how similarly
+             # the two models rank that sample against all others
+             if i == j:
+                 # Pearson correlation between the rankings (excluding self-comparison)
+                 rankings1 = np.delete(sim1[i], i)
+                 rankings2 = np.delete(sim2[i], i)
+                 # Higher correlation means the models agree on the semantic similarity
+                 similarity_matrix[i, j] = np.corrcoef(rankings1, rankings2)[0, 1]
+             else:
+                 # For off-diagonal elements, compare how similarly both models relate samples i and j
+                 similarity_matrix[i, j] = 1 - abs(sim1[i, j] - sim2[i, j])
+
+     return similarity_matrix
+
+
+ def format_size(size_bytes: float) -> str:
+     """Format size in bytes to human-readable format."""
+     for unit in ["B", "KB", "MB", "GB"]:
+         if size_bytes < BYTES_PER_KB:
+             return f"{size_bytes:.2f} {unit}"
+         size_bytes /= BYTES_PER_KB
+     return f"{size_bytes:.2f} TB"
+
+
+ def plot_comparison(results: dict[str, Any], output_dir: str) -> None:
+     """Generate comparison plots and save them."""
+     output_path = Path(output_dir)
+     output_path.mkdir(exist_ok=True, parents=True)
+
+     # Speed comparison
+     plt.figure(figsize=(10, 6))
+     models = ["Original", "Distilled"]
+     speeds = [results["original_speed"], results["distilled_speed"]]
+     plt.bar(models, speeds, color=["#1f77b4", "#ff7f0e"])
+     plt.ylabel("Texts per second")
+     plt.title("Inference Speed Comparison")
+     plt.savefig(output_path / "speed_comparison.png", dpi=300, bbox_inches="tight")
+
+     # Memory comparison
+     plt.figure(figsize=(10, 6))
+     memories = [results["original_memory"], results["distilled_memory"]]
+     plt.bar(models, memories, color=["#1f77b4", "#ff7f0e"])
+     plt.ylabel("Memory Usage (MB)")
+     plt.title("Memory Usage Comparison")
+     plt.savefig(output_path / "memory_comparison.png", dpi=300, bbox_inches="tight")
+
+     # Size comparison
+     plt.figure(figsize=(10, 6))
+     sizes = [results["original_size"], results["distilled_size"]]
+     plt.bar(models, sizes, color=["#1f77b4", "#ff7f0e"])
+     plt.ylabel("Model Size (MB)")
+     plt.title("Model Size Comparison")
+     plt.savefig(output_path / "size_comparison.png", dpi=300, bbox_inches="tight")
+
+     # Similarity matrix heatmap
+     plt.figure(figsize=(8, 6))
+     plt.imshow(results["similarity_matrix"], cmap="viridis", interpolation="nearest")
+     plt.colorbar(label="Cosine Similarity")
+     plt.title("Embedding Similarity Between Original and Distilled Models")
+     plt.xticks([])
+     plt.yticks(
+         range(len(SAMPLE_TEXTS)),
+         [t[:TEXT_TRUNCATE_LENGTH] + "..." if len(t) > TEXT_TRUNCATE_LENGTH else t for t in SAMPLE_TEXTS],
+     )
+     plt.savefig(output_path / "similarity_matrix.png", dpi=300, bbox_inches="tight")
+
+
+ def evaluate_models(original_model_name: str, distilled_model_path: str, output_dir: str) -> dict[str, Any]:
+     """Evaluate the original and distilled models."""
+     # Load models
+     (original_model, original_model_type), distilled_model = load_models(original_model_name, distilled_model_path)
+
+     # Measure model sizes
+     if isinstance(original_model, SentenceTransformer):
+         # For SentenceTransformer, get parameters from all modules
+         total_params = 0
+         for module in original_model.modules():
+             if hasattr(module, "parameters"):
+                 for param in module.parameters():
+                     total_params += param.numel()
+         original_model_size = total_params * 4 / (1024 * 1024)  # MB (assuming float32)
+     else:
+         # For PreTrainedModel
+         auto_model = original_model  # AutoModel.from_pretrained returns a PreTrainedModel instance
+         original_model_size = sum(p.numel() * 4 for p in auto_model.parameters()) / (
+             1024 * 1024
+         )  # MB (assuming float32)
+
+     # Calculate distilled model size - only count actual model files
+     model_files = ["model.safetensors", "config.json", "modules.json", "tokenizer.json"]
+     distilled_model_size = 0.0
+     for file_name in model_files:
+         file_path = Path(distilled_model_path) / file_name
+         if file_path.exists():
+             distilled_model_size += file_path.stat().st_size
+     distilled_model_size = distilled_model_size / (1024 * 1024)  # Convert to MB
+
+     # Measure memory usage
+     original_memory = measure_memory_usage(original_model)
+     distilled_memory = measure_memory_usage(distilled_model)
+
+     # Compute embeddings
+     original_embeddings, distilled_embeddings = compute_embeddings(
+         original_model, original_model_type, distilled_model, SAMPLE_TEXTS, original_model_name
+     )
+
+     # Compute similarity between embeddings
+     similarity_matrix = compute_cosine_similarity(original_embeddings, distilled_embeddings)
+     similarity_diagonal = np.diag(similarity_matrix)
+     avg_similarity = np.mean(similarity_diagonal)
+
+     # Measure inference speed
+     original_speed = measure_inference_speed(original_model, original_model_type, SAMPLE_TEXTS, n_runs=5)
+     distilled_speed = measure_inference_speed(distilled_model, "static_model", SAMPLE_TEXTS, n_runs=5)
+
+     # Collect results
+     results = {
+         "original_size": original_model_size,
+         "distilled_size": distilled_model_size,
+         "original_memory": original_memory,
+         "distilled_memory": distilled_memory,
+         "similarity_matrix": similarity_matrix,
+         "avg_similarity": avg_similarity,
+         "original_speed": original_speed,
+         "distilled_speed": distilled_speed,
+         "speed_improvement": distilled_speed / original_speed if original_speed > 0 else float("inf"),
+         "size_reduction": original_model_size / distilled_model_size if distilled_model_size > 0 else float("inf"),
+         "memory_reduction": original_memory / distilled_memory if distilled_memory > 0 else float("inf"),
+     }
+
+     # Generate plots
+     plot_comparison(results, output_dir)
+
+     # Print results
+     separator = "=" * 50
+     logger.info("\n%s", separator)
+     logger.info("Model Evaluation Results")
+     logger.info("%s", separator)
+     logger.info(f"Original Model Size: {results['original_size']:.2f} MB")
+     logger.info(f"Distilled Model Size: {results['distilled_size']:.2f} MB")
+     logger.info(f"Size Reduction Factor: {results['size_reduction']:.2f}x")
+     logger.info("\n")
+     logger.info(f"Original Model Memory: {results['original_memory']:.2f} MB")
+     logger.info(f"Distilled Model Memory: {results['distilled_memory']:.2f} MB")
+     logger.info(f"Memory Reduction Factor: {results['memory_reduction']:.2f}x")
+     logger.info("\n")
+     logger.info(f"Original Model Speed: {results['original_speed']:.2f} texts/second")
+     logger.info(f"Distilled Model Speed: {results['distilled_speed']:.2f} texts/second")
+     logger.info(f"Speed Improvement Factor: {results['speed_improvement']:.2f}x")
+     logger.info("\n")
+     logger.info(f"Average Embedding Similarity: {results['avg_similarity']:.4f}")
+     logger.info("%s", separator)
+
+     return results
+
+
+ def main() -> None:
+     """Run the evaluation process."""
+     parser = argparse.ArgumentParser(description="Evaluate the distilled model against the original")
+     parser.add_argument("--original_model", default=ORIGINAL_MODEL, help="Original model name or path")
+     parser.add_argument("--distilled_model", default=DISTILLED_MODEL, help="Path to the distilled model")
+     parser.add_argument("--output_dir", default=OUTPUT_DIR, help="Directory to save evaluation results")
+
+     args = parser.parse_args()
+
+     # Validate configuration
+     if not args.distilled_model:
+         logger.error("Distilled model path must be provided")
+         logger.error("Use --distilled_model to specify the path or set DISTILLED_MODEL constant")
+         return
+
+     # Create output directory
+     output_dir = Path(args.output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     # Run evaluation
+     try:
+         evaluate_models(args.original_model, args.distilled_model, args.output_dir)
+         logger.info(f"Evaluation completed. Results saved to {args.output_dir}")
+     except Exception:
+         logger.exception("Error during evaluation")
+         raise
+
+
+ if __name__ == "__main__":
+     main()
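
A note on compute_cosine_similarity above: because the original and distilled embeddings have different dimensionalities, vectors are never compared directly. Each model's within-space similarity matrix is computed first, and the diagonal of the reported matrix is the correlation between how the two models rank each text against the others. A toy sketch of that diagonal measure on random data (illustrative only):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
emb_a = rng.random((3, 8))  # "original" space, 8-dim
emb_b = rng.random((3, 4))  # "distilled" space, 4-dim

sim_a = cosine_similarity(emb_a)
sim_b = cosine_similarity(emb_b)

# Agreement for text 0: correlate its similarities to the other texts
# in each space, excluding the self-comparison.
rankings_a = np.delete(sim_a[0], 0)
rankings_b = np.delete(sim_b[0], 0)
print(np.corrcoef(rankings_a, rankings_b)[0, 1])  # near 1.0 when both models order neighbours alike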
evaluation/memory_comparison.png ADDED

Git LFS Details

  • SHA256: 3e1258d250bc8fb9a5f8deb0a5b91eea2d7de8aa99b8c1353383e30a203874e4
  • Pointer size: 130 Bytes
  • Size of remote file: 72.3 kB
evaluation/similarity_matrix.png ADDED

Git LFS Details

  • SHA256: c9e597292d3f120e347b0382d80a7cf5da00cce234167276bc6acc219602a7ef
  • Pointer size: 131 Bytes
  • Size of remote file: 118 kB
evaluation/size_comparison.png ADDED

Git LFS Details

  • SHA256: 0e8356904ec54d4837361f88af9dfe7cb2161acbb3bb9b08ac1d11e718804d77
  • Pointer size: 130 Bytes
  • Size of remote file: 77.2 kB
evaluation/speed_comparison.png ADDED

Git LFS Details

  • SHA256: e8edf611a573a36223262ccfdcfa6251e14e1d1982af70144d21560684d69cba
  • Pointer size: 130 Bytes
  • Size of remote file: 80 kB
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9dc253eb31caa17834057d92999b03869cda542d5f70ecca3d4a5f03b3563b3f
+ size 155283544
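
The three lines above are the Git LFS pointer that stands in for the ~155 MB weight file; the actual tensors live in LFS storage and are fetched on checkout. Once the real file is present, a hedged way to inspect it (tensor names are whatever model2vec wrote, not assumed here):

from safetensors.numpy import load_file

# Load every tensor in the file as NumPy arrays and list name, shape, dtype.
tensors = load_file("model.safetensors")
for name, tensor in tensors.items():
    print(name, tensor.shape, tensor.dtype)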
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": ".",
+     "type": "sentence_transformers.models.StaticEmbedding"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Normalize",
+     "type": "sentence_transformers.models.Normalize"
+   }
+ ]
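
modules.json is the sentence-transformers pipeline description: module 0 is a StaticEmbedding lookup table at the repo root and module 1 is an L2-Normalize step. A minimal sketch of loading the same checkpoint through that route (assumes a sentence-transformers version that ships StaticEmbedding):

from sentence_transformers import SentenceTransformer

# Assembles StaticEmbedding + Normalize exactly as listed in modules.json.
st_model = SentenceTransformer(".")
emb = st_model.encode("implement binary search tree")
print(emb.shape)  # (256,): one 256-dim vector, already unit-normalized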
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/AmazonCounterfactualClassification.json ADDED
@@ -0,0 +1,179 @@
+ {
+   "dataset_revision": "e8379541af4e31359cca9fbcf4b00f2671dba205",
+   "evaluation_time": 8.737873554229736,
+   "kg_co2_emissions": null,
+   "mteb_version": "1.14.15",
+   "scores": {
+     "test": [
+       {
+         "accuracy": 0.5690404797601201,
+         "ap": 0.13918928297805203,
+         "ap_weighted": 0.13918928297805203,
+         "f1": 0.47354721284407075,
+         "f1_weighted": 0.6484719142466673,
+         "hf_subset": "en-ext",
+         "languages": [
+           "eng-Latn"
+         ],
+         "main_score": 0.5690404797601201,
+         "scores_per_experiment": [
+           {
+             "accuracy": 0.6041979010494752,
+             "ap": 0.1403174548244783,
+             "ap_weighted": 0.1403174548244783,
+             "f1": 0.49333755341034974,
+             "f1_weighted": 0.6809473724919468
+           },
+           {
+             "accuracy": 0.6319340329835083,
+             "ap": 0.15145229144021116,
+             "ap_weighted": 0.15145229144021116,
+             "f1": 0.5150100219839455,
+             "f1_weighted": 0.7035165191069046
+           },
+           {
+             "accuracy": 0.6124437781109445,
+             "ap": 0.15370328203750555,
+             "ap_weighted": 0.15370328203750555,
+             "f1": 0.5069738581294719,
+             "f1_weighted": 0.6874863954073245
+           },
+           {
+             "accuracy": 0.5712143928035982,
+             "ap": 0.13285280504159222,
+             "ap_weighted": 0.13285280504159222,
+             "f1": 0.471264367816092,
+             "f1_weighted": 0.6532423443450689
+           },
+           {
+             "accuracy": 0.6101949025487257,
+             "ap": 0.1382528418572316,
+             "ap_weighted": 0.1382528418572316,
+             "f1": 0.49459093982420554,
+             "f1_weighted": 0.6859354298509973
+           },
+           {
+             "accuracy": 0.5194902548725637,
+             "ap": 0.12777013417285304,
+             "ap_weighted": 0.12777013417285304,
+             "f1": 0.4405866978944166,
+             "f1_weighted": 0.6068983868543434
+           },
+           {
+             "accuracy": 0.43778110944527737,
+             "ap": 0.11875450153550213,
+             "ap_weighted": 0.11875450153550213,
+             "f1": 0.3875609684433214,
+             "f1_weighted": 0.5263894210560583
+           },
+           {
+             "accuracy": 0.6176911544227887,
+             "ap": 0.14128018744097307,
+             "ap_weighted": 0.14128018744097307,
+             "f1": 0.5005725863284003,
+             "f1_weighted": 0.6920233725631899
+           },
+           {
+             "accuracy": 0.49025487256371814,
+             "ap": 0.14466447368521512,
+             "ap_weighted": 0.14466447368521512,
+             "f1": 0.43475703375805064,
+             "f1_weighted": 0.5749621002144737
+           },
+           {
+             "accuracy": 0.5952023988005997,
+             "ap": 0.14284485774495798,
+             "ap_weighted": 0.14284485774495798,
+             "f1": 0.4908181008524535,
+             "f1_weighted": 0.6733178005763648
+           }
+         ]
+       },
+       {
+         "accuracy": 0.62,
+         "ap": 0.26415963699316264,
+         "ap_weighted": 0.26415963699316264,
+         "f1": 0.5644640290850564,
+         "f1_weighted": 0.6579491434972964,
+         "hf_subset": "en",
+         "languages": [
+           "eng-Latn"
+         ],
+         "main_score": 0.62,
+         "scores_per_experiment": [
+           {
+             "accuracy": 0.5955223880597015,
+             "ap": 0.25283011702254965,
+             "ap_weighted": 0.25283011702254965,
+             "f1": 0.5461419440632507,
+             "f1_weighted": 0.637305840672083
+           },
+           {
+             "accuracy": 0.5835820895522388,
+             "ap": 0.26075921450734807,
+             "ap_weighted": 0.26075921450734807,
+             "f1": 0.5441743417924314,
+             "f1_weighted": 0.6257903879142659
+           },
+           {
+             "accuracy": 0.6029850746268657,
+             "ap": 0.24791359505097144,
+             "ap_weighted": 0.24791359505097144,
+             "f1": 0.5467492700989818,
+             "f1_weighted": 0.643970491486171
+           },
+           {
+             "accuracy": 0.6029850746268657,
+             "ap": 0.26571020411740476,
+             "ap_weighted": 0.26571020411740476,
+             "f1": 0.5578808446455505,
+             "f1_weighted": 0.6438739560117962
+           },
+           {
+             "accuracy": 0.6761194029850747,
+             "ap": 0.2747168088064611,
+             "ap_weighted": 0.2747168088064611,
+             "f1": 0.5995328480020714,
+             "f1_weighted": 0.7061789723470043
+           },
+           {
+             "accuracy": 0.6537313432835821,
+             "ap": 0.2721721103504757,
+             "ap_weighted": 0.2721721103504757,
+             "f1": 0.587775408670931,
+             "f1_weighted": 0.6881859359916834
+           },
+           {
+             "accuracy": 0.6597014925373135,
+             "ap": 0.26444025941241256,
+             "ap_weighted": 0.26444025941241256,
+             "f1": 0.5851663570893213,
+             "f1_weighted": 0.692245002380803
+           },
+           {
+             "accuracy": 0.6149253731343284,
+             "ap": 0.23113683661630094,
+             "ap_weighted": 0.23113683661630094,
+             "f1": 0.538936721825689,
+             "f1_weighted": 0.6529196987886481
+           },
+           {
+             "accuracy": 0.5791044776119403,
+             "ap": 0.2786207978292612,
+             "ap_weighted": 0.2786207978292612,
+             "f1": 0.5501428571428572,
+             "f1_weighted": 0.6196507462686567
+           },
+           {
+             "accuracy": 0.6313432835820896,
+             "ap": 0.29329642621844076,
+             "ap_weighted": 0.29329642621844076,
+             "f1": 0.5881396975194806,
+             "f1_weighted": 0.6693704031118514
+           }
+         ]
+       }
+     ]
+   },
+   "task_name": "AmazonCounterfactualClassification"
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/Banking77Classification.json ADDED
@@ -0,0 +1,73 @@
+ {
+   "dataset_revision": "0fd18e25b25c072e09e0d92ab615fda904d66300",
+   "evaluation_time": 6.451777696609497,
+   "kg_co2_emissions": null,
+   "mteb_version": "1.14.15",
+   "scores": {
+     "test": [
+       {
+         "accuracy": 0.4396103896103896,
+         "f1": 0.4142711532114576,
+         "f1_weighted": 0.4142711532114576,
+         "hf_subset": "default",
+         "languages": [
+           "eng-Latn"
+         ],
+         "main_score": 0.4396103896103896,
+         "scores_per_experiment": [
+           {
+             "accuracy": 0.4279220779220779,
+             "f1": 0.4030476288783657,
+             "f1_weighted": 0.4030476288783656
+           },
+           {
+             "accuracy": 0.4211038961038961,
+             "f1": 0.39776168133611584,
+             "f1_weighted": 0.39776168133611584
+           },
+           {
+             "accuracy": 0.45064935064935063,
+             "f1": 0.42872843564828145,
+             "f1_weighted": 0.42872843564828145
+           },
+           {
+             "accuracy": 0.4448051948051948,
+             "f1": 0.420756828398419,
+             "f1_weighted": 0.42075682839841905
+           },
+           {
+             "accuracy": 0.44675324675324674,
+             "f1": 0.42100682221185654,
+             "f1_weighted": 0.42100682221185654
+           },
+           {
+             "accuracy": 0.45324675324675323,
+             "f1": 0.4392342490231314,
+             "f1_weighted": 0.4392342490231314
+           },
+           {
+             "accuracy": 0.437012987012987,
+             "f1": 0.4056017558988273,
+             "f1_weighted": 0.40560175589882724
+           },
+           {
+             "accuracy": 0.42337662337662335,
+             "f1": 0.39123709562594644,
+             "f1_weighted": 0.39123709562594655
+           },
+           {
+             "accuracy": 0.44512987012987015,
+             "f1": 0.41578171494860966,
+             "f1_weighted": 0.41578171494860966
+           },
+           {
+             "accuracy": 0.4461038961038961,
+             "f1": 0.4195553201450221,
+             "f1_weighted": 0.419555320145022
+           }
+         ]
+       }
+     ]
+   },
+   "task_name": "Banking77Classification"
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/CQADupstackProgrammersRetrieval.json ADDED
@@ -0,0 +1,158 @@
+ {
+   "dataset_revision": "6184bc1440d2dbc7612be22b50686b8826d22b32",
+   "evaluation_time": 99.69791841506958,
+   "kg_co2_emissions": null,
+   "mteb_version": "1.14.15",
+   "scores": {
+     "test": [
+       {
+         "hf_subset": "default",
+         "languages": [
+           "eng-Latn"
+         ],
+         "main_score": 0.0501,
+         "map_at_1": 0.02467,
+         "map_at_10": 0.03898,
+         "map_at_100": 0.04261,
+         "map_at_1000": 0.04333,
+         "map_at_20": 0.04068,
+         "map_at_3": 0.03388,
+         "map_at_5": 0.03693,
+         "mrr_at_1": 0.030821917808219176,
+         "mrr_at_10": 0.04904462926723201,
+         "mrr_at_100": 0.05339942610218758,
+         "mrr_at_1000": 0.05413492750157237,
+         "mrr_at_20": 0.05126402659708249,
+         "mrr_at_3": 0.04280821917808219,
+         "mrr_at_5": 0.04634703196347032,
+         "nauc_map_at_1000_diff1": 0.03644747951501248,
+         "nauc_map_at_1000_max": 0.2240572170754659,
+         "nauc_map_at_1000_std": -0.17708810912472517,
+         "nauc_map_at_100_diff1": 0.03759221625144172,
+         "nauc_map_at_100_max": 0.22324901446317413,
+         "nauc_map_at_100_std": -0.17630470695891512,
+         "nauc_map_at_10_diff1": 0.03906418656483989,
+         "nauc_map_at_10_max": 0.22061594321968936,
+         "nauc_map_at_10_std": -0.17777470317814356,
+         "nauc_map_at_1_diff1": 0.1731091343679673,
+         "nauc_map_at_1_max": 0.33459947679728974,
+         "nauc_map_at_1_std": -0.23115450977179597,
+         "nauc_map_at_20_diff1": 0.03795725531499195,
+         "nauc_map_at_20_max": 0.22396003211648763,
+         "nauc_map_at_20_std": -0.17867373725662639,
+         "nauc_map_at_3_diff1": 0.06042780588964212,
+         "nauc_map_at_3_max": 0.2486807528974488,
+         "nauc_map_at_3_std": -0.18512855007450404,
+         "nauc_map_at_5_diff1": 0.04407217741234605,
+         "nauc_map_at_5_max": 0.22647048266105405,
+         "nauc_map_at_5_std": -0.18107585673560017,
+         "nauc_mrr_at_1000_diff1": 0.033601872249839834,
+         "nauc_mrr_at_1000_max": 0.2523936325136619,
+         "nauc_mrr_at_1000_std": -0.19078164353963076,
+         "nauc_mrr_at_100_diff1": 0.03435870935950355,
+         "nauc_mrr_at_100_max": 0.2523932973431928,
+         "nauc_mrr_at_100_std": -0.1900913512193067,
+         "nauc_mrr_at_10_diff1": 0.03361519179733555,
+         "nauc_mrr_at_10_max": 0.25392922716866984,
+         "nauc_mrr_at_10_std": -0.1935061134919541,
+         "nauc_mrr_at_1_diff1": 0.1772995319079407,
+         "nauc_mrr_at_1_max": 0.35182174117717013,
+         "nauc_mrr_at_1_std": -0.24426280067522707,
+         "nauc_mrr_at_20_diff1": 0.03479828151019169,
+         "nauc_mrr_at_20_max": 0.25624951214228564,
+         "nauc_mrr_at_20_std": -0.19212268093923462,
+         "nauc_mrr_at_3_diff1": 0.06173430027850725,
+         "nauc_mrr_at_3_max": 0.26889485727748363,
+         "nauc_mrr_at_3_std": -0.19153801111553947,
+         "nauc_mrr_at_5_diff1": 0.036743759763164886,
+         "nauc_mrr_at_5_max": 0.253857849052297,
+         "nauc_mrr_at_5_std": -0.19604549670316734,
+         "nauc_ndcg_at_1000_diff1": -0.010372586628261796,
+         "nauc_ndcg_at_1000_max": 0.20925878430027478,
+         "nauc_ndcg_at_1000_std": -0.1717044268161809,
+         "nauc_ndcg_at_100_diff1": 0.0023309149151885546,
+         "nauc_ndcg_at_100_max": 0.20125970115134734,
+         "nauc_ndcg_at_100_std": -0.15865628929382014,
+         "nauc_ndcg_at_10_diff1": 0.0026192804576363727,
+         "nauc_ndcg_at_10_max": 0.19884193622357532,
+         "nauc_ndcg_at_10_std": -0.16919003671988075,
+         "nauc_ndcg_at_1_diff1": 0.1772995319079407,
+         "nauc_ndcg_at_1_max": 0.35182174117717013,
+         "nauc_ndcg_at_1_std": -0.24426280067522707,
+         "nauc_ndcg_at_20_diff1": 0.0031543394811079034,
+         "nauc_ndcg_at_20_max": 0.20925361343315524,
+         "nauc_ndcg_at_20_std": -0.17106125631597793,
+         "nauc_ndcg_at_3_diff1": 0.03670154146101528,
+         "nauc_ndcg_at_3_max": 0.23212930749840155,
+         "nauc_ndcg_at_3_std": -0.1728371812831961,
+         "nauc_ndcg_at_5_diff1": 0.0107566708693031,
+         "nauc_ndcg_at_5_max": 0.20474332948099355,
+         "nauc_ndcg_at_5_std": -0.1734952739301359,
+         "nauc_precision_at_1000_diff1": -0.07195606207962846,
+         "nauc_precision_at_1000_max": 0.2542912736794115,
+         "nauc_precision_at_1000_std": -0.1881459402790264,
+         "nauc_precision_at_100_diff1": -0.04518222914182943,
+         "nauc_precision_at_100_max": 0.22138981394024387,
+         "nauc_precision_at_100_std": -0.13384472263037697,
+         "nauc_precision_at_10_diff1": -0.052513811685878764,
+         "nauc_precision_at_10_max": 0.18962064467698705,
+         "nauc_precision_at_10_std": -0.14827004787357115,
+         "nauc_precision_at_1_diff1": 0.1772995319079407,
+         "nauc_precision_at_1_max": 0.35182174117717013,
+         "nauc_precision_at_1_std": -0.24426280067522707,
+         "nauc_precision_at_20_diff1": -0.040789324913047875,
+         "nauc_precision_at_20_max": 0.22086458009752882,
+         "nauc_precision_at_20_std": -0.14430508663959002,
+         "nauc_precision_at_3_diff1": -0.013044619440245884,
+         "nauc_precision_at_3_max": 0.21285488271783465,
+         "nauc_precision_at_3_std": -0.1483164417030193,
+         "nauc_precision_at_5_diff1": -0.05113181393685194,
+         "nauc_precision_at_5_max": 0.1756649379589832,
+         "nauc_precision_at_5_std": -0.15632134056178232,
+         "nauc_recall_at_1000_diff1": -0.047075752528689695,
+         "nauc_recall_at_1000_max": 0.16414155669676642,
+         "nauc_recall_at_1000_std": -0.1513320281746568,
+         "nauc_recall_at_100_diff1": -0.023004658252697183,
+         "nauc_recall_at_100_max": 0.14861973646512244,
+         "nauc_recall_at_100_std": -0.12240747671934184,
+         "nauc_recall_at_10_diff1": -0.051375323084735164,
+         "nauc_recall_at_10_max": 0.1384336247044034,
+         "nauc_recall_at_10_std": -0.14737738059263306,
+         "nauc_recall_at_1_diff1": 0.1731091343679673,
+         "nauc_recall_at_1_max": 0.33459947679728974,
+         "nauc_recall_at_1_std": -0.23115450977179597,
+         "nauc_recall_at_20_diff1": -0.03578815918976938,
+         "nauc_recall_at_20_max": 0.16386688869593355,
+         "nauc_recall_at_20_std": -0.1528456365862212,
+         "nauc_recall_at_3_diff1": -0.021696811828998432,
+         "nauc_recall_at_3_max": 0.1864107664448688,
+         "nauc_recall_at_3_std": -0.14586036842324565,
+         "nauc_recall_at_5_diff1": -0.0538517948884412,
+         "nauc_recall_at_5_max": 0.1453135254521713,
+         "nauc_recall_at_5_std": -0.1531619473747777,
+         "ndcg_at_1": 0.03082,
+         "ndcg_at_10": 0.0501,
+         "ndcg_at_100": 0.07072,
+         "ndcg_at_1000": 0.09327,
+         "ndcg_at_20": 0.05662,
+         "ndcg_at_3": 0.03989,
+         "ndcg_at_5": 0.04484,
+         "precision_at_1": 0.03082,
+         "precision_at_10": 0.00993,
+         "precision_at_100": 0.00241,
+         "precision_at_1000": 0.00052,
+         "precision_at_20": 0.00685,
+         "precision_at_3": 0.02017,
+         "precision_at_5": 0.0153,
+         "recall_at_1": 0.02467,
+         "recall_at_10": 0.07499,
+         "recall_at_100": 0.16969,
+         "recall_at_1000": 0.33718,
+         "recall_at_20": 0.09901,
+         "recall_at_3": 0.04648,
+         "recall_at_5": 0.05869
+       }
+     ]
+   },
+   "task_name": "CQADupstackProgrammersRetrieval"
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/STSBenchmark.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "dataset_revision": "b0fddb56ed78048fa8b90373c8a3cfc37b684831",
+   "evaluation_time": 0.12331175804138184,
+   "kg_co2_emissions": null,
+   "mteb_version": "1.14.15",
+   "scores": {
+     "test": [
+       {
+         "cosine_pearson": 0.34632056143460516,
+         "cosine_spearman": 0.42973159111999676,
+         "euclidean_pearson": 0.4043313982401531,
+         "euclidean_spearman": 0.42973159111999676,
+         "hf_subset": "default",
+         "languages": [
+           "eng-Latn"
+         ],
+         "main_score": 0.42973159111999676,
+         "manhattan_pearson": 0.511950240807258,
+         "manhattan_spearman": 0.5019330550880601,
+         "pearson": 0.34632056143460516,
+         "spearman": 0.42973159111999676
+       }
+     ]
+   },
+   "task_name": "STSBenchmark"
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/SprintDuplicateQuestions.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "dataset_revision": "d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46",
+   "evaluation_time": 1.9629368782043457,
+   "kg_co2_emissions": null,
+   "mteb_version": "1.14.15",
+   "scores": {
+     "test": [
+       {
+         "cosine_accuracy": 0.9926237623762376,
+         "cosine_accuracy_threshold": 0.9106360077857971,
+         "cosine_ap": 0.4700755863552174,
+         "cosine_f1": 0.4925187032418952,
+         "cosine_f1_threshold": 0.8986777067184448,
+         "cosine_precision": 0.6539735099337748,
+         "cosine_recall": 0.395,
+         "dot_accuracy": 0.9926237623762376,
+         "dot_accuracy_threshold": 0.9106361269950867,
+         "dot_ap": 0.47007548398718707,
+         "dot_f1": 0.4925187032418952,
+         "dot_f1_threshold": 0.8986777663230896,
+         "dot_precision": 0.6539735099337748,
+         "dot_recall": 0.395,
+         "euclidean_accuracy": 0.9926237623762376,
+         "euclidean_accuracy_threshold": 0.42276236414909363,
+         "euclidean_ap": 0.47007558217981027,
+         "euclidean_f1": 0.4925187032418952,
+         "euclidean_f1_threshold": 0.4501606225967407,
+         "euclidean_precision": 0.6539735099337748,
+         "euclidean_recall": 0.395,
+         "hf_subset": "default",
+         "languages": [
+           "eng-Latn"
+         ],
+         "main_score": 0.6386707007383838,
+         "manhattan_accuracy": 0.9939207920792079,
+         "manhattan_accuracy_threshold": 4.824772834777832,
+         "manhattan_ap": 0.6386707007383838,
+         "manhattan_f1": 0.6293103448275862,
+         "manhattan_f1_threshold": 5.194998741149902,
+         "manhattan_precision": 0.6822429906542056,
+         "manhattan_recall": 0.584,
+         "max_accuracy": 0.9939207920792079,
+         "max_ap": 0.6386707007383838,
+         "max_f1": 0.6293103448275862,
+         "max_precision": 0.6822429906542056,
+         "max_recall": 0.584,
+         "similarity_accuracy": 0.9926237623762376,
+         "similarity_accuracy_threshold": 0.9106360077857971,
+         "similarity_ap": 0.4700755863552174,
+         "similarity_f1": 0.4925187032418952,
+         "similarity_f1_threshold": 0.8986777067184448,
+         "similarity_precision": 0.6539735099337748,
+         "similarity_recall": 0.395
+       }
+     ]
+   },
+   "task_name": "SprintDuplicateQuestions"
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/StackExchangeClustering.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "dataset_revision": "6cbc1f7b2bc0622f2e39d2c77fa502909748c259",
+   "evaluation_time": 1075.5739603042603,
+   "kg_co2_emissions": null,
+   "mteb_version": "1.14.15",
+   "scores": {
+     "test": [
+       {
+         "hf_subset": "default",
+         "languages": [
+           "eng-Latn"
+         ],
+         "main_score": 0.2747977935355363,
+         "v_measure": 0.2747977935355363,
+         "v_measure_std": 0.04408138950391278,
+         "v_measures": [
+           0.2671568735697825,
+           0.35324106044655595,
+           0.2134334295678833,
+           0.26069561242914296,
+           0.2360037867112385,
+           0.18352010080864292,
+           0.21227539957559294,
+           0.22564157353303899,
+           0.31014309699664405,
+           0.2792317143409387,
+           0.30736400840236383,
+           0.33654065468328326,
+           0.3375811203083562,
+           0.23635769205347795,
+           0.2889733490218442,
+           0.2628972368553193,
+           0.2892573063858698,
+           0.3093369539018476,
+           0.2778955236652676,
+           0.29489160764728006,
+           0.3092126928451642,
+           0.22100223054084894,
+           0.23711645754707986,
+           0.3264131545037563,
+           0.2937622020471872
+         ]
+       }
+     ]
+   },
+   "task_name": "StackExchangeClustering"
+ }
mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled/model_meta.json ADDED
@@ -0,0 +1 @@
+ {"name": "gte-Qwen2-7B-instruct-M2V-Distilled", "revision": "distilled", "release_date": null, "languages": ["eng"], "n_parameters": null, "memory_usage": null, "max_tokens": null, "embed_dim": null, "license": null, "open_source": null, "similarity_fn_name": null, "framework": [], "loader": null}
mteb_results/mteb_parsed_results.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "gte-Qwen2-7B-instruct-M2V-Distilled": "ResultSet(datasets={'Banking77Classification': DatasetResult(scores=[0.4396103896103896], time=6.451777696609497), 'StackExchangeClustering': DatasetResult(scores=[0.2747977935355363], time=1075.5739603042603), 'STSBenchmark': DatasetResult(scores=[0.42973159111999676], time=0.12331175804138184), 'CQADupstackProgrammersRetrieval': DatasetResult(scores=[0.0501], time=99.69791841506958), 'SprintDuplicateQuestions': DatasetResult(scores=[0.6386707007383838], time=1.9629368782043457)})"
+ }
mteb_results/mteb_raw_results.json ADDED
@@ -0,0 +1,7 @@
+ [
+ "dataset_revision='0fd18e25b25c072e09e0d92ab615fda904d66300' task_name='Banking77Classification' mteb_version='1.14.15' scores={'test': [{'accuracy': 0.4396103896103896, 'f1': 0.4142711532114576, 'f1_weighted': 0.4142711532114576, 'scores_per_experiment': [{'accuracy': 0.4279220779220779, 'f1': 0.4030476288783657, 'f1_weighted': 0.4030476288783656}, {'accuracy': 0.4211038961038961, 'f1': 0.39776168133611584, 'f1_weighted': 0.39776168133611584}, {'accuracy': 0.45064935064935063, 'f1': 0.42872843564828145, 'f1_weighted': 0.42872843564828145}, {'accuracy': 0.4448051948051948, 'f1': 0.420756828398419, 'f1_weighted': 0.42075682839841905}, {'accuracy': 0.44675324675324674, 'f1': 0.42100682221185654, 'f1_weighted': 0.42100682221185654}, {'accuracy': 0.45324675324675323, 'f1': 0.4392342490231314, 'f1_weighted': 0.4392342490231314}, {'accuracy': 0.437012987012987, 'f1': 0.4056017558988273, 'f1_weighted': 0.40560175589882724}, {'accuracy': 0.42337662337662335, 'f1': 0.39123709562594644, 'f1_weighted': 0.39123709562594655}, {'accuracy': 0.44512987012987015, 'f1': 0.41578171494860966, 'f1_weighted': 0.41578171494860966}, {'accuracy': 0.4461038961038961, 'f1': 0.4195553201450221, 'f1_weighted': 0.419555320145022}], 'main_score': 0.4396103896103896, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} evaluation_time=6.451777696609497 kg_co2_emissions=None",
+ "dataset_revision='6cbc1f7b2bc0622f2e39d2c77fa502909748c259' task_name='StackExchangeClustering' mteb_version='1.14.15' scores={'test': [{'v_measure': 0.2747977935355363, 'v_measure_std': 0.04408138950391278, 'v_measures': [0.2671568735697825, 0.35324106044655595, 0.2134334295678833, 0.26069561242914296, 0.2360037867112385, 0.18352010080864292, 0.21227539957559294, 0.22564157353303899, 0.31014309699664405, 0.2792317143409387, 0.30736400840236383, 0.33654065468328326, 0.3375811203083562, 0.23635769205347795, 0.2889733490218442, 0.2628972368553193, 0.2892573063858698, 0.3093369539018476, 0.2778955236652676, 0.29489160764728006, 0.3092126928451642, 0.22100223054084894, 0.23711645754707986, 0.3264131545037563, 0.2937622020471872], 'main_score': 0.2747977935355363, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} evaluation_time=1075.5739603042603 kg_co2_emissions=None",
+ "dataset_revision='b0fddb56ed78048fa8b90373c8a3cfc37b684831' task_name='STSBenchmark' mteb_version='1.14.15' scores={'test': [{'pearson': 0.34632056143460516, 'spearman': 0.42973159111999676, 'cosine_pearson': 0.34632056143460516, 'cosine_spearman': 0.42973159111999676, 'manhattan_pearson': 0.511950240807258, 'manhattan_spearman': 0.5019330550880601, 'euclidean_pearson': 0.4043313982401531, 'euclidean_spearman': 0.42973159111999676, 'main_score': 0.42973159111999676, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} evaluation_time=0.12331175804138184 kg_co2_emissions=None",
+ "dataset_revision='6184bc1440d2dbc7612be22b50686b8826d22b32' task_name='CQADupstackProgrammersRetrieval' mteb_version='1.14.15' scores={'test': [{'ndcg_at_1': 0.03082, 'ndcg_at_3': 0.03989, 'ndcg_at_5': 0.04484, 'ndcg_at_10': 0.0501, 'ndcg_at_20': 0.05662, 'ndcg_at_100': 0.07072, 'ndcg_at_1000': 0.09327, 'map_at_1': 0.02467, 'map_at_3': 0.03388, 'map_at_5': 0.03693, 'map_at_10': 0.03898, 'map_at_20': 0.04068, 'map_at_100': 0.04261, 'map_at_1000': 0.04333, 'recall_at_1': 0.02467, 'recall_at_3': 0.04648, 'recall_at_5': 0.05869, 'recall_at_10': 0.07499, 'recall_at_20': 0.09901, 'recall_at_100': 0.16969, 'recall_at_1000': 0.33718, 'precision_at_1': 0.03082, 'precision_at_3': 0.02017, 'precision_at_5': 0.0153, 'precision_at_10': 0.00993, 'precision_at_20': 0.00685, 'precision_at_100': 0.00241, 'precision_at_1000': 0.00052, 'mrr_at_1': 0.030821917808219176, 'mrr_at_3': 0.04280821917808219, 'mrr_at_5': 0.04634703196347032, 'mrr_at_10': 0.04904462926723201, 'mrr_at_20': 0.05126402659708249, 'mrr_at_100': 0.05339942610218758, 'mrr_at_1000': 0.05413492750157237, 'nauc_ndcg_at_1_max': 0.35182174117717013, 'nauc_ndcg_at_1_std': -0.24426280067522707, 'nauc_ndcg_at_1_diff1': 0.1772995319079407, 'nauc_ndcg_at_3_max': 0.23212930749840155, 'nauc_ndcg_at_3_std': -0.1728371812831961, 'nauc_ndcg_at_3_diff1': 0.03670154146101528, 'nauc_ndcg_at_5_max': 0.20474332948099355, 'nauc_ndcg_at_5_std': -0.1734952739301359, 'nauc_ndcg_at_5_diff1': 0.0107566708693031, 'nauc_ndcg_at_10_max': 0.19884193622357532, 'nauc_ndcg_at_10_std': -0.16919003671988075, 'nauc_ndcg_at_10_diff1': 0.0026192804576363727, 'nauc_ndcg_at_20_max': 0.20925361343315524, 'nauc_ndcg_at_20_std': -0.17106125631597793, 'nauc_ndcg_at_20_diff1': 0.0031543394811079034, 'nauc_ndcg_at_100_max': 0.20125970115134734, 'nauc_ndcg_at_100_std': -0.15865628929382014, 'nauc_ndcg_at_100_diff1': 0.0023309149151885546, 'nauc_ndcg_at_1000_max': 0.20925878430027478, 'nauc_ndcg_at_1000_std': -0.1717044268161809, 'nauc_ndcg_at_1000_diff1': -0.010372586628261796, 'nauc_map_at_1_max': 0.33459947679728974, 'nauc_map_at_1_std': -0.23115450977179597, 'nauc_map_at_1_diff1': 0.1731091343679673, 'nauc_map_at_3_max': 0.2486807528974488, 'nauc_map_at_3_std': -0.18512855007450404, 'nauc_map_at_3_diff1': 0.06042780588964212, 'nauc_map_at_5_max': 0.22647048266105405, 'nauc_map_at_5_std': -0.18107585673560017, 'nauc_map_at_5_diff1': 0.04407217741234605, 'nauc_map_at_10_max': 0.22061594321968936, 'nauc_map_at_10_std': -0.17777470317814356, 'nauc_map_at_10_diff1': 0.03906418656483989, 'nauc_map_at_20_max': 0.22396003211648763, 'nauc_map_at_20_std': -0.17867373725662639, 'nauc_map_at_20_diff1': 0.03795725531499195, 'nauc_map_at_100_max': 0.22324901446317413, 'nauc_map_at_100_std': -0.17630470695891512, 'nauc_map_at_100_diff1': 0.03759221625144172, 'nauc_map_at_1000_max': 0.2240572170754659, 'nauc_map_at_1000_std': -0.17708810912472517, 'nauc_map_at_1000_diff1': 0.03644747951501248, 'nauc_recall_at_1_max': 0.33459947679728974, 'nauc_recall_at_1_std': -0.23115450977179597, 'nauc_recall_at_1_diff1': 0.1731091343679673, 'nauc_recall_at_3_max': 0.1864107664448688, 'nauc_recall_at_3_std': -0.14586036842324565, 'nauc_recall_at_3_diff1': -0.021696811828998432, 'nauc_recall_at_5_max': 0.1453135254521713, 'nauc_recall_at_5_std': -0.1531619473747777, 'nauc_recall_at_5_diff1': -0.0538517948884412, 'nauc_recall_at_10_max': 0.1384336247044034, 'nauc_recall_at_10_std': -0.14737738059263306, 'nauc_recall_at_10_diff1': -0.051375323084735164, 'nauc_recall_at_20_max': 0.16386688869593355, 
'nauc_recall_at_20_std': -0.1528456365862212, 'nauc_recall_at_20_diff1': -0.03578815918976938, 'nauc_recall_at_100_max': 0.14861973646512244, 'nauc_recall_at_100_std': -0.12240747671934184, 'nauc_recall_at_100_diff1': -0.023004658252697183, 'nauc_recall_at_1000_max': 0.16414155669676642, 'nauc_recall_at_1000_std': -0.1513320281746568, 'nauc_recall_at_1000_diff1': -0.047075752528689695, 'nauc_precision_at_1_max': 0.35182174117717013, 'nauc_precision_at_1_std': -0.24426280067522707, 'nauc_precision_at_1_diff1': 0.1772995319079407, 'nauc_precision_at_3_max': 0.21285488271783465, 'nauc_precision_at_3_std': -0.1483164417030193, 'nauc_precision_at_3_diff1': -0.013044619440245884, 'nauc_precision_at_5_max': 0.1756649379589832, 'nauc_precision_at_5_std': -0.15632134056178232, 'nauc_precision_at_5_diff1': -0.05113181393685194, 'nauc_precision_at_10_max': 0.18962064467698705, 'nauc_precision_at_10_std': -0.14827004787357115, 'nauc_precision_at_10_diff1': -0.052513811685878764, 'nauc_precision_at_20_max': 0.22086458009752882, 'nauc_precision_at_20_std': -0.14430508663959002, 'nauc_precision_at_20_diff1': -0.040789324913047875, 'nauc_precision_at_100_max': 0.22138981394024387, 'nauc_precision_at_100_std': -0.13384472263037697, 'nauc_precision_at_100_diff1': -0.04518222914182943, 'nauc_precision_at_1000_max': 0.2542912736794115, 'nauc_precision_at_1000_std': -0.1881459402790264, 'nauc_precision_at_1000_diff1': -0.07195606207962846, 'nauc_mrr_at_1_max': 0.35182174117717013, 'nauc_mrr_at_1_std': -0.24426280067522707, 'nauc_mrr_at_1_diff1': 0.1772995319079407, 'nauc_mrr_at_3_max': 0.26889485727748363, 'nauc_mrr_at_3_std': -0.19153801111553947, 'nauc_mrr_at_3_diff1': 0.06173430027850725, 'nauc_mrr_at_5_max': 0.253857849052297, 'nauc_mrr_at_5_std': -0.19604549670316734, 'nauc_mrr_at_5_diff1': 0.036743759763164886, 'nauc_mrr_at_10_max': 0.25392922716866984, 'nauc_mrr_at_10_std': -0.1935061134919541, 'nauc_mrr_at_10_diff1': 0.03361519179733555, 'nauc_mrr_at_20_max': 0.25624951214228564, 'nauc_mrr_at_20_std': -0.19212268093923462, 'nauc_mrr_at_20_diff1': 0.03479828151019169, 'nauc_mrr_at_100_max': 0.2523932973431928, 'nauc_mrr_at_100_std': -0.1900913512193067, 'nauc_mrr_at_100_diff1': 0.03435870935950355, 'nauc_mrr_at_1000_max': 0.2523936325136619, 'nauc_mrr_at_1000_std': -0.19078164353963076, 'nauc_mrr_at_1000_diff1': 0.033601872249839834, 'main_score': 0.0501, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} evaluation_time=99.69791841506958 kg_co2_emissions=None",
+ "dataset_revision='d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46' task_name='SprintDuplicateQuestions' mteb_version='1.14.15' scores={'test': [{'similarity_accuracy': 0.9926237623762376, 'similarity_accuracy_threshold': 0.9106360077857971, 'similarity_f1': 0.4925187032418952, 'similarity_f1_threshold': 0.8986777067184448, 'similarity_precision': 0.6539735099337748, 'similarity_recall': 0.395, 'similarity_ap': 0.4700755863552174, 'cosine_accuracy': 0.9926237623762376, 'cosine_accuracy_threshold': 0.9106360077857971, 'cosine_f1': 0.4925187032418952, 'cosine_f1_threshold': 0.8986777067184448, 'cosine_precision': 0.6539735099337748, 'cosine_recall': 0.395, 'cosine_ap': 0.4700755863552174, 'manhattan_accuracy': 0.9939207920792079, 'manhattan_accuracy_threshold': 4.824772834777832, 'manhattan_f1': 0.6293103448275862, 'manhattan_f1_threshold': 5.194998741149902, 'manhattan_precision': 0.6822429906542056, 'manhattan_recall': 0.584, 'manhattan_ap': 0.6386707007383838, 'euclidean_accuracy': 0.9926237623762376, 'euclidean_accuracy_threshold': 0.42276236414909363, 'euclidean_f1': 0.4925187032418952, 'euclidean_f1_threshold': 0.4501606225967407, 'euclidean_precision': 0.6539735099337748, 'euclidean_recall': 0.395, 'euclidean_ap': 0.47007558217981027, 'dot_accuracy': 0.9926237623762376, 'dot_accuracy_threshold': 0.9106361269950867, 'dot_f1': 0.4925187032418952, 'dot_f1_threshold': 0.8986777663230896, 'dot_precision': 0.6539735099337748, 'dot_recall': 0.395, 'dot_ap': 0.47007548398718707, 'max_accuracy': 0.9939207920792079, 'max_f1': 0.6293103448275862, 'max_precision': 0.6822429906542056, 'max_recall': 0.584, 'max_ap': 0.6386707007383838, 'main_score': 0.6386707007383838, 'hf_subset': 'default', 'languages': ['eng-Latn']}]} evaluation_time=1.9629368782043457 kg_co2_emissions=None"
+ ]
mteb_results/mteb_report.txt ADDED
@@ -0,0 +1,21 @@
+ ================================================================================
+ MTEB Evaluation Report
+ ================================================================================
+
+ Model: gte-Qwen2-7B-instruct-M2V-Distilled
+ Model Path: .
+ Evaluation Time: 1235.71 seconds
+ Total Datasets: 1
+
+ Summary Statistics:
+   Average Score: 0.0501
+   Median Score: 0.0501
+   Standard Deviation: 0.0000
+   Score Range: 0.0501 - 0.0501
+
+ Detailed Results:
+ --------------------------------------------------
+ Model                                Average (All)  Average (MTEB)  Classification  Clustering  PairClassification  Reranking  Retrieval  STS  Summarization  PEARL  WordSim
+ gte-Qwen2-7B-instruct-M2V-Distilled  nan            nan             nan             nan         nan                 nan        5.01       nan  nan            nan    nan
+
+ ================================================================================
mteb_results/mteb_summary.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "model_name": "gte-Qwen2-7B-instruct-M2V-Distilled",
+   "evaluation_time_seconds": 1235.7057559490204,
+   "task_scores": {
+     "gte-Qwen2-7B-instruct-M2V-Distilled": {
+       "task_means": "Classification NaN\nClustering NaN\nPairClassification NaN\nReranking NaN\nRetrieval 0.0501\nSTS NaN\nSummarization NaN\nPEARL NaN\nWordSim NaN\ndtype: float64",
+       "dataset_scores": {
+         "CQADupstack": 0.0501
+       }
+     }
+   },
+   "summary_stats": {
+     "total_datasets": 1,
+     "average_score": 0.0501,
+     "median_score": 0.0501,
+     "std_dev": 0.0,
+     "min_score": 0.0501,
+     "max_score": 0.0501
+   }
+ }
pipeline.skops ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd33bcb8eee34a4df1a0d5e7d22b1e2b241ea683750204be74f78055882c76c3
+ size 3843639
pyproject.toml ADDED
@@ -0,0 +1,101 @@
+ [project]
+ name = "gte-qwen2-7b-instruct-m2v"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.12"
+ dependencies = [
+     "datasets>=3.6.0",
+     "evaluation",
+     "iso639>=0.1.4",
+     "lightning>=2.5.1.post0",
+     "matplotlib>=3.10.3",
+     "model2vec[train]>=0.5.0",
+     "mteb>=1.14.15",
+     "psutil>=7.0.0",
+     "scikit-learn>=1.6.1",
+     "sentence-transformers>=4.1.0",
+     "torch>=2.7.0",
+ ]
+
+ [dependency-groups]
+ dev = [
+     "mypy>=1.15.0",
+     "ruff>=0.11.6",
+ ]
+
+ [tool.mypy]
+ exclude = [
+     ".git",
+     ".ruff_cache",
+     ".venv",
+     "venv",
+     "__pycache__",
+     "build",
+     "dist",
+     "vendor",
+ ]
+ follow_untyped_imports = true
+
+ [tool.ruff]
+ line-length = 120
+ target-version = "py312"
+
+ # Exclude files/directories
+ exclude = [
+     ".git",
+     ".ruff_cache",
+     ".venv",
+     "venv",
+     "__pycache__",
+     "build",
+     "dist",
+     "vendor"
+ ]
+
+ [tool.ruff.lint]
+ # Enable all rules by default, then selectively disable
+ select = ["ALL"]
+ ignore = [
+     # Rules that conflict with other tools/preferences
+     "D203", # one-blank-line-before-class
+     "D212", # multi-line-summary-first-line
+     "FBT001", # Boolean positional arg in function definition (required for typer)
+     "FBT002", # Boolean default value in function definition (required for typer)
+     "C901", # function too complex
+     "PLR0911", # too many return statements
+     "PLR0912", # too many branches
+     "PLR0913", # too many arguments in function definition
+     "PLR0915", # too many statements
+     "TRY300", # Consider moving this statement to an `else` block
+     "COM812", # Missing trailing comma (conflicts with the ruff formatter)
+     "TC001", # Move application import into a type-checking block
+     "ERA001", # Found commented-out code
+     "G004", # Logging statement uses f-string
+     "TD003", # Missing link in to-do
+     "TRY301", # Abstract raise to an inner function
+     # Disable rules that conflict with tab indentation
+     "E101", # Indentation contains mixed spaces and tabs
+     "W191", # indentation contains tabs
+     "D206", # indent with spaces, not tabs
+ ]
+
+ [tool.ruff.lint.mccabe]
+ max-complexity = 10
+
+ [tool.ruff.lint.pylint]
+ max-args = 5
+ max-branches = 12
+ max-statements = 50
+
+ [tool.ruff.lint.pydocstyle]
+ convention = "google"
+
+ [tool.ruff.format]
+ quote-style = "double"
+ indent-style = "tab"
+ skip-magic-trailing-comma = false
+ line-ending = "auto"
+
+ [tool.uv.sources]
+ evaluation = { git = "https://github.com/MinishLab/evaluation.git", rev = "main" }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e76d728582b9955c7afa6e5757b5b7825b3d40ef49d935b3cb7b148ad556dce4
+ size 11418179
train_code_classification.py ADDED
@@ -0,0 +1,365 @@
+ #!/usr/bin/env python
+ """
+ Script to train a code classification model using CodeSearchNet dataset with Model2Vec.
+
+ This script performs the following operations:
+ 1. Downloads the Alibaba-NLP/gte-Qwen2-7B-instruct model
+ 2. Optionally distills it using Model2Vec to create a smaller, faster static model
+ 3. Trains a programming language classifier on CodeSearchNet dataset
+ 4. Evaluates the classifier and saves the trained model
+
+ Based on the official CodeSearchNet dataset: https://github.com/github/CodeSearchNet
+ """
+
+ import json
+ import logging
+ import re
+ import time
+ from pathlib import Path
+ from time import perf_counter
+ from typing import Any, cast
+
+ from datasets import Dataset, DatasetDict, load_dataset  # type: ignore [import]
+ from model2vec.distill import distill
+ from model2vec.train import StaticModelForClassification
+
+ # =============================================================================
+ # CONFIGURATION CONSTANTS
+ # =============================================================================
+
+ # Model Configuration
+ MODEL_NAME = "Alibaba-NLP/gte-Qwen2-7B-instruct"  # Source model to distill
+ OUTPUT_DIR = "."  # Directory to save the trained model
+
+ # Distillation Configuration
+ SKIP_DISTILLATION = False  # Set to True to skip distillation and use existing model
+ DISTILLED_MODEL_PATH = "."  # Path to existing distilled model (if skipping distillation)
+ PCA_DIMS = 256  # Dimensions for PCA reduction (smaller = faster but less accurate)
+
+ # Dataset Configuration
+ DATASET_NAME = "code-search-net/code_search_net"  # CodeSearchNet dataset
+ CLASSIFICATION_TASK = "language"  # Task: classify programming language
+ MAX_SAMPLES_PER_LANGUAGE = 5000  # Limit samples per language for balanced training
+ MIN_CODE_LENGTH = 50  # Minimum code length in characters
+ MAX_CODE_LENGTH = 2000  # Maximum code length in characters (for memory efficiency)
+
+ # Text processing constants
+ MAX_COMMENT_LENGTH = 200  # Maximum length for comment lines before truncation
+
+ # Training Configuration
+ MAX_EPOCHS = 30  # Maximum number of training epochs
+ PATIENCE = 5  # Early stopping patience
+ BATCH_SIZE = 32  # Training batch size
+ LEARNING_RATE = 1e-3  # Learning rate
+
+ # Saving Configuration
+ SAVE_PIPELINE = True  # Save as scikit-learn compatible pipeline
+ SAVE_TO_HUB = False  # Whether to push the model to HuggingFace Hub
+ HUB_MODEL_ID = None  # Model ID for HuggingFace Hub (if saving to hub)
+
+ # =============================================================================
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+
+
+ def clean_code_text(code: str) -> str:
+     """Clean and normalize code text for better classification."""
+     if not code:
+         return ""
+
+     # Remove excessive whitespace while preserving structure
+     code = re.sub(r"\n\s*\n\s*\n", "\n\n", code)  # Remove multiple empty lines
+     code = re.sub(r" +", " ", code)  # Replace multiple spaces with single space
+
+     # Remove very long comments that might bias classification
+     lines = code.split("\n")
+     cleaned_lines = []
+     for original_line in lines:
+         line = original_line
+         # Keep comment lines but limit their length
+         if line.strip().startswith(("#", "//", "/*", "*", "--")) and len(line) > MAX_COMMENT_LENGTH:
+             line = line[:MAX_COMMENT_LENGTH] + "..."
+         cleaned_lines.append(line)
+
+     return "\n".join(cleaned_lines)
+
+
+ def load_codesearchnet_dataset() -> tuple[Dataset, Dataset, str, str]:
+     """Load and preprocess the CodeSearchNet dataset for programming language classification."""
+     logger.info("Loading CodeSearchNet dataset...")
+
+     try:
+         # Load the dataset with trust_remote_code=True
+         logger.info("Downloading and loading CodeSearchNet data...")
+         ds = cast(
+             "DatasetDict",
+             load_dataset(
+                 DATASET_NAME,
+                 trust_remote_code=True,
+                 # Load a reasonable sample for training
+             ),
+         )
+
+         logger.info(f"Available splits: {list(ds.keys())}")
+
+         # Use train/test splits if available, otherwise split the data
+         if "train" in ds and "test" in ds:
+             train_dataset = ds["train"]
+             test_dataset = ds["test"]
+         elif "train" in ds:
+             # Split the train set
+             split_ds = ds["train"].train_test_split(test_size=0.2, seed=42)
+             train_dataset = split_ds["train"]
+             test_dataset = split_ds["test"]
+         else:
+             # Use all data and split
+             all_data = ds[next(iter(ds.keys()))]
+             split_ds = all_data.train_test_split(test_size=0.2, seed=42)
+             train_dataset = split_ds["train"]
+             test_dataset = split_ds["test"]
+
+         logger.info(f"Raw dataset sizes - Train: {len(train_dataset)}, Test: {len(test_dataset)}")
+
+         # Filter and preprocess the data
+         def filter_and_clean(dataset: Dataset) -> Dataset:
+             # Filter examples with valid code and language
+             filtered = dataset.filter(
+                 lambda x: (
+                     x["func_code_string"] is not None
+                     and x["language"] is not None
+                     and len(x["func_code_string"]) >= MIN_CODE_LENGTH
+                     and len(x["func_code_string"]) <= MAX_CODE_LENGTH
+                     and x["language"] in ["python", "java", "javascript", "go", "php", "ruby"]
+                 )
+             )
+
+             # Balance the dataset by limiting samples per language
+             if len(filtered) > MAX_SAMPLES_PER_LANGUAGE * 6:  # 6 languages
+                 # Group by language and sample
+                 language_samples: dict[str, list[dict[str, Any]]] = {}
+                 for example in filtered:
+                     lang = example["language"]
+                     if lang not in language_samples:
+                         language_samples[lang] = []
+                     if len(language_samples[lang]) < MAX_SAMPLES_PER_LANGUAGE:
+                         language_samples[lang].append(example)
+
+                 # Combine all samples
+                 balanced_examples = []
+                 for lang_examples in language_samples.values():
+                     balanced_examples.extend(lang_examples)
+
+                 # Convert back to dataset format
+                 if balanced_examples:
+                     filtered = Dataset.from_list(balanced_examples)
+
+             # Clean the code text
+             def clean_example(example: dict[str, Any]) -> dict[str, Any]:
+                 example["func_code_string"] = clean_code_text(example["func_code_string"])
+                 return example
+
+             return filtered.map(clean_example)
+
+         train_dataset = filter_and_clean(train_dataset)
+         test_dataset = filter_and_clean(test_dataset)
+
+         logger.info(f"Filtered dataset sizes - Train: {len(train_dataset)}, Test: {len(test_dataset)}")
+
+         # Show language distribution
+         if len(train_dataset) > 0:
+             from collections import Counter
+
+             train_lang_dist = Counter(train_dataset["language"])
+             test_lang_dist = Counter(test_dataset["language"])
+             logger.info(f"Training language distribution: {dict(train_lang_dist)}")
+             logger.info(f"Test language distribution: {dict(test_lang_dist)}")
+
+         return train_dataset, test_dataset, "func_code_string", "language"
+
+     except Exception:
+         logger.exception("Error loading CodeSearchNet dataset")
+         raise
+
+
+ def main() -> None:
+     """Run the code classification training pipeline."""
+     # Create output directory if it doesn't exist
+     output_dir = Path(OUTPUT_DIR)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     logger.info(f"Starting CodeSearchNet code classification pipeline for {MODEL_NAME}")
+     logger.info(f"Classification task: {CLASSIFICATION_TASK}")
+     logger.info(f"Trained model will be saved to {output_dir}")
+
+     # Record start time for benchmarking
+     total_start_time = time.time()
+
+     try:
+         # Step 1: Get the static model (either distill or load existing)
+         static_model = None
+
+         if SKIP_DISTILLATION:
+             if DISTILLED_MODEL_PATH:
+                 logger.info(f"Loading existing distilled model from {DISTILLED_MODEL_PATH}")
+                 # Note: We'll create the classifier from pretrained instead
+             else:
+                 logger.error("DISTILLED_MODEL_PATH must be specified when SKIP_DISTILLATION is True")
+                 return
+         else:
+             logger.info("Starting Model2Vec distillation...")
+             distill_start_time = time.time()
+
+             static_model = distill(
+                 model_name=MODEL_NAME,
+                 pca_dims=PCA_DIMS,
+             )
+
+             distill_time = time.time() - distill_start_time
+             logger.info(f"Distillation completed in {distill_time:.2f} seconds")
+
+         # Step 2: Create the classifier
+         logger.info("Creating classifier...")
+
+         if static_model is not None:
+             # From a distilled model
+             classifier = StaticModelForClassification.from_static_model(model=static_model)
+         else:
+             # From a pre-trained model path
+             classifier = StaticModelForClassification.from_pretrained(model_name=DISTILLED_MODEL_PATH)
+
+         # Step 3: Load the CodeSearchNet dataset
+         train_dataset, test_dataset, text_column, label_column = load_codesearchnet_dataset()
+
+         if len(train_dataset) == 0 or len(test_dataset) == 0:
+             logger.error("No valid data found after filtering. Please check dataset configuration.")
+             return
+
+         logger.info(f"Training dataset size: {len(train_dataset)}")
+         logger.info(f"Test dataset size: {len(test_dataset)}")
+
+         # Get unique languages for reference
+         unique_languages = sorted(set(train_dataset[label_column]))
+         logger.info(f"Programming languages to classify: {unique_languages}")
+
+         # Step 4: Train the classifier
+         logger.info("Starting training...")
+         train_start_time = perf_counter()
+
+         classifier = classifier.fit(
+             train_dataset[text_column],
+             train_dataset[label_column],
+             max_epochs=MAX_EPOCHS,
+             batch_size=BATCH_SIZE,
+             learning_rate=LEARNING_RATE,
+             early_stopping_patience=PATIENCE,
+         )
+
+         train_time = perf_counter() - train_start_time
+         logger.info(f"Training completed in {int(train_time)} seconds")
+
+         # Step 5: Evaluate the classifier
+         logger.info("Evaluating classifier...")
+         eval_start_time = perf_counter()
+
+         classification_report = classifier.evaluate(test_dataset[text_column], test_dataset[label_column])
+
+         eval_time = perf_counter() - eval_start_time
+         logger.info(f"Evaluation completed in {int(eval_time * 1000)} milliseconds")
+         logger.info(f"Classification results:\n{classification_report}")
+
+         # Step 6: Test with some examples
+         logger.info("Testing with sample code snippets...")
+
+         # Test examples for different languages
+         test_examples = [
+             'def hello_world():\n print("Hello, World!")\n return True',  # Python
+             (
+                 "public class HelloWorld {\n"
+                 " public static void main(String[] args) {\n"
+                 ' System.out.println("Hello, World!");\n'
+                 " }\n"
+                 "}"
+             ),  # Java
+             'function helloWorld() {\n console.log("Hello, World!");\n return true;\n}',  # JavaScript
+             'package main\n\nimport "fmt"\n\nfunc main() {\n fmt.Println("Hello, World!")\n}',  # Go
+             '<?php\nfunction hello_world() {\n echo "Hello, World!";\n return true;\n}\n?>',  # PHP
+             'def hello_world\n puts "Hello, World!"\n true\nend',  # Ruby
+         ]
+
+         predictions = classifier.predict(test_examples)
+         for i, (code, pred) in enumerate(zip(test_examples, predictions, strict=False)):
+             logger.info(f"Example {i + 1}: {pred}")
+             logger.info(f"Code snippet: {code[:100]}...")
+
+         # Step 7: Benchmark inference speed
+         logger.info("Benchmarking inference speed...")
+         inference_start_time = perf_counter()
+         _ = classifier.predict(test_dataset[text_column][:100])  # Test on first 100 samples
+         inference_time = perf_counter() - inference_start_time
+         logger.info(f"Inference took {int(inference_time * 1000)} milliseconds for 100 code snippets on CPU")
+
+         # Step 8: Save the model
+         if SAVE_PIPELINE:
+             logger.info("Converting to scikit-learn pipeline...")
+             pipeline = classifier.to_pipeline()
+
+             # Save locally
+             pipeline_path = output_dir / "pipeline"
+             pipeline.save_pretrained(str(pipeline_path))
+             logger.info(f"Pipeline saved to {pipeline_path}")
+
+             # Save additional metadata
+             metadata = {
+                 "model_name": MODEL_NAME,
+                 "dataset": DATASET_NAME,
+                 "task": "programming_language_classification",
+                 "languages": unique_languages,
+                 "pca_dims": PCA_DIMS,
+                 "train_samples": len(train_dataset),
+                 "test_samples": len(test_dataset),
+             }
+
+             metadata_path = output_dir / "metadata.json"
+             with metadata_path.open("w") as f:
+                 json.dump(metadata, f, indent=2)
+             logger.info("Metadata saved to metadata.json")
+
+             # Push to hub if requested
+             if SAVE_TO_HUB and HUB_MODEL_ID:
+                 logger.info(f"Pushing pipeline to HuggingFace Hub as {HUB_MODEL_ID}")
+                 pipeline.push_to_hub(HUB_MODEL_ID)
+         else:
+             # Save the classifier directly
+             classifier_path = output_dir / "classifier"
+             classifier_path.mkdir(exist_ok=True)
+
+             # Note: StaticModelForClassification might not have save_pretrained
+             # We'll save the underlying static model and create instructions
+             if static_model is not None:
+                 static_model.save_pretrained(str(classifier_path / "static_model"))
+
+             logger.info(f"Classifier components saved to {classifier_path}")
+
+         # Summary
+         total_time = time.time() - total_start_time
+         logger.info("=" * 60)
+         logger.info("CODE CLASSIFICATION TRAINING COMPLETED SUCCESSFULLY!")
+         logger.info(f"Total time: {total_time:.2f} seconds")
+         if not SKIP_DISTILLATION:
+             logger.info(f"Distillation time: {distill_time:.2f} seconds")
+         logger.info(f"Training time: {int(train_time)} seconds")
+         logger.info(f"Dataset: {DATASET_NAME}")
+         logger.info("Task: Programming Language Classification")
+         logger.info(f"Languages: {', '.join(unique_languages)}")
+         logger.info(f"Model saved to: {output_dir}")
+         logger.info("=" * 60)
+
+     except Exception:
+         logger.exception("Error during code classification training pipeline")
+         raise
+
+
+ if __name__ == "__main__":
+     main()
uv.lock ADDED