File size: 2,342 Bytes
634c39b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import mteb
from mteb.encoder_interface import PromptType
from sentence_transformers import SentenceTransformer, models
import numpy as np
import torch
import os
import math

# Path handed to SentenceTransformer below. "./" is only a placeholder; replace
# it with the location of your own model before running the script.
model_save_path = "./" #REPLACE WITH YOUR PATH

# Load the SentenceTransformer model from model_save_path. This executes at
# module top level, so the path must be valid as soon as the file is run.
model = SentenceTransformer(model_save_path)

# -------- Step 3: Define Custom Model Interface for MTEB --------
class CustomModel:
    """Thin adapter that encodes sentences with a wrapped model in fixed-size batches.

    The wrapped ``model`` only needs an ``encode(list_of_str, convert_to_tensor=True)``
    method. If it also provides ``get_sentence_embedding_dimension()``, that value
    is used to shape the result returned for empty input.
    """

    def __init__(self, model):
        # The underlying encoder; every call to ``encode`` delegates to it.
        self.model = model

    def encode(
        self,
        sentences,
        task_name: str,
        prompt_type=None,
        max_batch_size: int = 32,  # Set default max batch size
        **kwargs
    ) -> np.ndarray:
        """
        Encode the given sentences in batches of at most ``max_batch_size``.

        Args:
            sentences: Iterable of items to encode; each item is converted
                with ``str()`` before being passed to the wrapped model.
            task_name (str): The name of the task. Accepted for interface
                compatibility; not used by this implementation.
            prompt_type (Optional[PromptType]): The prompt type. Accepted for
                interface compatibility; not used by this implementation.
            max_batch_size (int): The maximum number of sentences sent to the
                wrapped model in a single call. Must be at least 1.
            **kwargs: Accepted for interface compatibility; deliberately NOT
                forwarded to the wrapped model.

        Returns:
            np.ndarray: One row per input sentence, in input order. For empty
            input an array with zero rows is returned.

        Raises:
            ValueError: If ``max_batch_size`` is smaller than 1.
        """
        if max_batch_size < 1:
            # Fail early with a clear message instead of a ZeroDivisionError
            # (0) or an empty-stack error (negative values).
            raise ValueError(
                f"max_batch_size must be a positive integer, got {max_batch_size!r}"
            )

        texts = [str(sentence) for sentence in sentences]
        if not texts:
            # np.vstack rejects an empty list, so handle "nothing to encode"
            # explicitly.
            return self._empty_embeddings()

        chunks = []
        for start in range(0, len(texts), max_batch_size):
            batch = texts[start:start + max_batch_size]
            batch_embeddings = self.model.encode(batch, convert_to_tensor=True)

            # Some encoders may hand back arrays/lists despite
            # convert_to_tensor=True; normalise so the conversion below works.
            if not isinstance(batch_embeddings, torch.Tensor):
                batch_embeddings = torch.tensor(batch_embeddings)

            # detach() is a no-op for tensors without grad and keeps .numpy()
            # from failing if the encoder returned a grad-tracking tensor.
            chunks.append(batch_embeddings.detach().cpu().numpy())

        return np.vstack(chunks)

    def _empty_embeddings(self) -> np.ndarray:
        """Return a zero-row float32 array, using the model's dimension if it exposes one."""
        dim_getter = getattr(self.model, "get_sentence_embedding_dimension", None)
        dim = dim_getter() if callable(dim_getter) else None
        # Fall back to zero columns when the dimension is unknown or None.
        return np.empty((0, dim or 0), dtype=np.float32)



# Wrap the loaded SentenceTransformer model in CustomModel; the wrapper object
# is what gets passed to the evaluation run below.
custom_model = CustomModel(model)

# Select the MTEB tasks to evaluate: the benchmark registered under the name
# "MTEB(eng, classic)".
tasks = mteb.get_benchmark("MTEB(eng, classic)") 

# Initialize the evaluation framework with the selected tasks
evaluation = mteb.MTEB(tasks=tasks)

# Run the evaluation on the wrapped model; "results/model_results" is passed
# as the output folder and the return value is kept in `results`.
results = evaluation.run(custom_model, output_folder="results/model_results")