AIR-Bench (Hugging Face Space)

Commit 8b7a945 by nan (1 parent: 57ca843)

feat: adapt the data loading part

.gitignore CHANGED
@@ -11,3 +11,7 @@ eval-results/
 eval-queue-bk/
 eval-results-bk/
 logs/
+
+.idea/
+.venv/
+toys/
app.py CHANGED
@@ -49,11 +49,11 @@ raw_data, original_df = get_leaderboard_df(
     EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+# (
+#     finished_eval_queue_df,
+#     running_eval_queue_df,
+#     pending_eval_queue_df,
+# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
 # Searching and filtering
src/about.py CHANGED
@@ -1,19 +1,36 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
-    benchmark: str
-    metric: str
-    col_name: str
+    name: str  # qa, long_doc
+
+
+@dataclass
+class Metric:
+    name: str  # ndcg_at_1
+
+
+@dataclass
+class Language:
+    name: str  # en, zh
+
+
+@dataclass
+class Domain:
+    name: str  # law, wiki
+
+
+@dataclass
+class EmbeddingModel:
+    full_name: str  # jinaai/jina-embeddings-v2-en-base
+    org: str  # jinaai
+    model: str  # jina-embeddings-v2-en-base
+    size: int  # size (millions of parameters)
+    dim: int  # output dimensions
+    max_tokens: int  # max tokens
+    model_type: str  # open, proprietary, sentence transformers
 
 
-# Select your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
 
 NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------
src/benchmarks.py ADDED
@@ -0,0 +1,131 @@
+from dataclasses import dataclass
+from enum import Enum
+
+
+def get_safe_name(name: str):
+    """Get RFC 1123 compatible safe name"""
+    name = name.replace('-', '_')
+    return ''.join(
+        character.lower()
+        for character in name
+        if (character.isalnum() or character == '_'))
+
+
+dataset_dict = {
+    "qa": {
+        "wiki": {
+            "en": ["wikipedia_20240101", ],
+            "zh": ["wikipedia_20240101", ]
+        },
+        "web": {
+            "en": ["mC4", ],
+            "zh": ["mC4", ]
+        },
+        "news": {
+            "en": ["CC-News", ],
+            "zh": ["CC-News", ]
+        },
+        "health": {
+            "en": ["PubMedQA", ],
+            "zh": ["Huatuo-26M", ]
+        },
+        "law": {
+            "en": ["pile-of-law", ],
+            "zh": ["flk_npc_gov_cn", ]
+        },
+        "finance": {
+            "en": ["Reuters-Financial", ],
+            "zh": ["FinCorpus", ]
+        },
+        "arxiv": {
+            "en": ["Arxiv", ]},
+    },
+    "long_doc": {
+        "arxiv": {
+            "en": ["gpt-3", "llama2", "llm-survey", "gemini"],
+        },
+        "book": {
+            "en": [
+                "origin-of-species_darwin",
+                "a-brief-history-of-time_stephen-hawking"
+            ]
+        },
+        "healthcare": {
+            "en": [
+                "pubmed_100K-200K_1",
+                "pubmed_100K-200K_2",
+                "pubmed_100K-200K_3",
+                "pubmed_40K-50K_5-merged",
+                "pubmed_30K-40K_10-merged"
+            ]
+        },
+        "law": {
+            "en": [
+                "lex_files_300K-400K",
+                "lex_files_400K-500K",
+                "lex_files_500K-600K",
+                "lex_files_600K-700K"
+            ]
+        }
+    }
+}
+
+metric_list = [
+    "ndcg_at_1",
+    "ndcg_at_3",
+    "ndcg_at_5",
+    "ndcg_at_10",
+    "ndcg_at_100",
+    "ndcg_at_1000",
+    "map_at_1",
+    "map_at_3",
+    "map_at_5",
+    "map_at_10",
+    "map_at_100",
+    "map_at_1000",
+    "recall_at_1",
+    "recall_at_3",
+    "recall_at_5",
+    "recall_at_10",
+    "recall_at_100",
+    "recall_at_1000",
+    "precision_at_1",
+    "precision_at_3",
+    "precision_at_5",
+    "precision_at_10",
+    "precision_at_100",
+    "precision_at_1000",
+    "mrr_at_1",
+    "mrr_at_3",
+    "mrr_at_5",
+    "mrr_at_10",
+    "mrr_at_100",
+    "mrr_at_1000"
+]
+
+
+@dataclass
+class Benchmark:
+    name: str  # [task]_[domain]_[language]_[metric], task_key in the json file
+    metric: str  # ndcg_at_1, metric_key in the json file
+    col_name: str  # [domain]_[language], name to display in the leaderboard
+
+benchmark_dict = {}
+for task, domain_dict in dataset_dict.items():
+    for domain, lang_dict in domain_dict.items():
+        for lang, dataset_list in lang_dict.items():
+            if task == "qa":
+                benchmark_name = f"{task}_{domain}_{lang}"
+                benchmark_name = get_safe_name(benchmark_name)
+                col_name = f"{domain}_{lang}"
+                for metric in dataset_list:
+                    benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
+            elif task == "long_doc":
+                for dataset in dataset_list:
+                    col_name = f"{domain}_{lang}_{dataset}"
+                    for metric in metric_list:
+                        benchmark_name = f"{task}_{domain}_{lang}_{dataset}_{metric}"
+                        benchmark_name = get_safe_name(benchmark_name)
+                        benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
+
+Benchmarks = Enum('Benchmarks', benchmark_dict)
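For orientation (not part of the commit): a minimal sketch of how the generated Benchmarks members could be inspected, assuming the repository root is on PYTHONPATH and src/benchmarks.py is as added above.

    from src.benchmarks import Benchmarks, get_safe_name

    # get_safe_name lower-cases and keeps only alphanumerics and underscores
    assert get_safe_name("lex_files_500K-600K") == "lex_files_500k_600k"

    # Each member's value is a Benchmark dataclass carrying the metric and display column
    member = Benchmarks["long_doc_law_en_lex_files_500k_600k_ndcg_at_10"]
    print(member.value.metric)    # "ndcg_at_10"
    print(member.value.col_name)  # "law_en_lex_files_500K-600K"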
src/display/utils.py CHANGED
@@ -1,9 +1,7 @@
 from dataclasses import dataclass, make_dataclass
-from enum import Enum
 
-import pandas as pd
+from src.benchmarks import Benchmarks
 
-from src.about import Tasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -11,7 +9,7 @@ def fields(raw_class):
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
-# when a modif is needed
+# when a modification is needed
 @dataclass
 class ColumnContent:
     name: str
@@ -20,116 +18,40 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(
+    ["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)]
+)
+auto_eval_column_dict.append(
+    ["reranking_model", ColumnContent, ColumnContent("Reranking Model", "markdown", True, never_hidden=True)]
+)
+auto_eval_column_dict.append(
+    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]
+)
+for benchmark in Benchmarks:
+    auto_eval_column_dict.append(
+        [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
+    )
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = ""  # emoji
-
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    float32 = ModelDetails("float32")
-    #qt_8bit = ModelDetails("8bit")
-    #qt_4bit = ModelDetails("4bit")
-    #qt_GPTQ = ModelDetails("GPTQ")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        if precision in ["float32"]:
-            return Precision.float32
-        #if precision in ["8bit"]:
-        #    return Precision.qt_8bit
-        #if precision in ["4bit"]:
-        #    return Precision.qt_4bit
-        #if precision in ["GPTQ", "None"]:
-        #    return Precision.qt_GPTQ
-        return Precision.Unknown
 
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
-TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
-NUMERIC_INTERVALS = {
-    "?": pd.Interval(-1, 0, closed="right"),
-    "~1.5": pd.Interval(0, 2, closed="right"),
-    "~3": pd.Interval(2, 4, closed="right"),
-    "~7": pd.Interval(4, 9, closed="right"),
-    "~13": pd.Interval(9, 20, closed="right"),
-    "~35": pd.Interval(20, 45, closed="right"),
-    "~60": pd.Interval(45, 70, closed="right"),
-    "70+": pd.Interval(70, 10000, closed="right"),
-}
+BENCHMARK_COLS = [t.value.col_name for t in Benchmarks]
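As a quick orientation (not part of the commit): the leaderboard columns are now generated from the Benchmarks enum, so a rough sanity check could look like the sketch below, assuming the repository root is on PYTHONPATH.

    from src.display.utils import AutoEvalColumn, COLS, BENCHMARK_COLS

    # The class-level defaults of the dynamically built dataclass are ColumnContent instances
    print(AutoEvalColumn.retrieval_model.name)   # "Retrieval Model"
    print(AutoEvalColumn.reranking_model.name)   # "Reranking Model"
    # One display column per generated benchmark; exact counts depend on dataset_dict and metric_list
    print(len(COLS), len(BENCHMARK_COLS))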
src/leaderboard/read_evals.py CHANGED
@@ -1,196 +1,171 @@
 import glob
 import json
-import math
-import os
+import os.path
 from dataclasses import dataclass
+from typing import List
 
-import dateutil
-import numpy as np
+import dateutil.parser._parser
 
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub
+from src.display.utils import AutoEvalColumn
+from src.benchmarks import get_safe_name
 
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
+    """Full evaluation result of a single embedding model
     """
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
-    model: str
-    revision: str  # commit hash, "" if main
-    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""  # submission date of request file
-    still_on_hub: bool = False
+    eval_name: str  # name of the evaluation, [retrieval_model]_[reranking_model]_[metric]
+    retrieval_model: str
+    reranking_model: str
+    results: list  # results on all the benchmarks over different domains, languages, and datasets. Use benchmark.name as the key
+    task: str
+    metric: str
+    timestamp: str = ""  # submission timestamp
 
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        config = data.get("config")
 
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
+@dataclass
+class FullEvalResult:
+    eval_name: str  # name of the evaluation, [retrieval_model]_[reranking_model]
+    retrieval_model: str
+    reranking_model: str
+    results: List[EvalResult]  # results on all the EvalResults over different tasks and metrics.
+    date: str = ""
 
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+    @classmethod
+    def init_from_json_file(cls, json_filepath):
+        """Initiate from the result json file for a single model.
+        The json file will be written only when the status is FINISHED.
+        """
+        with open(json_filepath) as fp:
+            model_data = json.load(fp)
+
+        # store all the results for different metrics and tasks
+        result_list = []
+        for item in model_data:
+            config = item.get("config", {})
+            # eval results for different metrics
+            results = item.get("results", [])
+            eval_result = EvalResult(
+                eval_name=f"{config['retrieval_model']}_{config['reranking_model']}_{config['metric']}",
+                retrieval_model=config["retrieval_model"],
+                reranking_model=config["reranking_model"],
+                results=results,
+                task=config["task"],
+                metric=config["metric"]
+            )
+            result_list.append(eval_result)
+        return cls(
+            eval_name=f"{result_list[0].retrieval_model}_{result_list[0].reranking_model}",
+            retrieval_model=result_list[0].retrieval_model,
+            reranking_model=result_list[0].reranking_model,
+            results=result_list
         )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
 
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
+    def to_dict(self, task='qa', metric='ndcg_at_1'):
+        """Convert FullEvalResult to a list of dict compatible with our dataframe UI
+        """
+        results = []
+        for eval_result in self.results:
+            if eval_result.metric != metric:
+                continue
+            if eval_result.task != task:
+                continue
+            data_dict = {
+                "eval_name": eval_result.eval_name,
+                AutoEvalColumn.retrieval_model.name: self.retrieval_model,
+                AutoEvalColumn.reranking_model.name: self.reranking_model,
+            }
+            for result in eval_result.results:
+                # add result for each domain, language, and dataset
+                domain = result["domain"]
+                lang = result["lang"]
+                dataset = result["dataset"]
+                value = result["value"]
+                if task == 'qa':
+                    benchmark_name = f"{task}_{domain}_{lang}"
+                elif task == 'long_doc':
+                    benchmark_name = f"{task}_{domain}_{lang}_{dataset}_{metric}"
+                data_dict[get_safe_name(benchmark_name)] = value
+            results.append(data_dict)
+        return results
+
+    def update_with_request_file(self, request_path):
+        """
+        Update the request file
+        """
+        request_file = get_request_file_for_model(
+            request_path, self.retrieval_model, self.reranking_model
         )
 
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
-
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
-        return data_dict
-
-
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
+            print(f"Failed to find request file for {self.retrieval_model}, {self.reranking_model}: {request_path}")
+
+
+def get_request_file_for_model(requests_path, retrieval_model_name, reranking_model_name):
+    """
+    Load the request status from a json file
+    """
     request_files = os.path.join(
         requests_path,
-        f"{model_name}_eval_request_*.json",
+        f"{retrieval_model_name}",
+        f"{reranking_model_name}",
+        "eval_request_*.json",
     )
     request_files = glob.glob(request_files)
 
-    # Select correct request file (precision)
     request_file = ""
     request_files = sorted(request_files, reverse=True)
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["status"] in ["FINISHED"]:
                 request_file = tmp_request_file
+                break
     return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
+def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEvalResult]:
+    """
+    Load the evaluation results from a json file
+    """
     model_result_filepaths = []
-
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
+    for root, dirs, files in os.walk(results_path):
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
-
-        # Sort the files by date
         try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_demo_")[:-7], reverse=True)
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
 
+        # select the latest and finished results
        for file in files:
             model_result_filepaths.append(os.path.join(root, file))
 
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        # create evaluation results
+        # TODO: fix the bug here, the running results should not be loaded
+        eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
+        # get the latest result that is finished
         eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
+        latest_date_str = eval_result.date.replace(":", "-")
+        model_result_date_str = model_result_filepath.split('/')[-1].removeprefix("results_demo_").removesuffix(".json")
+        if latest_date_str != model_result_date_str:
+            continue
         eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
+        eval_results[eval_name] = eval_result
 
     results = []
-    for v in eval_results.values():
+    for k, v in eval_results.items():
         try:
-            v.to_dict()  # we test if the dict version is complete
+            v.to_dict()
             results.append(v)
-        except KeyError:  # not all eval values present
+        except KeyError:
+            print(f"loading failed: {k}")
             continue
-
     return results
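For orientation (not part of the commit): a sketch of how these loaders chain together to produce leaderboard rows; the folder names below are hypothetical stand-ins for the Space's results and queue directories.

    from src.leaderboard.read_evals import get_raw_eval_results

    # hypothetical local folders mirroring the eval-results / eval-queue layout
    raw_results = get_raw_eval_results("eval-results", "eval-queue")

    rows = []
    for full_result in raw_results:
        # one dict per (retrieval model, reranking model) pair for the chosen task and metric
        rows.extend(full_result.to_dict(task="qa", metric="ndcg_at_10"))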
src/populate.py CHANGED
@@ -24,7 +24,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
+    """Creates the different dataframes for the evaluation queues requests"""
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
 
tests/src/display/test_utils.py ADDED
@@ -0,0 +1,15 @@
+import pytest
+from src.display.utils import fields, AutoEvalColumn, COLS, COLS_LITE, TYPES, EVAL_COLS, BENCHMARK_COLS
+
+
+def test_fields():
+    for c in fields(AutoEvalColumn):
+        print(c.name)
+
+
+def test_macro_variables():
+    print(f'COLS: {COLS}')
+    print(f'COLS_LITE: {COLS_LITE}')
+    print(f'TYPES: {TYPES}')
+    print(f'EVAL_COLS: {EVAL_COLS}')
+    print(f'BENCHMARK_COLS: {BENCHMARK_COLS}')
tests/src/leaderboard/test_read_evals.py ADDED
@@ -0,0 +1,39 @@
+from pathlib import Path
+
+from src.leaderboard.read_evals import FullEvalResult, get_raw_eval_results, get_request_file_for_model
+
+cur_fp = Path(__file__)
+
+
+def test_init_from_json_file():
+    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
+    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
+    assert len(full_eval_result.results) == 6
+
+
+def test_to_dict():
+    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
+    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
+    result_dict = full_eval_result.to_dict(task='qa', metric='ndcg_at_1')
+    assert len(result_dict) == 2
+
+
+def test_get_request_file_for_model():
+    requests_path = cur_fp.parents[2] / "toydata" / "test_requests"
+    request_file = get_request_file_for_model(requests_path, "bge-m3", "bge-reranker-v2-m3")
+    # only load the latest finished results
+    assert Path(request_file).name.removeprefix("eval_request_").removesuffix(".json") == "2023-11-21T18-10-08"
+
+
+def test_get_raw_eval_results():
+    requests_path = cur_fp.parents[2] / "toydata" / "test_requests"
+    results_path = cur_fp.parents[2] / "toydata" / "test_results" / "bge-m3"
+    results = get_raw_eval_results(results_path, requests_path)
+    # only load the latest results
+    assert len(results) == 2
+    assert results[0].date == "2023-12-21T18:10:08"
+    assert results[0].eval_name == "bge-m3_NoReranker"
+    assert len(results[0].results) == 3
+    assert results[1].eval_name == "bge-m3_bge-reranker-v2-m3"
+    assert results[1].date == "2023-11-21T18:10:08"
+    assert len(results[1].results) == 6
tests/src/test_populate.py ADDED
@@ -0,0 +1,13 @@
+from src.populate import get_leaderboard_df
+from pathlib import Path
+from src.display.utils import fields, AutoEvalColumn
+
+cur_fp = Path(__file__)
+
+
+def test_get_leaderboard_df():
+    requests_path = cur_fp.parents[2] / "toydata" / "test_requests"
+    results_path = cur_fp.parents[2] / "toydata" / "test_results"
+    cols = []
+    benchmark_cols = []
+    COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+    get_leaderboard_df(results_path, requests_path, cols, benchmark_cols)
tests/toydata/test_data.json ADDED
@@ -0,0 +1,98 @@
+[
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "bge-reranker-v2-m3",
+      "task": "long_doc",
+      "metric": "ndcg_at_1"
+    },
+    "results": [
+      {
+        "domain": "law",
+        "lang": "en",
+        "dataset": "lex_files_500K-600K",
+        "value": 0.75723
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "bge-reranker-v2-m3",
+      "task": "long_doc",
+      "metric": "ndcg_at_3"
+    },
+    "results": [
+      {
+        "domain": "law",
+        "lang": "en",
+        "dataset": "lex_files_500K-600K",
+        "value": 0.69909
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "bge-reranker-v2-m3",
+      "task": "qa",
+      "metric": "ndcg_at_1"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "en",
+        "dataset": "unknown",
+        "value": 0.69083
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "bge-reranker-v2-m3",
+      "task": "qa",
+      "metric": "ndcg_at_3"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "en",
+        "dataset": "unknown",
+        "value": 0.73359
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "bge-reranker-v2-m3",
+      "task": "qa",
+      "metric": "ndcg_at_1"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "zh",
+        "dataset": "unknown",
+        "value": 0.78358
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "bge-reranker-v2-m3",
+      "task": "qa",
+      "metric": "ndcg_at_3"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "zh",
+        "dataset": "unknown",
+        "value": 0.78358
+      }
+    ]
+  }
+]
tests/toydata/test_requests/bge-m3/NoReranker/eval_request_2023-11-21T18-10-08.json ADDED
@@ -0,0 +1,6 @@
+{
+  "retrieval_model": "BAAI/bge-m3",
+  "reranking_model": "NoReranker",
+  "status": "FINISHED",
+  "submitted_time": "2023-11-21T18:10:08"
+}
tests/toydata/test_requests/bge-m3/NoReranker/eval_request_2023-12-21T18-10-08.json ADDED
@@ -0,0 +1,6 @@
+{
+  "retrieval_model": "BAAI/bge-m3",
+  "reranking_model": "NoReranker",
+  "status": "FINISHED",
+  "submitted_time": "2023-12-21T18:10:08"
+}
tests/toydata/test_requests/bge-m3/bge-reranker-v2-m3/eval_request_2023-11-21T18-10-08.json ADDED
@@ -0,0 +1,6 @@
+{
+  "retrieval_model": "BAAI/bge-m3",
+  "reranking_model": "BAAI/bge-reranker-v2-m3",
+  "status": "FINISHED",
+  "submitted_time": "2023-11-21T18:10:08"
+}
tests/toydata/test_requests/bge-m3/bge-reranker-v2-m3/eval_request_2023-12-21T18-10-08.json ADDED
@@ -0,0 +1,6 @@
+{
+  "retrieval_model": "BAAI/bge-m3",
+  "reranking_model": "BAAI/bge-reranker-v2-m3",
+  "status": "RUNNING",
+  "submitted_time": "2023-12-21T18:10:08"
+}
tests/toydata/test_results/bge-m3/NoReranker/results_demo_2023-11-21T18-10-08.json ADDED
@@ -0,0 +1,98 @@
+[
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "NoReranker",
+      "task": "long_doc",
+      "metric": "ndcg_at_1"
+    },
+    "results": [
+      {
+        "domain": "law",
+        "lang": "en",
+        "dataset": "lex_files_500K-600K",
+        "value": 0.75723
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "NoReranker",
+      "task": "long_doc",
+      "metric": "ndcg_at_3"
+    },
+    "results": [
+      {
+        "domain": "law",
+        "lang": "en",
+        "dataset": "lex_files_500K-600K",
+        "value": 0.69909
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "NoReranker",
+      "task": "qa",
+      "metric": "ndcg_at_1"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "en",
+        "dataset": "unknown",
+        "value": 0.69083
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "NoReranker",
+      "task": "qa",
+      "metric": "ndcg_at_3"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "en",
+        "dataset": "unknown",
+        "value": 0.73359
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "NoReranker",
+      "task": "qa",
+      "metric": "ndcg_at_1"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "zh",
+        "dataset": "unknown",
+        "value": 0.78358
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "NoReranker",
+      "task": "qa",
+      "metric": "ndcg_at_3"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "zh",
+        "dataset": "unknown",
+        "value": 0.78358
+      }
+    ]
+  }
+]
tests/toydata/test_results/bge-m3/NoReranker/results_demo_2023-12-21T18-10-08.json ADDED
@@ -0,0 +1,50 @@
+[
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "NoReranker",
+      "task": "long_doc",
+      "metric": "ndcg_at_1"
+    },
+    "results": [
+      {
+        "domain": "law",
+        "lang": "en",
+        "dataset": "lex_files_500K-600K",
+        "value": 0.75723
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "NoReranker",
+      "task": "qa",
+      "metric": "ndcg_at_1"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "en",
+        "dataset": "unknown",
+        "value": 0.69083
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "NoReranker",
+      "task": "qa",
+      "metric": "ndcg_at_1"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "zh",
+        "dataset": "unknown",
+        "value": 0.78358
+      }
+    ]
+  }
+]
tests/toydata/test_results/bge-m3/bge-reranker-v2-m3/results_demo_2023-11-21T18-10-08.json ADDED
@@ -0,0 +1,98 @@
+[
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "bge-reranker-v2-m3",
+      "task": "long_doc",
+      "metric": "ndcg_at_1"
+    },
+    "results": [
+      {
+        "domain": "law",
+        "lang": "en",
+        "dataset": "lex_files_500K-600K",
+        "value": 0.75723
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "bge-reranker-v2-m3",
+      "task": "long_doc",
+      "metric": "ndcg_at_3"
+    },
+    "results": [
+      {
+        "domain": "law",
+        "lang": "en",
+        "dataset": "lex_files_500K-600K",
+        "value": 0.69909
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "bge-reranker-v2-m3",
+      "task": "qa",
+      "metric": "ndcg_at_1"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "en",
+        "dataset": "unknown",
+        "value": 0.69083
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "bge-reranker-v2-m3",
+      "task": "qa",
+      "metric": "ndcg_at_3"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "en",
+        "dataset": "unknown",
+        "value": 0.73359
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "bge-reranker-v2-m3",
+      "task": "qa",
+      "metric": "ndcg_at_1"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "zh",
+        "dataset": "unknown",
+        "value": 0.78358
+      }
+    ]
+  },
+  {
+    "config": {
+      "retrieval_model": "bge-m3",
+      "reranking_model": "bge-reranker-v2-m3",
+      "task": "qa",
+      "metric": "ndcg_at_3"
+    },
+    "results": [
+      {
+        "domain": "wiki",
+        "lang": "zh",
+        "dataset": "unknown",
+        "value": 0.78358
+      }
+    ]
+  }
+]