manu antonioloison committed on
Commit 19d93fe · verified · 1 Parent(s): 7545429

Use MTEB results in new leaderboard (#8)


- feat: deprecate old leaderboards (d6696d6acb32dff8dacc826328c0952a01353cfb)
- feat: add new leaderboard based on mteb results (a212b4a0cd1cd9c7b27383aa5f0366421be7645c)
- feat: add model size column (b5279c6709dde10405810e48634d5b1601c74cef)
- docs: update documentation on how to add your model (1c161e2337749f98d0876f4b4149c018fcf129a9)


Co-authored-by: Antonio Loison <[email protected]>

app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
 
3
  from app.utils import add_rank_and_format, filter_models, get_refresh_function
 
4
  from data.model_handler import ModelHandler
5
 
6
  METRICS = [
@@ -16,23 +17,43 @@ METRICS = [
16
 
17
 
18
  def main():
 
19
  model_handler = ModelHandler()
20
  initial_metric = "ndcg_at_5"
21
 
22
  model_handler.get_vidore_data(initial_metric)
23
- data_benchmark_1 = model_handler.compute_averages(initial_metric, benchmark_version=1)
24
  data_benchmark_1 = add_rank_and_format(data_benchmark_1, benchmark_version=1)
25
 
26
- data_benchmark_2 = model_handler.compute_averages(initial_metric, benchmark_version=2)
27
  data_benchmark_2 = add_rank_and_format(data_benchmark_2, benchmark_version=2)
28
 
29
- NUM_DATASETS_1 = len(data_benchmark_1.columns) - 3
30
- NUM_SCORES_1 = len(data_benchmark_1) * NUM_DATASETS_1
31
- NUM_MODELS_1 = len(data_benchmark_1)
32
 
33
- NUM_DATASETS_2 = len(data_benchmark_2.columns) - 3
34
- NUM_SCORES_2 = len(data_benchmark_2) * NUM_DATASETS_2
35
- NUM_MODELS_2 = len(data_benchmark_2)
 
36
 
37
  css = """
38
  table > thead {
@@ -59,7 +80,7 @@ def main():
59
 
60
  with gr.Blocks(css=css) as block:
61
  with gr.Tabs():
62
- with gr.TabItem("🏆 ViDoRe V1"):
63
  gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
64
  gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
65
 
@@ -70,7 +91,7 @@ def main():
70
  Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
71
  """
72
  )
73
- datasets_columns_1 = list(data_benchmark_1.columns[3:])
74
 
75
  with gr.Row():
76
  metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
@@ -83,17 +104,16 @@ def main():
83
  )
84
 
85
  with gr.Row():
86
- datatype_1 = ["number", "markdown"] + ["number"] * (NUM_DATASETS_1 + 1)
87
  dataframe_1 = gr.Dataframe(data_benchmark_1, datatype=datatype_1, type="pandas")
88
 
89
  def update_data_1(metric, search_term, selected_columns):
90
  model_handler.get_vidore_data(metric)
91
- data = model_handler.compute_averages(metric, benchmark_version=1)
92
  data = add_rank_and_format(data, benchmark_version=1)
93
  data = filter_models(data, search_term)
94
- # data = remove_duplicates(data) # Add this line
95
  if selected_columns:
96
- data = data[["Rank", "Model", "Average"] + selected_columns]
97
  return data
98
 
99
  with gr.Row():
@@ -124,9 +144,9 @@ def main():
124
 
125
  gr.Markdown(
126
  f"""
127
- - **Total Datasets**: {NUM_DATASETS_1}
128
- - **Total Scores**: {NUM_SCORES_1}
129
- - **Total Models**: {NUM_MODELS_1}
130
  """
131
  + r"""
132
  Please consider citing:
@@ -144,7 +164,7 @@ def main():
144
  ```
145
  """
146
  )
147
- with gr.TabItem("🏆 ViDoRe V2"):
148
  gr.Markdown("# ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
149
  gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
150
 
@@ -155,7 +175,7 @@ def main():
155
  Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
156
  """
157
  )
158
- datasets_columns_2 = list(data_benchmark_2.columns[3:])
159
 
160
  with gr.Row():
161
  metric_dropdown_2 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
@@ -168,17 +188,17 @@ def main():
168
  )
169
 
170
  with gr.Row():
171
- datatype_2 = ["number", "markdown"] + ["number"] * (NUM_DATASETS_2 + 1)
172
  dataframe_2 = gr.Dataframe(data_benchmark_2, datatype=datatype_2, type="pandas")
173
 
174
  def update_data_2(metric, search_term, selected_columns):
175
  model_handler.get_vidore_data(metric)
176
- data = model_handler.compute_averages(metric, benchmark_version=2)
177
  data = add_rank_and_format(data, benchmark_version=2)
178
  data = filter_models(data, search_term)
179
  # data = remove_duplicates(data) # Add this line
180
  if selected_columns:
181
- data = data[["Rank", "Model", "Average"] + selected_columns]
182
  return data
183
 
184
  with gr.Row():
@@ -217,9 +237,9 @@ def main():
217
 
218
  gr.Markdown(
219
  f"""
220
- - **Total Datasets**: {NUM_DATASETS_2}
221
- - **Total Scores**: {NUM_SCORES_2}
222
- - **Total Models**: {NUM_MODELS_2}
223
  """
224
  + r"""
225
  Please consider citing:
@@ -247,7 +267,6 @@ def main():
247
  ```
248
  """
249
  )
250
-
251
  with gr.TabItem("📚 Submit your model"):
252
  gr.Markdown("# How to Submit a New Model to the Leaderboard")
253
  gr.Markdown(
@@ -255,32 +274,10 @@ def main():
255
  To submit a new model to the ViDoRe leaderboard, follow these steps:
256
 
257
  1. **Evaluate your model**:
258
- - Follow the evaluation script provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/)
259
 
260
  2. **Format your submission file**:
261
- - The submission file should automatically be generated, and named `results.json` with the
262
- following structure:
263
- ```json
264
- {
265
- "dataset_name_1": {
266
- "metric_1": score_1,
267
- "metric_2": score_2,
268
- ...
269
- },
270
- "dataset_name_2": {
271
- "metric_1": score_1,
272
- "metric_2": score_2,
273
- ...
274
- },
275
- }
276
- ```
277
- - The dataset names should be the same as the ViDoRe and ViDoRe 2 dataset names listed in the following
278
- collections: [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d) and [ViDoRe Benchmark 2](vidore/vidore-benchmark-v2-dev-67ae03e3924e85b36e7f53b0).
279
-
280
- 3. **Submit your model**:
281
- - Create a public HuggingFace model repository with your model.
282
- - Add the tag `vidore` to your model in the metadata of the model card and place the
283
- `results.json` file at the root.
284
 
285
  And you're done! Your model will appear on the leaderboard when you click refresh! Once the space
286
  gets rebooted, it will appear on startup.
@@ -289,6 +286,210 @@ def main():
289
  kebab-case, e.g. `my-model-name`.
290
  """
291
  )
292
 
293
  block.queue(max_size=10).launch(debug=True)
294
 
 
1
  import gradio as gr
2
 
3
  from app.utils import add_rank_and_format, filter_models, get_refresh_function
4
+ from data.deprecated_model_handler import DeprecatedModelHandler
5
  from data.model_handler import ModelHandler
6
 
7
  METRICS = [
 
17
 
18
 
19
  def main():
20
+ # Get new results
21
  model_handler = ModelHandler()
22
  initial_metric = "ndcg_at_5"
23
 
24
  model_handler.get_vidore_data(initial_metric)
25
+ data_benchmark_1 = model_handler.render_df(initial_metric, benchmark_version=1)
26
  data_benchmark_1 = add_rank_and_format(data_benchmark_1, benchmark_version=1)
27
 
28
+ data_benchmark_2 = model_handler.render_df(initial_metric, benchmark_version=2)
29
  data_benchmark_2 = add_rank_and_format(data_benchmark_2, benchmark_version=2)
30
 
31
+ num_datasets_1 = len(data_benchmark_1.columns) - 3
32
+ num_scores_1 = len(data_benchmark_1) * num_datasets_1
33
+ num_models_1 = len(data_benchmark_1)
34
 
35
+ num_datasets_2 = len(data_benchmark_2.columns) - 3
36
+ num_scores_2 = len(data_benchmark_2) * num_datasets_2
37
+ num_models_2 = len(data_benchmark_2)
38
+
39
+ # Get deprecated results
40
+ deprecated_model_handler = DeprecatedModelHandler()
41
+ initial_metric = "ndcg_at_5"
42
+
43
+ deprecated_model_handler.get_vidore_data(initial_metric)
44
+ deprecated_data_benchmark_1 = deprecated_model_handler.render_df(initial_metric, benchmark_version=1)
45
+ deprecated_data_benchmark_1 = add_rank_and_format(deprecated_data_benchmark_1, benchmark_version=1)
46
+
47
+ deprecated_data_benchmark_2 = deprecated_model_handler.render_df(initial_metric, benchmark_version=2)
48
+ deprecated_data_benchmark_2 = add_rank_and_format(deprecated_data_benchmark_2, benchmark_version=2)
49
+
50
+ deprecated_num_datasets_1 = len(deprecated_data_benchmark_1.columns) - 3
51
+ deprecated_num_scores_1 = len(deprecated_data_benchmark_1) * deprecated_num_datasets_1
52
+ deprecated_num_models_1 = len(deprecated_data_benchmark_1)
53
+
54
+ deprecated_num_datasets_2 = len(deprecated_data_benchmark_2.columns) - 3
55
+ deprecated_num_scores_2 = len(deprecated_data_benchmark_2) * deprecated_num_datasets_2
56
+ deprecated_num_models_2 = len(deprecated_data_benchmark_2)
57
 
58
  css = """
59
  table > thead {
 
80
 
81
  with gr.Blocks(css=css) as block:
82
  with gr.Tabs():
83
+ with gr.TabItem("ViDoRe V1"):
84
  gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
85
  gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
86
 
 
91
  Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
92
  """
93
  )
94
+ datasets_columns_1 = list(data_benchmark_1.columns[4:])
95
 
96
  with gr.Row():
97
  metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
 
104
  )
105
 
106
  with gr.Row():
107
+ datatype_1 = ["number", "markdown"] + ["number"] * (num_datasets_1 + 1)
108
  dataframe_1 = gr.Dataframe(data_benchmark_1, datatype=datatype_1, type="pandas")
109
 
110
  def update_data_1(metric, search_term, selected_columns):
111
  model_handler.get_vidore_data(metric)
112
+ data = deprecated_model_handler.render_df(metric, benchmark_version=1)
113
  data = add_rank_and_format(data, benchmark_version=1)
114
  data = filter_models(data, search_term)
 
115
  if selected_columns:
116
+ data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
117
  return data
118
 
119
  with gr.Row():
 
144
 
145
  gr.Markdown(
146
  f"""
147
+ - **Total Datasets**: {num_datasets_1}
148
+ - **Total Scores**: {num_scores_1}
149
+ - **Total Models**: {num_models_1}
150
  """
151
  + r"""
152
  Please consider citing:
 
164
  ```
165
  """
166
  )
167
+ with gr.TabItem("ViDoRe V2"):
168
  gr.Markdown("# ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
169
  gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
170
 
 
175
  Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
176
  """
177
  )
178
+ datasets_columns_2 = list(data_benchmark_2.columns[4:])
179
 
180
  with gr.Row():
181
  metric_dropdown_2 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
 
188
  )
189
 
190
  with gr.Row():
191
+ datatype_2 = ["number", "markdown"] + ["number"] * (num_datasets_2 + 1)
192
  dataframe_2 = gr.Dataframe(data_benchmark_2, datatype=datatype_2, type="pandas")
193
 
194
  def update_data_2(metric, search_term, selected_columns):
195
  model_handler.get_vidore_data(metric)
196
+ data = deprecated_model_handler.render_df(metric, benchmark_version=2)
197
  data = add_rank_and_format(data, benchmark_version=2)
198
  data = filter_models(data, search_term)
199
  # data = remove_duplicates(data) # Add this line
200
  if selected_columns:
201
+ data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
202
  return data
203
 
204
  with gr.Row():
 
237
 
238
  gr.Markdown(
239
  f"""
240
+ - **Total Datasets**: {num_datasets_2}
241
+ - **Total Scores**: {num_scores_2}
242
+ - **Total Models**: {num_models_2}
243
  """
244
  + r"""
245
  Please consider citing:
 
267
  ```
268
  """
269
  )
 
270
  with gr.TabItem("📚 Submit your model"):
271
  gr.Markdown("# How to Submit a New Model to the Leaderboard")
272
  gr.Markdown(
 
274
  To submit a new model to the ViDoRe leaderboard, follow these steps:
275
 
276
  1. **Evaluate your model**:
277
+ - Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) that uses MTEB.
278
 
279
  2. **Format your submission file**:
280
+ - Add the generated files to [MTEB results](https://github.com/embeddings-benchmark/results) project. Check the [Colpali results](https://github.com/embeddings-benchmark/results/tree/main/results/vidore__colpali-v1.3/1b5c8929330df1a66de441a9b5409a878f0de5b0) for an example.
281
 
282
  And you're done! Your model will appear on the leaderboard when you click refresh! Once the space
283
  gets rebooted, it will appear on startup.
 
286
  kebab-case, e.g. `my-model-name`.
287
  """
288
  )
289
+ with gr.TabItem("[Deprecated] ViDoRe V1"):
290
+ gr.Markdown(
291
+ "## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
292
+ "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
293
+ "which is no longer maintained. Results should be computed using the "
294
+ "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
295
+ "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
296
+ )
297
+ gr.Markdown("## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>")
298
+ gr.Markdown("# <span style='color:red'>[Deprecated]</span> ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
299
+ gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
300
+
301
+ gr.Markdown(
302
+ """
303
+ Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab.
304
+
305
+ Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
306
+ """
307
+ )
308
+ datasets_columns_1 = list(deprecated_data_benchmark_1.columns[3:])
309
+
310
+ with gr.Row():
311
+ metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
312
+ research_textbox_1 = gr.Textbox(
313
+ placeholder="🔍 Search Models... [press enter]",
314
+ label="Filter Models by Name",
315
+ )
316
+ column_checkboxes_1 = gr.CheckboxGroup(
317
+ choices=datasets_columns_1, value=datasets_columns_1, label="Select Columns to Display"
318
+ )
319
+
320
+ with gr.Row():
321
+ datatype_1 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_1 + 1)
322
+ dataframe_1 = gr.Dataframe(deprecated_data_benchmark_1, datatype=datatype_1, type="pandas")
323
+
324
+ def update_data_1(metric, search_term, selected_columns):
325
+ deprecated_model_handler.get_vidore_data(metric)
326
+ data = deprecated_model_handler.render_df(metric, benchmark_version=1)
327
+ data = add_rank_and_format(data, benchmark_version=1)
328
+ data = filter_models(data, search_term)
329
+ # data = remove_duplicates(data) # Add this line
330
+ if selected_columns:
331
+ data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
332
+ return data
333
+
334
+ with gr.Row():
335
+ refresh_button_1 = gr.Button("Refresh")
336
+ refresh_button_1.click(
337
+ get_refresh_function(deprecated_model_handler, benchmark_version=1),
338
+ inputs=[metric_dropdown_1],
339
+ outputs=dataframe_1,
340
+ concurrency_limit=20,
341
+ )
342
+
343
+ # Automatically refresh the dataframe when the dropdown value changes
344
+ metric_dropdown_1.change(
345
+ get_refresh_function(deprecated_model_handler, benchmark_version=1),
346
+ inputs=[metric_dropdown_1],
347
+ outputs=dataframe_1,
348
+ )
349
+ research_textbox_1.submit(
350
+ lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
351
+ inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
352
+ outputs=dataframe_1,
353
+ )
354
+ column_checkboxes_1.change(
355
+ lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
356
+ inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
357
+ outputs=dataframe_1,
358
+ )
359
+
360
+ gr.Markdown(
361
+ f"""
362
+ - **Total Datasets**: {deprecated_num_datasets_1}
363
+ - **Total Scores**: {deprecated_num_scores_1}
364
+ - **Total Models**: {deprecated_num_models_1}
365
+ """
366
+ + r"""
367
+ Please consider citing:
368
+
369
+ ```bibtex
370
+ @misc{faysse2024colpaliefficientdocumentretrieval,
371
+ title={ColPali: Efficient Document Retrieval with Vision Language Models},
372
+ author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
373
+ year={2024},
374
+ eprint={2407.01449},
375
+ archivePrefix={arXiv},
376
+ primaryClass={cs.IR},
377
+ url={https://arxiv.org/abs/2407.01449},
378
+ }
379
+ ```
380
+ """
381
+ )
382
+ with gr.TabItem("[Deprecated] ViDoRe V2"):
383
+ gr.Markdown(
384
+ "## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
385
+ "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
386
+ "which is no longer maintained. Results should be computed using the "
387
+ "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
388
+ "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
389
+ )
390
+ gr.Markdown("## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>")
391
+ gr.Markdown("# <span style='color:red'>[Deprecated]</span> ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
392
+ gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
393
+
394
+ gr.Markdown(
395
+ """
396
+ Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab.
397
+
398
+ Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
399
+ """
400
+ )
401
+ datasets_columns_2 = list(deprecated_data_benchmark_2.columns[3:])
402
+
403
+ with gr.Row():
404
+ metric_dropdown_2 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
405
+ research_textbox_2 = gr.Textbox(
406
+ placeholder="🔍 Search Models... [press enter]",
407
+ label="Filter Models by Name",
408
+ )
409
+ column_checkboxes_2 = gr.CheckboxGroup(
410
+ choices=datasets_columns_2, value=datasets_columns_2, label="Select Columns to Display"
411
+ )
412
+
413
+ with gr.Row():
414
+ datatype_2 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_2 + 1)
415
+ dataframe_2 = gr.Dataframe(deprecated_data_benchmark_2, datatype=datatype_2, type="pandas")
416
+
417
+ def update_data_2(metric, search_term, selected_columns):
418
+ deprecated_model_handler.get_vidore_data(metric)
419
+ data = deprecated_model_handler.render_df(metric, benchmark_version=2)
420
+ data = add_rank_and_format(data, benchmark_version=2)
421
+ data = filter_models(data, search_term)
422
+ # data = remove_duplicates(data) # Add this line
423
+ if selected_columns:
424
+ data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
425
+ return data
426
+
427
+ with gr.Row():
428
+ refresh_button_2 = gr.Button("Refresh")
429
+ refresh_button_2.click(
430
+ get_refresh_function(deprecated_model_handler, benchmark_version=2),
431
+ inputs=[metric_dropdown_2],
432
+ outputs=dataframe_2,
433
+ concurrency_limit=20,
434
+ )
435
+
436
+ with gr.Row():
437
+ gr.Markdown(
438
+ """
439
+ **Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side.
440
+ Those numbers are not numbers obtained from the organisations that released those models.
441
+ """
442
+ )
443
+
444
+ # Automatically refresh the dataframe when the dropdown value changes
445
+ metric_dropdown_2.change(
446
+ get_refresh_function(deprecated_model_handler, benchmark_version=2),
447
+ inputs=[metric_dropdown_2],
448
+ outputs=dataframe_2,
449
+ )
450
+ research_textbox_2.submit(
451
+ lambda metric, search_term, selected_columns: update_data_2(metric, search_term, selected_columns),
452
+ inputs=[metric_dropdown_2, research_textbox_2, column_checkboxes_2],
453
+ outputs=dataframe_2,
454
+ )
455
+ column_checkboxes_2.change(
456
+ lambda metric, search_term, selected_columns: update_data_2(metric, search_term, selected_columns),
457
+ inputs=[metric_dropdown_2, research_textbox_2, column_checkboxes_2],
458
+ outputs=dataframe_2,
459
+ )
460
+
461
+ gr.Markdown(
462
+ f"""
463
+ - **Total Datasets**: {deprecated_num_datasets_2}
464
+ - **Total Scores**: {deprecated_num_scores_2}
465
+ - **Total Models**: {deprecated_num_models_2}
466
+ """
467
+ + r"""
468
+ Please consider citing:
469
+
470
+ ```bibtex
471
+ @misc{faysse2024colpaliefficientdocumentretrieval,
472
+ title={ColPali: Efficient Document Retrieval with Vision Language Models},
473
+ author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
474
+ year={2024},
475
+ eprint={2407.01449},
476
+ archivePrefix={arXiv},
477
+ primaryClass={cs.IR},
478
+ url={https://arxiv.org/abs/2407.01449},
479
+ }
480
+
481
+ @misc{macé2025vidorebenchmarkv2raising,
482
+ title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
483
+ author={Quentin Macé and António Loison and Manuel Faysse},
484
+ year={2025},
485
+ eprint={2505.17166},
486
+ archivePrefix={arXiv},
487
+ primaryClass={cs.IR},
488
+ url={https://arxiv.org/abs/2505.17166},
489
+ }
490
+ ```
491
+ """
492
+ )
493
 
494
  block.queue(max_size=10).launch(debug=True)
495
 
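The updated "Submit your model" instructions above replace the old `results.json` upload with an MTEB-based workflow. As a rough, non-authoritative sketch (not part of this commit), evaluating a model on the new ViDoRe tasks could look like the following, assuming the standard `mteb` Python API and using the task names added in `data/dataset_handler.py` below; the model id and its registration in `mteb` are assumptions, and the linked vidore-benchmark README remains the authoritative procedure:

```python
# Hedged sketch: task names come from data/dataset_handler.py in this commit;
# the mteb.get_model()/MTEB.run() usage and the model id are assumptions, not part of the diff.
import mteb

tasks = mteb.get_tasks(tasks=["VidoreArxivQARetrieval", "VidoreDocVQARetrieval"])
model = mteb.get_model("vidore/colpali-v1.3")  # assumes the retriever is registered in mteb
evaluation = mteb.MTEB(tasks=tasks)
evaluation.run(model, output_folder="results")  # one <TaskName>.json is written per task
```

The per-task JSON files produced this way are what step 2 asks you to contribute to the embeddings-benchmark/results repository.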
app/utils.py CHANGED
@@ -1,9 +1,10 @@
1
- from data.model_handler import ModelHandler
2
 
3
 
4
  def make_clickable_model(model_name, link=None):
5
  if link is None:
6
- desanitized_model_name = model_name.replace("_", "/")
 
7
  desanitized_model_name = desanitized_model_name.replace("-thisisapoint-", ".")
8
 
9
  if "/captioning" in desanitized_model_name:
@@ -16,10 +17,38 @@ def make_clickable_model(model_name, link=None):
16
  return f'<a target="_blank" style="text-decoration: underline" href="{link}">{desanitized_model_name}</a>'
17
 
18
 
19
  def add_rank_and_format(df, benchmark_version=1):
20
  df = df.reset_index()
21
  df = df.rename(columns={"index": "Model"})
22
- df = ModelHandler.add_rank(df, benchmark_version)
23
  df["Model"] = df["Model"].apply(make_clickable_model)
24
  # df = remove_duplicates(df)
25
  return df
@@ -36,7 +65,7 @@ def remove_duplicates(df):
36
  def get_refresh_function(model_handler, benchmark_version):
37
  def _refresh(metric):
38
  model_handler.get_vidore_data(metric)
39
- data_task_category = model_handler.compute_averages(metric, benchmark_version)
40
  df = add_rank_and_format(data_task_category, benchmark_version)
41
  return df
42
 
 
1
+ from data.deprecated_model_handler import DeprecatedModelHandler
2
 
3
 
4
  def make_clickable_model(model_name, link=None):
5
  if link is None:
6
+ desanitized_model_name = model_name.replace("__", "/")
7
+ desanitized_model_name = desanitized_model_name.replace("_", "/")
8
  desanitized_model_name = desanitized_model_name.replace("-thisisapoint-", ".")
9
 
10
  if "/captioning" in desanitized_model_name:
 
17
  return f'<a target="_blank" style="text-decoration: underline" href="{link}">{desanitized_model_name}</a>'
18
 
19
 
20
+ def add_rank(df, benchmark_version=1):
21
+ df.fillna(0.0, inplace=True)
22
+ cols_to_rank = [
23
+ col
24
+ for col in df.columns
25
+ if col
26
+ not in [
27
+ "Model",
28
+ "Model Size (Million Parameters)",
29
+ "Memory Usage (GB, fp32)",
30
+ "Embedding Dimensions",
31
+ "Max Tokens",
32
+ ]
33
+ ]
34
+
35
+ if len(cols_to_rank) == 1:
36
+ df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
37
+ else:
38
+ df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
39
+ df.sort_values("Average", ascending=False, inplace=True)
40
+ df.insert(0, "Rank", list(range(1, len(df) + 1)))
41
+ # multiply values by 100 if they are floats and round to 1 decimal place
42
+ for col in df.columns:
43
+ if df[col].dtype == "float64" and col != "Model Size (Million Parameters)":
44
+ df[col] = df[col].apply(lambda x: round(x * 100, 1))
45
+ return df
46
+
47
+
48
  def add_rank_and_format(df, benchmark_version=1):
49
  df = df.reset_index()
50
  df = df.rename(columns={"index": "Model"})
51
+ df = add_rank(df, benchmark_version)
52
  df["Model"] = df["Model"].apply(make_clickable_model)
53
  # df = remove_duplicates(df)
54
  return df
 
65
  def get_refresh_function(model_handler, benchmark_version):
66
  def _refresh(metric):
67
  model_handler.get_vidore_data(metric)
68
+ data_task_category = model_handler.render_df(metric, benchmark_version)
69
  df = add_rank_and_format(data_task_category, benchmark_version)
70
  return df
71
 
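The extra `"__"` replacement added to `make_clickable_model` above matches the folder naming visible in the MTEB results repository (see the `vidore__colpali-v1.3` example linked earlier), where `/` in a model id is encoded as a double underscore. A small illustration of the full desanitization chain, using an example folder name:

```python
# Illustrative: the same replace() chain as in make_clickable_model above.
folder_name = "vidore__colpali-v1-thisisapoint-3"
display_name = (
    folder_name.replace("__", "/")       # MTEB results folders encode "/" as "__"
    .replace("_", "/")                   # legacy single-underscore sanitization
    .replace("-thisisapoint-", ".")      # legacy placeholder for "."
)
print(display_name)  # vidore/colpali-v1.3
```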
data/dataset_handler.py CHANGED
@@ -1,4 +1,22 @@
1
- VIDORE_DATASETS_KEYWORDS = [
2
  "arxivqa",
3
  "docvqa",
4
  "infovqa",
@@ -11,15 +29,48 @@ VIDORE_DATASETS_KEYWORDS = [
11
  "healthcare_industry",
12
  ]
13
 
14
- VIDORE_2_DATASETS_KEYWORDS = [
15
  "restaurant_esg",
16
  "rse_restaurant",
17
  "mit_biomedical",
18
  "economics_macro",
19
  ]
20
 
21
-
22
  def get_datasets_nickname(dataset_name) -> str:
23
  if "arxivqa" in dataset_name:
24
  return "ArxivQA"
25
 
 
1
+ VIDORE_V1_MTEB_NAMES = [
2
+ "VidoreArxivQARetrieval",
3
+ "VidoreDocVQARetrieval",
4
+ "VidoreInfoVQARetrieval",
5
+ "VidoreShiftProjectRetrieval",
6
+ "VidoreSyntheticDocQAAIRetrieval",
7
+ "VidoreSyntheticDocQAEnergyRetrieval",
8
+ "VidoreSyntheticDocQAGovernmentReportsRetrieval",
9
+ "VidoreSyntheticDocQAHealthcareIndustryRetrieval",
10
+ "VidoreTabfquadRetrieval",
11
+ "VidoreTatdqaRetrieval",
12
+ ]
13
+ VIDORE_V2_MTEB_NAMES = [
14
+ "Vidore2BioMedicalLecturesRetrieval",
15
+ "Vidore2EconomicsReportsRetrieval",
16
+ "Vidore2ESGReportsHLRetrieval",
17
+ "Vidore2ESGReportsRetrieval",
18
+ ]
19
+ DEPRECATED_VIDORE_DATASETS_KEYWORDS = [
20
  "arxivqa",
21
  "docvqa",
22
  "infovqa",
 
29
  "healthcare_industry",
30
  ]
31
 
32
+ DEPRECATED_VIDORE_2_DATASETS_KEYWORDS = [
33
  "restaurant_esg",
34
  "rse_restaurant",
35
  "mit_biomedical",
36
  "economics_macro",
37
  ]
38
 
 
39
  def get_datasets_nickname(dataset_name) -> str:
40
+ if dataset_name == "VidoreArxivQARetrieval":
41
+ return "ArxivQA"
42
+ elif dataset_name == "VidoreDocVQARetrieval":
43
+ return "DocVQA"
44
+ elif dataset_name == "VidoreInfoVQARetrieval":
45
+ return "InfoVQA"
46
+ elif dataset_name == "VidoreTabfquadRetrieval":
47
+ return "TabFQuad"
48
+ elif dataset_name == "VidoreTatdqaRetrieval":
49
+ return "TAT-DQA"
50
+ elif dataset_name == "VidoreShiftProjectRetrieval":
51
+ return "Shift Project"
52
+ elif dataset_name == "VidoreSyntheticDocQAAIRetrieval":
53
+ return "Artificial Intelligence"
54
+ elif dataset_name == "VidoreSyntheticDocQAEnergyRetrieval":
55
+ return "Energy"
56
+ elif dataset_name == "VidoreSyntheticDocQAGovernmentReportsRetrieval":
57
+ return "Government Reports"
58
+ elif dataset_name == "VidoreSyntheticDocQAHealthcareIndustryRetrieval":
59
+ return "Healthcare Industry"
60
+
61
+ elif dataset_name == "Vidore2ESGReportsHLRetrieval":
62
+ return "ESG Restaurant Human English"
63
+ elif dataset_name == "Vidore2ESGReportsRetrieval":
64
+ return "ESG Restaurant Synthetic Multilingual"
65
+ elif dataset_name == "Vidore2BioMedicalLecturesRetrieval":
66
+ return "MIT Biomedical Multilingual"
67
+ elif dataset_name == "Vidore2EconomicsReportsRetrieval":
68
+ return "Economics Macro Multilingual"
69
+
70
+ else:
71
+ raise ValueError(f"Dataset {dataset_name} not found in ViDoRe")
72
+
73
+ def deprecated_get_datasets_nickname(dataset_name) -> str:
74
  if "arxivqa" in dataset_name:
75
  return "ArxivQA"
76
 
data/deprecated_model_handler.py ADDED
@@ -0,0 +1,124 @@
1
+ import json
2
+ import os
3
+ from typing import Any, Dict
4
+
5
+ import pandas as pd
6
+ from huggingface_hub import HfApi, hf_hub_download, metadata_load
7
+
8
+ from .dataset_handler import DEPRECATED_VIDORE_2_DATASETS_KEYWORDS, DEPRECATED_VIDORE_DATASETS_KEYWORDS, deprecated_get_datasets_nickname
9
+
10
+ BLOCKLIST = ["impactframes"]
11
+
12
+
13
+ class DeprecatedModelHandler:
14
+ def __init__(self, model_infos_path="model_infos.json"):
15
+ self.api = HfApi()
16
+ self.model_infos_path = model_infos_path
17
+ self.model_infos = self._load_model_infos()
18
+
19
+ def _load_model_infos(self) -> Dict:
20
+ if os.path.exists(self.model_infos_path):
21
+ with open(self.model_infos_path) as f:
22
+ return json.load(f)
23
+ return {}
24
+
25
+ def _save_model_infos(self):
26
+ with open(self.model_infos_path, "w") as f:
27
+ json.dump(self.model_infos, f)
28
+
29
+ def _are_results_in_new_vidore_format(self, results: Dict[str, Any]) -> bool:
30
+ return "metadata" in results and "metrics" in results
31
+
32
+ def _is_baseline_repo(self, repo_id: str) -> bool:
33
+ return repo_id == "vidore/baseline-results"
34
+
35
+ def sanitize_model_name(self, model_name):
36
+ return model_name.replace("/", "_").replace(".", "-thisisapoint-")
37
+
38
+ def fuze_model_infos(self, model_name, results):
39
+ for dataset, metrics in results.items():
40
+ if dataset not in self.model_infos[model_name]["results"].keys():
41
+ self.model_infos[model_name]["results"][dataset] = metrics
42
+ else:
43
+ continue
44
+
45
+ def get_vidore_data(self, metric="ndcg_at_5"):
46
+ models = self.api.list_models(filter="vidore")
47
+ repositories = [model.modelId for model in models] # type: ignore
48
+
49
+ # Sort repositories to process non-baseline repos first (to prioritize their results)
50
+ repositories.sort(key=lambda x: self._is_baseline_repo(x))
51
+
52
+ for repo_id in repositories:
53
+ org_name = repo_id.split("/")[0]
54
+ if org_name in BLOCKLIST:
55
+ continue
56
+ files = [f for f in self.api.list_repo_files(repo_id) if f.endswith("_metrics.json") or f == "results.json"]
57
+
58
+ if len(files) == 0:
59
+ continue
60
+ else:
61
+ for file in files:
62
+ if file.endswith("results.json"):
63
+ model_name = repo_id.replace("/", "_").replace(".", "-thisisapoint-")
64
+ else:
65
+ model_name = file.split("_metrics.json")[0]
66
+ model_name = model_name.replace("/", "_").replace(".", "-thisisapoint-")
67
+
68
+ # Skip if the model is from baseline and we already have results
69
+
70
+ readme_path = hf_hub_download(repo_id, filename="README.md")
71
+ meta = metadata_load(readme_path)
72
+ try:
73
+ result_path = hf_hub_download(repo_id, filename=file)
74
+
75
+ with open(result_path) as f:
76
+ results = json.load(f)
77
+
78
+ if self._are_results_in_new_vidore_format(results):
79
+ metadata = results["metadata"]
80
+ results = results["metrics"]
81
+
82
+ # Handles the case where the model is both in baseline and outside of it
83
+ # (prioritizes the non-baseline results)
84
+ if self._is_baseline_repo(repo_id) and self.sanitize_model_name(model_name) in self.model_infos:
85
+ self.fuze_model_infos(model_name, results)
86
+
87
+ self.model_infos[model_name] = {"meta": meta, "results": results}
88
+ except Exception as e:
89
+ print(f"Error loading {model_name} - {e}")
90
+ continue
91
+
92
+ # In order to keep only models relevant to a benchmark
93
+ def filter_models_by_benchmark(self, benchmark_version=1):
94
+ filtered_model_infos = {}
95
+ keywords = DEPRECATED_VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else DEPRECATED_VIDORE_2_DATASETS_KEYWORDS
96
+
97
+ for model, info in self.model_infos.items():
98
+ results = info["results"]
99
+ if any(any(keyword in dataset for keyword in keywords) for dataset in results.keys()):
100
+ filtered_model_infos[model] = info
101
+
102
+ return filtered_model_infos
103
+
104
+ # Compute the average of a metric for each model,
105
+ def render_df(self, metric="ndcg_at_5", benchmark_version=1):
106
+ model_res = {}
107
+ filtered_model_infos = self.filter_models_by_benchmark(benchmark_version)
108
+ if len(filtered_model_infos) > 0:
109
+ for model in filtered_model_infos.keys():
110
+ res = filtered_model_infos[model]["results"]
111
+ dataset_res = {}
112
+ keywords = DEPRECATED_VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else DEPRECATED_VIDORE_2_DATASETS_KEYWORDS
113
+ for dataset in res.keys():
114
+ if not any(keyword in dataset for keyword in keywords):
115
+ continue
116
+
117
+ dataset_nickname = deprecated_get_datasets_nickname(dataset)
118
+ dataset_res[dataset_nickname] = res[dataset][metric]
119
+ model_res[model] = dataset_res
120
+
121
+ df = pd.DataFrame(model_res).T
122
+
123
+ return df
124
+ return pd.DataFrame()
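`DeprecatedModelHandler` is a copy of the previous handler and still consumes the legacy `results.json` files whose format was removed from the app.py documentation above: a flat mapping from dataset names (matching the keywords in `DEPRECATED_VIDORE_DATASETS_KEYWORDS`) to per-metric scores. A minimal illustration, with invented dataset ids and scores:

```python
# Illustrative legacy results.json content; dataset ids and scores are made up.
legacy_results = {
    "vidore/arxivqa_test_subsampled": {"ndcg_at_5": 0.87, "recall_at_10": 0.95},
    "vidore/docvqa_test_subsampled": {"ndcg_at_5": 0.54, "recall_at_10": 0.72},
}

# DeprecatedModelHandler.render_df() reads one metric per matching dataset:
score = legacy_results["vidore/arxivqa_test_subsampled"]["ndcg_at_5"]
```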
data/model_handler.py CHANGED
@@ -1,98 +1,66 @@
1
- import json
 
2
  import os
3
- from typing import Any, Dict
4
-
5
  import pandas as pd
6
- from huggingface_hub import HfApi, hf_hub_download, metadata_load
7
-
8
- from .dataset_handler import VIDORE_2_DATASETS_KEYWORDS, VIDORE_DATASETS_KEYWORDS, get_datasets_nickname
9
-
10
- BLOCKLIST = ["impactframes"]
11
 
 
12
 
13
  class ModelHandler:
14
- def __init__(self, model_infos_path="model_infos.json"):
15
- self.api = HfApi()
16
- self.model_infos_path = model_infos_path
17
- self.model_infos = self._load_model_infos()
18
-
19
- def _load_model_infos(self) -> Dict:
20
- if os.path.exists(self.model_infos_path):
21
- with open(self.model_infos_path) as f:
22
- return json.load(f)
23
- return {}
24
-
25
- def _save_model_infos(self):
26
- with open(self.model_infos_path, "w") as f:
27
- json.dump(self.model_infos, f)
28
-
29
- def _are_results_in_new_vidore_format(self, results: Dict[str, Any]) -> bool:
30
- return "metadata" in results and "metrics" in results
31
 
32
- def _is_baseline_repo(self, repo_id: str) -> bool:
33
- return repo_id == "vidore/baseline-results"
34
 
35
- def sanitize_model_name(self, model_name):
36
- return model_name.replace("/", "_").replace(".", "-thisisapoint-")
37
-
38
- def fuze_model_infos(self, model_name, results):
39
- for dataset, metrics in results.items():
40
- if dataset not in self.model_infos[model_name]["results"].keys():
41
- self.model_infos[model_name]["results"][dataset] = metrics
42
- else:
43
- continue
44
 
45
  def get_vidore_data(self, metric="ndcg_at_5"):
46
- models = self.api.list_models(filter="vidore")
47
- repositories = [model.modelId for model in models] # type: ignore
48
-
49
- # Sort repositories to process non-baseline repos first (to prioritize their results)
50
- repositories.sort(key=lambda x: self._is_baseline_repo(x))
51
-
52
- for repo_id in repositories:
53
- org_name = repo_id.split("/")[0]
54
- if org_name in BLOCKLIST:
55
- continue
56
- files = [f for f in self.api.list_repo_files(repo_id) if f.endswith("_metrics.json") or f == "results.json"]
57
-
58
- if len(files) == 0:
59
- continue
 
 
60
  else:
61
- for file in files:
62
- if file.endswith("results.json"):
63
- model_name = repo_id.replace("/", "_").replace(".", "-thisisapoint-")
64
- else:
65
- model_name = file.split("_metrics.json")[0]
66
- model_name = model_name.replace("/", "_").replace(".", "-thisisapoint-")
67
-
68
- # Skip if the model is from baseline and we already have results
69
-
70
- readme_path = hf_hub_download(repo_id, filename="README.md")
71
- meta = metadata_load(readme_path)
72
- try:
73
- result_path = hf_hub_download(repo_id, filename=file)
74
-
75
- with open(result_path) as f:
76
- results = json.load(f)
77
-
78
- if self._are_results_in_new_vidore_format(results):
79
- metadata = results["metadata"]
80
- results = results["metrics"]
81
-
82
- # Handles the case where the model is both in baseline and outside of it
83
- # (prioritizes the non-baseline results)
84
- if self._is_baseline_repo(repo_id) and self.sanitize_model_name(model_name) in self.model_infos:
85
- self.fuze_model_infos(model_name, results)
86
-
87
- self.model_infos[model_name] = {"meta": meta, "results": results}
88
- except Exception as e:
89
- print(f"Error loading {model_name} - {e}")
90
- continue
91
 
92
- # In order to keep only models relevant to a benchmark
93
  def filter_models_by_benchmark(self, benchmark_version=1):
94
  filtered_model_infos = {}
95
- keywords = VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else VIDORE_2_DATASETS_KEYWORDS
96
 
97
  for model, info in self.model_infos.items():
98
  results = info["results"]
@@ -101,52 +69,26 @@ class ModelHandler:
101
 
102
  return filtered_model_infos
103
 
104
- # Compute the average of a metric for each model,
105
- def compute_averages(self, metric="ndcg_at_5", benchmark_version=1):
106
  model_res = {}
107
  filtered_model_infos = self.filter_models_by_benchmark(benchmark_version)
108
  if len(filtered_model_infos) > 0:
109
  for model in filtered_model_infos.keys():
110
  res = filtered_model_infos[model]["results"]
111
  dataset_res = {}
112
- keywords = VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else VIDORE_2_DATASETS_KEYWORDS
113
  for dataset in res.keys():
114
  if not any(keyword in dataset for keyword in keywords):
115
  continue
116
-
117
  dataset_nickname = get_datasets_nickname(dataset)
118
- dataset_res[dataset_nickname] = res[dataset][metric]
119
  model_res[model] = dataset_res
120
 
121
  df = pd.DataFrame(model_res).T
122
 
123
  return df
124
  return pd.DataFrame()
125
-
126
- @staticmethod
127
- def add_rank(df, benchmark_version=1):
128
- df.fillna(0.0, inplace=True)
129
- cols_to_rank = [
130
- col
131
- for col in df.columns
132
- if col
133
- not in [
134
- "Model",
135
- "Model Size (Million Parameters)",
136
- "Memory Usage (GB, fp32)",
137
- "Embedding Dimensions",
138
- "Max Tokens",
139
- ]
140
- ]
141
-
142
- if len(cols_to_rank) == 1:
143
- df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
144
- else:
145
- df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
146
- df.sort_values("Average", ascending=False, inplace=True)
147
- df.insert(0, "Rank", list(range(1, len(df) + 1)))
148
- # multiply values by 100 if they are floats and round to 1 decimal place
149
- for col in df.columns:
150
- if df[col].dtype == "float64":
151
- df[col] = df[col].apply(lambda x: round(x * 100, 1))
152
- return df
 
1
+ from git import Repo
2
+ import shutil
3
  import os
4
+ import json
 
5
  import pandas as pd
 
 
6
 
7
+ from .dataset_handler import VIDORE_V1_MTEB_NAMES, VIDORE_V2_MTEB_NAMES, get_datasets_nickname
8
 
9
  class ModelHandler:
 
 
10
 
11
+ def __init__(self):
12
+ self.model_infos = {}
13
 
14
+ @staticmethod
15
+ def get_folders(dir_path):
16
+ return sorted([
17
+ path_
18
+ for path_ in os.listdir(dir_path)
19
+ if os.path.isdir(os.path.join(dir_path, path_))
20
+ ])
 
 
21
 
22
  def get_vidore_data(self, metric="ndcg_at_5"):
23
+ repo_url = "https://github.com/embeddings-benchmark/results.git"
24
+ local_path = "./results"
25
+ folder_of_interest = "results"
26
+
27
+ if os.path.exists(local_path):
28
+ repo = Repo(local_path)
29
+ origin = repo.remotes.origin
30
+ origin.pull()
31
+ else:
32
+ Repo.clone_from(repo_url, local_path, depth=1)
33
+
34
+ model_names = self.get_folders(os.path.join(local_path, folder_of_interest))
35
+ for model_name in model_names:
36
+ revisions = self.get_folders(os.path.join(local_path, folder_of_interest, model_name))
37
+ first_revision = revisions[0]
38
+ result_filenames = [
39
+ result_filename
40
+ for result_filename in os.listdir(os.path.join(local_path, folder_of_interest, model_name, first_revision))
41
+ # if result_filename.endswith(".json") and result_filename != "model_meta.json"
42
+ ]
43
+ if "model_meta.json" in result_filenames:
44
+ with open(os.path.join(local_path, folder_of_interest, model_name, first_revision, "model_meta.json"), "r") as f:
45
+ meta = json.load(f)
46
  else:
47
+ meta = {}
48
+ results = {}
49
+ if all(f"{v1_dataset_name}.json" in result_filenames for v1_dataset_name in VIDORE_V1_MTEB_NAMES):
50
+ for v1_dataset_name in VIDORE_V1_MTEB_NAMES:
51
+ with open(os.path.join(local_path, folder_of_interest, model_name, first_revision, f"{v1_dataset_name}.json"), "r") as f:
52
+ results[v1_dataset_name] = json.load(f)
53
+ if all(f"{v2_dataset_name}.json" in result_filenames for v2_dataset_name in VIDORE_V2_MTEB_NAMES):
54
+ for v2_dataset_name in VIDORE_V2_MTEB_NAMES:
55
+ with open(os.path.join(local_path, folder_of_interest, model_name, first_revision, f"{v2_dataset_name}.json"), "r") as f:
56
+ results[v2_dataset_name] = json.load(f)
57
+ if model_name not in self.model_infos:
58
+ self.model_infos[model_name] = {}
59
+ self.model_infos[model_name] = {"meta": meta, "results": results}
 
 
60
 
 
61
  def filter_models_by_benchmark(self, benchmark_version=1):
62
  filtered_model_infos = {}
63
+ keywords = VIDORE_V1_MTEB_NAMES if benchmark_version == 1 else VIDORE_V2_MTEB_NAMES
64
 
65
  for model, info in self.model_infos.items():
66
  results = info["results"]
 
69
 
70
  return filtered_model_infos
71
 
72
+ def render_df(self, metric="ndcg_at_5", benchmark_version=1):
 
73
  model_res = {}
74
  filtered_model_infos = self.filter_models_by_benchmark(benchmark_version)
75
  if len(filtered_model_infos) > 0:
76
  for model in filtered_model_infos.keys():
77
  res = filtered_model_infos[model]["results"]
78
  dataset_res = {}
79
+ keywords = VIDORE_V1_MTEB_NAMES if benchmark_version == 1 else VIDORE_V2_MTEB_NAMES
80
+ if "n_parameters" in filtered_model_infos[model]["meta"]:
81
+ dataset_res["Model Size (Million Parameters)"] = filtered_model_infos[model]["meta"]["n_parameters"] // 1_000_000
82
+ else:
83
+ dataset_res["Model Size (Million Parameters)"] = None
84
  for dataset in res.keys():
85
  if not any(keyword in dataset for keyword in keywords):
86
  continue
 
87
  dataset_nickname = get_datasets_nickname(dataset)
88
+ dataset_res[dataset_nickname] = res[dataset]["scores"]["test"][0][metric]
89
  model_res[model] = dataset_res
90
 
91
  df = pd.DataFrame(model_res).T
92
 
93
  return df
94
  return pd.DataFrame()
 
 
 
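For reference, the new `ModelHandler.render_df` above expects each per-task file cloned from the results repository to expose the metric under `["scores"]["test"][0]`, and reads the parameter count from `model_meta.json`. A minimal reading example (the path, revision, and numbers are placeholders):

```python
# Illustrative only: mirrors the lookups performed in ModelHandler.render_df() above.
import json

base = "results/results/vidore__colpali-v1.3/<revision>"  # placeholder revision folder

with open(f"{base}/VidoreArxivQARetrieval.json") as f:
    task_result = json.load(f)
ndcg_at_5 = task_result["scores"]["test"][0]["ndcg_at_5"]  # same access as render_df()

with open(f"{base}/model_meta.json") as f:
    meta = json.load(f)
size_millions = meta["n_parameters"] // 1_000_000 if "n_parameters" in meta else None
```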
requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ gitpython
2
+ gradio