Spaces:
Runtime error
Runtime error
Commit
·
dbfa15a
1
Parent(s):
3ffdc42
Add emojis
Browse files
app.py
CHANGED
|
@@ -3,8 +3,6 @@ import pandas as pd
|
|
| 3 |
from huggingface_hub import HfApi, hf_hub_download
|
| 4 |
from huggingface_hub.repocard import metadata_load
|
| 5 |
|
| 6 |
-
path = f"https://huggingface.co/api/spaces"
|
| 7 |
-
|
| 8 |
TASKS = [
|
| 9 |
"BitextMining",
|
| 10 |
"Classification",
|
|
@@ -185,15 +183,15 @@ def get_mteb_average(get_all_avgs=False):
|
|
| 185 |
cast_to_str=False
|
| 186 |
)
|
| 187 |
|
| 188 |
-
DATA_OVERALL.insert(1, "Average", DATA_OVERALL[TASK_LIST_EN].mean(axis=1, skipna=False))
|
| 189 |
-
DATA_OVERALL.insert(2, "Classification Average", DATA_OVERALL[TASK_LIST_CLASSIFICATION].mean(axis=1, skipna=False))
|
| 190 |
-
DATA_OVERALL.insert(3, "Clustering Average", DATA_OVERALL[TASK_LIST_CLUSTERING].mean(axis=1, skipna=False))
|
| 191 |
-
DATA_OVERALL.insert(4, "Pair Classification Average", DATA_OVERALL[TASK_LIST_PAIR_CLASSIFICATION].mean(axis=1, skipna=False))
|
| 192 |
-
DATA_OVERALL.insert(5, "Reranking Average", DATA_OVERALL[TASK_LIST_RERANKING].mean(axis=1, skipna=False))
|
| 193 |
-
DATA_OVERALL.insert(6, "Retrieval Average", DATA_OVERALL[TASK_LIST_RETRIEVAL].mean(axis=1, skipna=False))
|
| 194 |
-
DATA_OVERALL.insert(7, "STS Average", DATA_OVERALL[TASK_LIST_STS].mean(axis=1, skipna=False))
|
| 195 |
-
DATA_OVERALL.insert(8, "Summarization Average", DATA_OVERALL[TASK_LIST_SUMMARIZATION].mean(axis=1, skipna=False))
|
| 196 |
-
DATA_OVERALL.sort_values("Average", ascending=False, inplace=True)
|
| 197 |
# Start ranking from 1
|
| 198 |
DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))
|
| 199 |
|
|
@@ -207,7 +205,7 @@ def get_mteb_average(get_all_avgs=False):
|
|
| 207 |
DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
|
| 208 |
DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
|
| 209 |
|
| 210 |
-
DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Average", "Classification Average", "Clustering Average", "Pair Classification Average", "Reranking Average", "Retrieval Average", "STS Average", "Summarization Average"]]
|
| 211 |
|
| 212 |
return DATA_OVERALL
|
| 213 |
|
|
@@ -216,19 +214,27 @@ block = gr.Blocks()
|
|
| 216 |
|
| 217 |
|
| 218 |
with block:
|
| 219 |
-
gr.Markdown(
|
| 220 |
-
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
with gr.Tabs():
|
| 223 |
with gr.TabItem("Overall"):
|
| 224 |
with gr.Row():
|
| 225 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
with gr.Row():
|
| 227 |
data_overall = gr.components.Dataframe(
|
| 228 |
DATA_OVERALL,
|
| 229 |
datatype=["markdown"] * len(DATA_OVERALL.columns) * 2,
|
| 230 |
type="pandas",
|
| 231 |
-
#col_count=(len(DATA_OVERALL.columns), "fixed"),
|
| 232 |
wrap=True,
|
| 233 |
)
|
| 234 |
with gr.Row():
|
|
@@ -236,7 +242,12 @@ with block:
|
|
| 236 |
data_run.click(get_mteb_average, inputs=None, outputs=data_overall)
|
| 237 |
with gr.TabItem("BitextMining"):
|
| 238 |
with gr.Row():
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
with gr.Row():
|
| 241 |
data_bitext_mining = gr.components.Dataframe(
|
| 242 |
datatype=["markdown"] * 500, # hack when we don't know how many columns
|
|
@@ -253,7 +264,12 @@ with block:
|
|
| 253 |
with gr.TabItem("Classification"):
|
| 254 |
with gr.TabItem("English"):
|
| 255 |
with gr.Row():
|
| 256 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
with gr.Row():
|
| 258 |
data_classification_en = gr.components.Dataframe(
|
| 259 |
DATA_CLASSIFICATION_EN,
|
|
@@ -274,7 +290,12 @@ with block:
|
|
| 274 |
)
|
| 275 |
with gr.TabItem("Multilingual"):
|
| 276 |
with gr.Row():
|
| 277 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
with gr.Row():
|
| 279 |
data_classification = gr.components.Dataframe(
|
| 280 |
datatype=["markdown"] * 500, # hack when we don't know how many columns
|
|
@@ -290,7 +311,12 @@ with block:
|
|
| 290 |
)
|
| 291 |
with gr.TabItem("Clustering"):
|
| 292 |
with gr.Row():
|
| 293 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
with gr.Row():
|
| 295 |
data_clustering = gr.components.Dataframe(
|
| 296 |
DATA_CLUSTERING,
|
|
@@ -308,7 +334,12 @@ with block:
|
|
| 308 |
)
|
| 309 |
with gr.TabItem("Pair Classification"):
|
| 310 |
with gr.Row():
|
| 311 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
with gr.Row():
|
| 313 |
data_pair_classification = gr.components.Dataframe(
|
| 314 |
DATA_PAIR_CLASSIFICATION,
|
|
@@ -318,7 +349,7 @@ with block:
|
|
| 318 |
)
|
| 319 |
with gr.Row():
|
| 320 |
data_run = gr.Button("Refresh")
|
| 321 |
-
task_pair_classification = gr.Variable(value="
|
| 322 |
data_run.click(
|
| 323 |
get_mteb_data,
|
| 324 |
inputs=[task_pair_classification],
|
|
@@ -326,7 +357,12 @@ with block:
|
|
| 326 |
)
|
| 327 |
with gr.TabItem("Retrieval"):
|
| 328 |
with gr.Row():
|
| 329 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
with gr.Row():
|
| 331 |
data_retrieval = gr.components.Dataframe(
|
| 332 |
DATA_RETRIEVAL,
|
|
@@ -341,7 +377,12 @@ with block:
|
|
| 341 |
)
|
| 342 |
with gr.TabItem("Reranking"):
|
| 343 |
with gr.Row():
|
| 344 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
with gr.Row():
|
| 346 |
data_reranking = gr.components.Dataframe(
|
| 347 |
DATA_RERANKING,
|
|
@@ -359,7 +400,12 @@ with block:
|
|
| 359 |
with gr.TabItem("STS"):
|
| 360 |
with gr.TabItem("English"):
|
| 361 |
with gr.Row():
|
| 362 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
with gr.Row():
|
| 364 |
data_sts_en = gr.components.Dataframe(
|
| 365 |
DATA_STS_EN,
|
|
@@ -378,7 +424,12 @@ with block:
|
|
| 378 |
)
|
| 379 |
with gr.TabItem("Multilingual"):
|
| 380 |
with gr.Row():
|
| 381 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
with gr.Row():
|
| 383 |
data_sts = gr.components.Dataframe(
|
| 384 |
datatype=["markdown"] * 50, # hack when we don't know how many columns
|
|
@@ -390,7 +441,12 @@ with block:
|
|
| 390 |
data_run.click(get_mteb_data, inputs=[task_sts], outputs=data_sts)
|
| 391 |
with gr.TabItem("Summarization"):
|
| 392 |
with gr.Row():
|
| 393 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
with gr.Row():
|
| 395 |
data_summarization = gr.components.Dataframe(
|
| 396 |
DATA_SUMMARIZATION,
|
|
@@ -406,13 +462,15 @@ with block:
|
|
| 406 |
inputs=[task_summarization],
|
| 407 |
outputs=data_summarization,
|
| 408 |
)
|
| 409 |
-
#
|
|
|
|
| 410 |
block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
|
| 411 |
block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
|
| 412 |
block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
|
| 413 |
block.load(get_mteb_data, inputs=[task_clustering], outputs=data_clustering)
|
| 414 |
block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
|
| 415 |
block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)
|
|
|
|
| 416 |
block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
|
| 417 |
block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)
|
| 418 |
|
|
|
|
| 3 |
from huggingface_hub import HfApi, hf_hub_download
|
| 4 |
from huggingface_hub.repocard import metadata_load
|
| 5 |
|
|
|
|
|
|
|
| 6 |
TASKS = [
|
| 7 |
"BitextMining",
|
| 8 |
"Classification",
|
|
|
|
| 183 |
cast_to_str=False
|
| 184 |
)
|
| 185 |
|
| 186 |
+
DATA_OVERALL.insert(1, f"Average ({len(TASK_LIST_EN)} datasets)", DATA_OVERALL[TASK_LIST_EN].mean(axis=1, skipna=False))
|
| 187 |
+
DATA_OVERALL.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", DATA_OVERALL[TASK_LIST_CLASSIFICATION].mean(axis=1, skipna=False))
|
| 188 |
+
DATA_OVERALL.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", DATA_OVERALL[TASK_LIST_CLUSTERING].mean(axis=1, skipna=False))
|
| 189 |
+
DATA_OVERALL.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", DATA_OVERALL[TASK_LIST_PAIR_CLASSIFICATION].mean(axis=1, skipna=False))
|
| 190 |
+
DATA_OVERALL.insert(5, f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", DATA_OVERALL[TASK_LIST_RERANKING].mean(axis=1, skipna=False))
|
| 191 |
+
DATA_OVERALL.insert(6, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", DATA_OVERALL[TASK_LIST_RETRIEVAL].mean(axis=1, skipna=False))
|
| 192 |
+
DATA_OVERALL.insert(7, f"STS Average ({len(TASK_LIST_STS)} datasets)", DATA_OVERALL[TASK_LIST_STS].mean(axis=1, skipna=False))
|
| 193 |
+
DATA_OVERALL.insert(8, f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)", DATA_OVERALL[TASK_LIST_SUMMARIZATION].mean(axis=1, skipna=False))
|
| 194 |
+
DATA_OVERALL.sort_values(f"Average ({len(TASK_LIST_EN)} datasets)", ascending=False, inplace=True)
|
| 195 |
# Start ranking from 1
|
| 196 |
DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))
|
| 197 |
|
|
|
|
| 205 |
DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
|
| 206 |
DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
|
| 207 |
|
| 208 |
+
DATA_OVERALL = DATA_OVERALL[["Rank", "Model", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
|
| 209 |
|
| 210 |
return DATA_OVERALL
|
| 211 |
|
|
|
|
| 214 |
|
| 215 |
|
| 216 |
with block:
|
| 217 |
+
gr.Markdown(f"""
|
| 218 |
+
Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗
|
| 219 |
+
|
| 220 |
+
- **Total Scores**: TODO
|
| 221 |
+
- **Total Models**: {len(DATA_OVERALL)}
|
| 222 |
+
- **Total Users**: TODO
|
| 223 |
+
""")
|
| 224 |
with gr.Tabs():
|
| 225 |
with gr.TabItem("Overall"):
|
| 226 |
with gr.Row():
|
| 227 |
+
gr.Markdown("""
|
| 228 |
+
**Overall MTEB English leaderboard 🔮**
|
| 229 |
+
|
| 230 |
+
- **Metric:** Various, refer to task tabs
|
| 231 |
+
- **Languages:** English, refer to task tabs for others
|
| 232 |
+
""")
|
| 233 |
with gr.Row():
|
| 234 |
data_overall = gr.components.Dataframe(
|
| 235 |
DATA_OVERALL,
|
| 236 |
datatype=["markdown"] * len(DATA_OVERALL.columns) * 2,
|
| 237 |
type="pandas",
|
|
|
|
| 238 |
wrap=True,
|
| 239 |
)
|
| 240 |
with gr.Row():
|
|
|
|
| 242 |
data_run.click(get_mteb_average, inputs=None, outputs=data_overall)
|
| 243 |
with gr.TabItem("BitextMining"):
|
| 244 |
with gr.Row():
|
| 245 |
+
gr.Markdown("""
|
| 246 |
+
**Bitext Mining Leaderboard 🎌**
|
| 247 |
+
|
| 248 |
+
- **Metric:** Accuracy (accuracy)
|
| 249 |
+
- **Languages:** 117
|
| 250 |
+
""")
|
| 251 |
with gr.Row():
|
| 252 |
data_bitext_mining = gr.components.Dataframe(
|
| 253 |
datatype=["markdown"] * 500, # hack when we don't know how many columns
|
|
|
|
| 264 |
with gr.TabItem("Classification"):
|
| 265 |
with gr.TabItem("English"):
|
| 266 |
with gr.Row():
|
| 267 |
+
gr.Markdown("""
|
| 268 |
+
**Classification Leaderboard ❤️**
|
| 269 |
+
|
| 270 |
+
- **Metric:** Accuracy (accuracy)
|
| 271 |
+
- **Languages:** English
|
| 272 |
+
""")
|
| 273 |
with gr.Row():
|
| 274 |
data_classification_en = gr.components.Dataframe(
|
| 275 |
DATA_CLASSIFICATION_EN,
|
|
|
|
| 290 |
)
|
| 291 |
with gr.TabItem("Multilingual"):
|
| 292 |
with gr.Row():
|
| 293 |
+
gr.Markdown("""
|
| 294 |
+
**Classification Multilingual Leaderboard 💜💚💙**
|
| 295 |
+
|
| 296 |
+
- **Metric:** Accuracy (accuracy)
|
| 297 |
+
- **Languages:** 51
|
| 298 |
+
""")
|
| 299 |
with gr.Row():
|
| 300 |
data_classification = gr.components.Dataframe(
|
| 301 |
datatype=["markdown"] * 500, # hack when we don't know how many columns
|
|
|
|
| 311 |
)
|
| 312 |
with gr.TabItem("Clustering"):
|
| 313 |
with gr.Row():
|
| 314 |
+
gr.Markdown("""
|
| 315 |
+
**Clustering Leaderboard ✨**
|
| 316 |
+
|
| 317 |
+
- **Metric:** Validity Measure (v_measure)
|
| 318 |
+
- **Languages:** English
|
| 319 |
+
""")
|
| 320 |
with gr.Row():
|
| 321 |
data_clustering = gr.components.Dataframe(
|
| 322 |
DATA_CLUSTERING,
|
|
|
|
| 334 |
)
|
| 335 |
with gr.TabItem("Pair Classification"):
|
| 336 |
with gr.Row():
|
| 337 |
+
gr.Markdown("""
|
| 338 |
+
**Pair Classification Leaderboard 🎭**
|
| 339 |
+
|
| 340 |
+
- **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
|
| 341 |
+
- **Languages:** English
|
| 342 |
+
""")
|
| 343 |
with gr.Row():
|
| 344 |
data_pair_classification = gr.components.Dataframe(
|
| 345 |
DATA_PAIR_CLASSIFICATION,
|
|
|
|
| 349 |
)
|
| 350 |
with gr.Row():
|
| 351 |
data_run = gr.Button("Refresh")
|
| 352 |
+
task_pair_classification = gr.Variable(value="PairClassification")
|
| 353 |
data_run.click(
|
| 354 |
get_mteb_data,
|
| 355 |
inputs=[task_pair_classification],
|
|
|
|
| 357 |
)
|
| 358 |
with gr.TabItem("Retrieval"):
|
| 359 |
with gr.Row():
|
| 360 |
+
gr.Markdown("""
|
| 361 |
+
**Retrieval Leaderboard 🔎**
|
| 362 |
+
|
| 363 |
+
- **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
|
| 364 |
+
- **Languages:** English
|
| 365 |
+
""")
|
| 366 |
with gr.Row():
|
| 367 |
data_retrieval = gr.components.Dataframe(
|
| 368 |
DATA_RETRIEVAL,
|
|
|
|
| 377 |
)
|
| 378 |
with gr.TabItem("Reranking"):
|
| 379 |
with gr.Row():
|
| 380 |
+
gr.Markdown("""
|
| 381 |
+
**Reranking Leaderboard 🥈**
|
| 382 |
+
|
| 383 |
+
- **Metric:** Mean Average Precision (MAP)
|
| 384 |
+
- **Languages:** English
|
| 385 |
+
""")
|
| 386 |
with gr.Row():
|
| 387 |
data_reranking = gr.components.Dataframe(
|
| 388 |
DATA_RERANKING,
|
|
|
|
| 400 |
with gr.TabItem("STS"):
|
| 401 |
with gr.TabItem("English"):
|
| 402 |
with gr.Row():
|
| 403 |
+
gr.Markdown("""
|
| 404 |
+
**STS Leaderboard 🤖**
|
| 405 |
+
|
| 406 |
+
- **Metric:** Spearman correlation based on cosine similarity
|
| 407 |
+
- **Languages:** English
|
| 408 |
+
""")
|
| 409 |
with gr.Row():
|
| 410 |
data_sts_en = gr.components.Dataframe(
|
| 411 |
DATA_STS_EN,
|
|
|
|
| 424 |
)
|
| 425 |
with gr.TabItem("Multilingual"):
|
| 426 |
with gr.Row():
|
| 427 |
+
gr.Markdown("""
|
| 428 |
+
**STS Multilingual Leaderboard 👽**
|
| 429 |
+
|
| 430 |
+
- **Metric:** Spearman correlation based on cosine similarity
|
| 431 |
+
- **Languages:** Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish
|
| 432 |
+
""")
|
| 433 |
with gr.Row():
|
| 434 |
data_sts = gr.components.Dataframe(
|
| 435 |
datatype=["markdown"] * 50, # hack when we don't know how many columns
|
|
|
|
| 441 |
data_run.click(get_mteb_data, inputs=[task_sts], outputs=data_sts)
|
| 442 |
with gr.TabItem("Summarization"):
|
| 443 |
with gr.Row():
|
| 444 |
+
gr.Markdown("""
|
| 445 |
+
**Summarization Leaderboard 📜**
|
| 446 |
+
|
| 447 |
+
- **Metric:** Spearman correlation based on cosine similarity
|
| 448 |
+
- **Languages:** English
|
| 449 |
+
""")
|
| 450 |
with gr.Row():
|
| 451 |
data_summarization = gr.components.Dataframe(
|
| 452 |
DATA_SUMMARIZATION,
|
|
|
|
| 462 |
inputs=[task_summarization],
|
| 463 |
outputs=data_summarization,
|
| 464 |
)
|
| 465 |
+
# Running the function on page load in addition to when the button is clicked
|
| 466 |
+
# This is optional - if deactivated, the data loaded at "Build time" is shown, as on the Overall tab
|
| 467 |
block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
|
| 468 |
block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
|
| 469 |
block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
|
| 470 |
block.load(get_mteb_data, inputs=[task_clustering], outputs=data_clustering)
|
| 471 |
block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
|
| 472 |
block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)
|
| 473 |
+
block.load(get_mteb_data, inputs=[task_sts_en], outputs=data_sts_en)
|
| 474 |
block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
|
| 475 |
block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)
|
| 476 |
|