Spaces:
Runtime error
Runtime error
Commit
·
f61dd83
1
Parent(s):
099d855
Cache everything; Add rankings everywhere; Automate num dataset/score computation
Browse files
app.py
CHANGED
|
@@ -393,6 +393,7 @@ MODELS_TO_SKIP = {
|
|
| 393 |
"anttip/ct2fast-e5-small-v2-hfie",
|
| 394 |
"newsrx/instructor-large",
|
| 395 |
"newsrx/instructor-xl",
|
|
|
|
| 396 |
}
|
| 397 |
|
| 398 |
|
|
@@ -471,7 +472,20 @@ def get_dim_seq_size(model):
|
|
| 471 |
size = round(size["metadata"]["total_size"] / 1e9, 2)
|
| 472 |
return dim, seq, size
|
| 473 |
|
| 474 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
api = HfApi()
|
| 476 |
models = api.list_models(filter="mteb")
|
| 477 |
# Initialize list to models that we cannot fetch metadata from
|
|
@@ -532,6 +546,8 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
| 532 |
cols = sorted(list(df.columns))
|
| 533 |
cols.insert(0, cols.pop(cols.index("Model")))
|
| 534 |
df = df[cols]
|
|
|
|
|
|
|
| 535 |
if fillna:
|
| 536 |
df.fillna("", inplace=True)
|
| 537 |
return df
|
|
@@ -551,10 +567,8 @@ def get_mteb_average():
|
|
| 551 |
langs=["en", "en-en"],
|
| 552 |
fillna=False,
|
| 553 |
add_emb_dim=True,
|
|
|
|
| 554 |
)
|
| 555 |
-
# Approximation (Missing Bitext Mining & including some nans)
|
| 556 |
-
NUM_SCORES = DATA_OVERALL.shape[0] * DATA_OVERALL.shape[1]
|
| 557 |
-
|
| 558 |
# Debugging:
|
| 559 |
# DATA_OVERALL.to_csv("overall.csv")
|
| 560 |
|
|
@@ -572,32 +586,51 @@ def get_mteb_average():
|
|
| 572 |
|
| 573 |
DATA_OVERALL = DATA_OVERALL.round(2)
|
| 574 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
# Fill NaN after averaging
|
| 576 |
DATA_OVERALL.fillna("", inplace=True)
|
| 577 |
|
| 578 |
-
DATA_CLASSIFICATION_EN = DATA_OVERALL[["Model"] + TASK_LIST_CLASSIFICATION]
|
| 579 |
-
DATA_CLUSTERING = DATA_OVERALL[["Model"] + TASK_LIST_CLUSTERING]
|
| 580 |
-
DATA_PAIR_CLASSIFICATION = DATA_OVERALL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION]
|
| 581 |
-
DATA_RERANKING = DATA_OVERALL[["Model"] + TASK_LIST_RERANKING]
|
| 582 |
-
DATA_RETRIEVAL = DATA_OVERALL[["Model"] + TASK_LIST_RETRIEVAL]
|
| 583 |
-
DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
|
| 584 |
-
DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
|
| 585 |
-
|
| 586 |
DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
|
| 587 |
|
| 588 |
return DATA_OVERALL
|
| 589 |
|
| 590 |
get_mteb_average()
|
| 591 |
-
|
| 592 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
|
|
|
|
| 594 |
with block:
|
| 595 |
gr.Markdown(f"""
|
| 596 |
Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗 Refer to the [MTEB paper](https://arxiv.org/abs/2210.07316) for details on metrics, tasks and models.
|
| 597 |
|
| 598 |
-
- **Total Datasets**:
|
| 599 |
- **Total Languages**: 112
|
| 600 |
-
- **Total Scores**:
|
| 601 |
- **Total Models**: {len(DATA_OVERALL)}
|
| 602 |
""")
|
| 603 |
with gr.Tabs():
|
|
@@ -629,7 +662,8 @@ with block:
|
|
| 629 |
""")
|
| 630 |
with gr.Row():
|
| 631 |
data_bitext_mining = gr.components.Dataframe(
|
| 632 |
-
|
|
|
|
| 633 |
type="pandas",
|
| 634 |
)
|
| 635 |
with gr.Row():
|
|
@@ -652,7 +686,7 @@ with block:
|
|
| 652 |
with gr.Row():
|
| 653 |
data_classification_en = gr.components.Dataframe(
|
| 654 |
DATA_CLASSIFICATION_EN,
|
| 655 |
-
datatype=["markdown"] + ["number"] * len(DATA_CLASSIFICATION_EN.columns),
|
| 656 |
type="pandas",
|
| 657 |
)
|
| 658 |
with gr.Row():
|
|
@@ -677,7 +711,8 @@ with block:
|
|
| 677 |
""")
|
| 678 |
with gr.Row():
|
| 679 |
data_classification = gr.components.Dataframe(
|
| 680 |
-
|
|
|
|
| 681 |
type="pandas",
|
| 682 |
)
|
| 683 |
with gr.Row():
|
|
@@ -700,7 +735,7 @@ with block:
|
|
| 700 |
with gr.Row():
|
| 701 |
data_clustering = gr.components.Dataframe(
|
| 702 |
DATA_CLUSTERING,
|
| 703 |
-
datatype=["markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
|
| 704 |
type="pandas",
|
| 705 |
)
|
| 706 |
with gr.Row():
|
|
@@ -724,7 +759,8 @@ with block:
|
|
| 724 |
""")
|
| 725 |
with gr.Row():
|
| 726 |
data_clustering_de = gr.components.Dataframe(
|
| 727 |
-
|
|
|
|
| 728 |
type="pandas",
|
| 729 |
)
|
| 730 |
with gr.Row():
|
|
@@ -748,7 +784,7 @@ with block:
|
|
| 748 |
with gr.Row():
|
| 749 |
data_pair_classification = gr.components.Dataframe(
|
| 750 |
DATA_PAIR_CLASSIFICATION,
|
| 751 |
-
datatype=["markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION.columns),
|
| 752 |
type="pandas",
|
| 753 |
)
|
| 754 |
with gr.Row():
|
|
@@ -771,7 +807,7 @@ with block:
|
|
| 771 |
data_retrieval = gr.components.Dataframe(
|
| 772 |
DATA_RETRIEVAL,
|
| 773 |
# Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
|
| 774 |
-
datatype=["markdown"] + ["number"] * len(DATA_RETRIEVAL.columns) * 2,
|
| 775 |
type="pandas",
|
| 776 |
)
|
| 777 |
with gr.Row():
|
|
@@ -791,7 +827,7 @@ with block:
|
|
| 791 |
with gr.Row():
|
| 792 |
data_reranking = gr.components.Dataframe(
|
| 793 |
DATA_RERANKING,
|
| 794 |
-
datatype=["markdown"] + ["number"] * len(DATA_RERANKING.columns),
|
| 795 |
type="pandas",
|
| 796 |
)
|
| 797 |
with gr.Row():
|
|
@@ -813,7 +849,7 @@ with block:
|
|
| 813 |
with gr.Row():
|
| 814 |
data_sts_en = gr.components.Dataframe(
|
| 815 |
DATA_STS_EN,
|
| 816 |
-
datatype=["markdown"] + ["number"] * len(DATA_STS_EN.columns),
|
| 817 |
type="pandas",
|
| 818 |
)
|
| 819 |
with gr.Row():
|
|
@@ -835,7 +871,8 @@ with block:
|
|
| 835 |
""")
|
| 836 |
with gr.Row():
|
| 837 |
data_sts = gr.components.Dataframe(
|
| 838 |
-
|
|
|
|
| 839 |
type="pandas",
|
| 840 |
)
|
| 841 |
with gr.Row():
|
|
@@ -853,7 +890,7 @@ with block:
|
|
| 853 |
with gr.Row():
|
| 854 |
data_summarization = gr.components.Dataframe(
|
| 855 |
DATA_SUMMARIZATION,
|
| 856 |
-
datatype=["markdown"] + ["number"] * 2,
|
| 857 |
type="pandas",
|
| 858 |
)
|
| 859 |
with gr.Row():
|
|
@@ -880,8 +917,9 @@ with block:
|
|
| 880 |
}
|
| 881 |
```
|
| 882 |
""")
|
| 883 |
-
# Running the
|
| 884 |
-
# This is optional - If deactivated the data
|
|
|
|
| 885 |
block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
|
| 886 |
block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
|
| 887 |
block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
|
|
@@ -893,6 +931,7 @@ with block:
|
|
| 893 |
block.load(get_mteb_data, inputs=[task_sts_en, lang_sts_en], outputs=data_sts_en)
|
| 894 |
block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
|
| 895 |
block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)
|
|
|
|
| 896 |
|
| 897 |
block.queue(concurrency_count=40, max_size=10)
|
| 898 |
block.launch()
|
|
|
|
| 393 |
"anttip/ct2fast-e5-small-v2-hfie",
|
| 394 |
"newsrx/instructor-large",
|
| 395 |
"newsrx/instructor-xl",
|
| 396 |
+
"dmlls/all-mpnet-base-v2",
|
| 397 |
}
|
| 398 |
|
| 399 |
|
|
|
|
| 472 |
size = round(size["metadata"]["total_size"] / 1e9, 2)
|
| 473 |
return dim, seq, size
|
| 474 |
|
| 475 |
+
def add_rank(df):
    """Return a ranked, rounded copy of a leaderboard table.

    Every column other than "Model", "Model Size (GB)", "Embedding Dimensions"
    and "Sequence Length" is treated as a score column.

    * With exactly one score column, rows are sorted by it (descending).
    * Otherwise an "Average" column (row-wise mean of the score columns) is
      inserted at position 1 and rows are sorted by it (descending). The mean
      uses ``skipna=False``, so a row with any missing score gets a missing
      average.

    A 1-based "Rank" column is then inserted at position 0, numeric values are
    rounded to 2 decimals and remaining NaN cells are replaced by "".

    Args:
        df: pandas DataFrame holding one row per model.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    meta_cols = {"Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length"}
    cols_to_rank = [col for col in df.columns if col not in meta_cols]

    # Work on a copy so the caller's frame (often a column slice of a bigger
    # table) is not reordered or given extra columns as a side effect.
    ranked = df.copy()

    if len(cols_to_rank) == 1:
        sort_col = cols_to_rank[0]
    else:
        ranked.insert(1, "Average", ranked[cols_to_rank].mean(axis=1, skipna=False))
        sort_col = "Average"

    ranked = ranked.sort_values(sort_col, ascending=False)
    ranked.insert(0, "Rank", list(range(1, len(ranked) + 1)))

    # Fill NaN only after averaging/rounding, otherwise "" would break the mean
    return ranked.round(2).fillna("")
|
| 487 |
+
|
| 488 |
+
def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC, rank=True):
|
| 489 |
api = HfApi()
|
| 490 |
models = api.list_models(filter="mteb")
|
| 491 |
# Initialize list to models that we cannot fetch metadata from
|
|
|
|
| 546 |
cols = sorted(list(df.columns))
|
| 547 |
cols.insert(0, cols.pop(cols.index("Model")))
|
| 548 |
df = df[cols]
|
| 549 |
+
if rank:
|
| 550 |
+
df = add_rank(df)
|
| 551 |
if fillna:
|
| 552 |
df.fillna("", inplace=True)
|
| 553 |
return df
|
|
|
|
| 567 |
langs=["en", "en-en"],
|
| 568 |
fillna=False,
|
| 569 |
add_emb_dim=True,
|
| 570 |
+
rank=False,
|
| 571 |
)
|
|
|
|
|
|
|
|
|
|
| 572 |
# Debugging:
|
| 573 |
# DATA_OVERALL.to_csv("overall.csv")
|
| 574 |
|
|
|
|
| 586 |
|
| 587 |
DATA_OVERALL = DATA_OVERALL.round(2)
|
| 588 |
|
| 589 |
+
DATA_CLASSIFICATION_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLASSIFICATION])
|
| 590 |
+
DATA_CLUSTERING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLUSTERING])
|
| 591 |
+
DATA_PAIR_CLASSIFICATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION])
|
| 592 |
+
DATA_RERANKING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RERANKING])
|
| 593 |
+
DATA_RETRIEVAL = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RETRIEVAL])
|
| 594 |
+
DATA_STS_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_STS])
|
| 595 |
+
DATA_SUMMARIZATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION])
|
| 596 |
+
|
| 597 |
# Fill NaN after averaging
|
| 598 |
DATA_OVERALL.fillna("", inplace=True)
|
| 599 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 600 |
DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
|
| 601 |
|
| 602 |
return DATA_OVERALL
|
| 603 |
|
| 604 |
get_mteb_average()
|
| 605 |
+
# Build the remaining per-task tables once at import time; they are handed to
# the Dataframe components below as their initial values.
DATA_BITEXT_MINING = get_mteb_data(["BitextMining"])
DATA_CLASSIFICATION = get_mteb_data(["Classification"])
DATA_CLUSTERING_GERMAN = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
DATA_STS = get_mteb_data(["STS"])

# Exact count: add up every real score cell of every task table
NUM_SCORES = 0
DATASETS = []
# LANGUAGES = []
for d in [DATA_BITEXT_MINING, DATA_CLASSIFICATION, DATA_CLUSTERING, DATA_CLUSTERING_GERMAN, DATA_PAIR_CLASSIFICATION, DATA_RERANKING, DATA_RETRIEVAL, DATA_STS, DATA_SUMMARIZATION]:
    # Leading non-score columns: rank & model name, plus "Average" when present
    cols_to_ignore = 3 if "Average" in d.columns else 2
    scores = d.iloc[:, cols_to_ignore:]
    # These tables have already had NaN replaced by "" (add_rank / fillna=True),
    # so notna() alone would count empty cells as scores; exclude "" as well.
    NUM_SCORES += (scores.notna() & (scores != "")).sum().sum()
    # Do not count different language versions as different datasets
    DATASETS += [i.split(" ")[0] for i in scores.columns]
    # LANGUAGES += [i.split(" ")[-1] for i in scores.columns]

NUM_DATASETS = len(set(DATASETS))
|
| 624 |
+
# NUM_LANGUAGES = len(set(LANGUAGES))
|
| 625 |
|
| 626 |
+
block = gr.Blocks()
|
| 627 |
with block:
|
| 628 |
gr.Markdown(f"""
|
| 629 |
Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗 Refer to the [MTEB paper](https://arxiv.org/abs/2210.07316) for details on metrics, tasks and models.
|
| 630 |
|
| 631 |
+
- **Total Datasets**: {NUM_DATASETS}
|
| 632 |
- **Total Languages**: 112
|
| 633 |
+
- **Total Scores**: {NUM_SCORES}
|
| 634 |
- **Total Models**: {len(DATA_OVERALL)}
|
| 635 |
""")
|
| 636 |
with gr.Tabs():
|
|
|
|
| 662 |
""")
|
| 663 |
with gr.Row():
|
| 664 |
data_bitext_mining = gr.components.Dataframe(
|
| 665 |
+
DATA_BITEXT_MINING,
|
| 666 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING.columns),
|
| 667 |
type="pandas",
|
| 668 |
)
|
| 669 |
with gr.Row():
|
|
|
|
| 686 |
with gr.Row():
|
| 687 |
data_classification_en = gr.components.Dataframe(
|
| 688 |
DATA_CLASSIFICATION_EN,
|
| 689 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_EN.columns),
|
| 690 |
type="pandas",
|
| 691 |
)
|
| 692 |
with gr.Row():
|
|
|
|
| 711 |
""")
|
| 712 |
with gr.Row():
|
| 713 |
data_classification = gr.components.Dataframe(
|
| 714 |
+
DATA_CLASSIFICATION,
|
| 715 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION) * 10,
|
| 716 |
type="pandas",
|
| 717 |
)
|
| 718 |
with gr.Row():
|
|
|
|
| 735 |
with gr.Row():
|
| 736 |
data_clustering = gr.components.Dataframe(
|
| 737 |
DATA_CLUSTERING,
|
| 738 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
|
| 739 |
type="pandas",
|
| 740 |
)
|
| 741 |
with gr.Row():
|
|
|
|
| 759 |
""")
|
| 760 |
with gr.Row():
|
| 761 |
data_clustering_de = gr.components.Dataframe(
|
| 762 |
+
DATA_CLUSTERING_GERMAN,
|
| 763 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_GERMAN.columns) * 2,
|
| 764 |
type="pandas",
|
| 765 |
)
|
| 766 |
with gr.Row():
|
|
|
|
| 784 |
with gr.Row():
|
| 785 |
data_pair_classification = gr.components.Dataframe(
|
| 786 |
DATA_PAIR_CLASSIFICATION,
|
| 787 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION.columns),
|
| 788 |
type="pandas",
|
| 789 |
)
|
| 790 |
with gr.Row():
|
|
|
|
| 807 |
data_retrieval = gr.components.Dataframe(
|
| 808 |
DATA_RETRIEVAL,
|
| 809 |
# Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
|
| 810 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_RETRIEVAL.columns) * 2,
|
| 811 |
type="pandas",
|
| 812 |
)
|
| 813 |
with gr.Row():
|
|
|
|
| 827 |
with gr.Row():
|
| 828 |
data_reranking = gr.components.Dataframe(
|
| 829 |
DATA_RERANKING,
|
| 830 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_RERANKING.columns),
|
| 831 |
type="pandas",
|
| 832 |
)
|
| 833 |
with gr.Row():
|
|
|
|
| 849 |
with gr.Row():
|
| 850 |
data_sts_en = gr.components.Dataframe(
|
| 851 |
DATA_STS_EN,
|
| 852 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_STS_EN.columns),
|
| 853 |
type="pandas",
|
| 854 |
)
|
| 855 |
with gr.Row():
|
|
|
|
| 871 |
""")
|
| 872 |
with gr.Row():
|
| 873 |
data_sts = gr.components.Dataframe(
|
| 874 |
+
DATA_STS,
|
| 875 |
+
datatype=["number", "markdown"] + ["number"] * len(DATA_STS.columns) * 2,
|
| 876 |
type="pandas",
|
| 877 |
)
|
| 878 |
with gr.Row():
|
|
|
|
| 890 |
with gr.Row():
|
| 891 |
data_summarization = gr.components.Dataframe(
|
| 892 |
DATA_SUMMARIZATION,
|
| 893 |
+
datatype=["number", "markdown"] + ["number"] * 2,
|
| 894 |
type="pandas",
|
| 895 |
)
|
| 896 |
with gr.Row():
|
|
|
|
| 917 |
}
|
| 918 |
```
|
| 919 |
""")
|
| 920 |
+
# Running the functions on page load in addition to when the button is clicked
|
| 921 |
+
# This is optional - If deactivated the data loaded at "Build time" is shown like for Overall tab
|
| 922 |
+
"""
|
| 923 |
block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
|
| 924 |
block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
|
| 925 |
block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
|
|
|
|
| 931 |
block.load(get_mteb_data, inputs=[task_sts_en, lang_sts_en], outputs=data_sts_en)
|
| 932 |
block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
|
| 933 |
block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)
|
| 934 |
+
"""
|
| 935 |
|
| 936 |
block.queue(concurrency_count=40, max_size=10)
|
| 937 |
block.launch()
|