Update space
- app.py +43 -12
- src/display/utils.py +4 -0
- src/leaderboard/read_evals.py +4 -0
- src/populate.py +2 -2
app.py
CHANGED
@@ -104,7 +104,8 @@ def init_leaderboard(dataframe):
 # model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
 # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
 # model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
-model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
+# model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
+model_result_path = "./src/results/models_2024-10-10-06:18:54.263527.json"
 # model_leaderboard_df = get_model_leaderboard_df(model_result_path)


@@ -131,17 +132,33 @@ with demo:
     gr.HTML(TITLE)
     gr.HTML(SUB_TITLE)
     gr.HTML(EXTERNAL_LINKS)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    # gr.HTML('<p style="font-size:15px;">This is a larger text using HTML in Markdown.</p>')
+    INTRODUCTION_TEXT_FONT_SIZE = 16
+    INTRODUCTION_TEXT = (
+        f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
+        '<strong>Decentralized Arena</strong> automates, scales, and accelerates "<a href="https://lmarena.ai/">Chatbot Arena</a>" '
+        'for large language model (LLM) evaluation across diverse, fine-grained dimensions, '
+        'such as mathematics (algebra, geometry, probability), logical reasoning, social reasoning, biology, chemistry, and more'
+        'The evaluation is decentralized and democratic, with all participating LLMs assessing each other to ensure unbiased and fair results '
+        'With a 95\% correlation to Chatbot Arena\'s overall rankings, the system is fully transparent and reproducible.'
+        '</p>'
+        f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
+        'We actively invite <b>model developers</b> to participate and expedite their benchmarking efforts '
+        'and encourage <b>data stakeholders</b> to freely define and evaluate dimensions of interest for their own objectives.'
+        '</p>'
+    )
+    gr.HTML(INTRODUCTION_TEXT)

     with gr.Tabs(elem_classes="tab-buttons") as tabs:

         with gr.TabItem("π Overview", elem_id="llm-benchmark-tab-table", id=0):

             DESCRIPTION_TEXT = """
-            Total #models: 53 (Last updated: 2024-10-
+            Total #models: 53 (Last updated: 2024-10-09)

-            This page
-            (
+            This page prvovides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks.
+            (Missing values are due to the slow or problemtic model responses, which will be fixed soom.)
             """
             gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

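Since INTRODUCTION_TEXT is assembled from adjacent string literals passed to gr.HTML, a quick way to check the joined markup is to print it outside the Space. The snippet below is a minimal sketch, not part of the commit, and shortens the copy for brevity; note that adjacent literals join with no separator (the fragment ending in "and more" runs straight into "The evaluation"), and that "\%" in a regular Python string keeps the backslash, since "%" needs no escaping.

```python
# Minimal sketch (not part of the commit): assemble and print the HTML string
# to inspect the implicit string concatenation before handing it to gr.HTML.
INTRODUCTION_TEXT_FONT_SIZE = 16

INTRODUCTION_TEXT = (
    f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
    '<strong>Decentralized Arena</strong> automates, scales, and accelerates '
    '"<a href="https://lmarena.ai/">Chatbot Arena</a>" for LLM evaluation '
    'across fine-grained dimensions. '
    '</p>'
)

print(INTRODUCTION_TEXT)  # adjacent literals are concatenated with no separator
```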
@@ -158,6 +175,7 @@ with demo:
                     AutoEvalColumn.rank_reason_logical.name,
                     AutoEvalColumn.rank_reason_social.name,
                     AutoEvalColumn.rank_chemistry.name,
+                    AutoEvalColumn.rank_cpp.name,
                 ],
                 rank_col=[],
             )

@@ -374,19 +392,31 @@ with demo:
             """
             gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

-            with gr.TabItem("
-
-
-
-
+            with gr.TabItem("β C++", elem_id="cpp_subtab", id=0, elem_classes="subtab"):
+
+                leaderboard = overall_leaderboard(
+                    get_model_leaderboard_df(
+                        model_result_path,
+                        benchmark_cols=[
+                            AutoEvalColumn.rank_cpp.name,
+                            AutoEvalColumn.model.name,
+                            AutoEvalColumn.score_cpp.name,
+                            # AutoEvalColumn.sd_cpp.name,
+                            AutoEvalColumn.license.name,
+                            AutoEvalColumn.organization.name,
+                            AutoEvalColumn.knowledge_cutoff.name,
+                        ],
+                        rank_col=[AutoEvalColumn.rank_cpp.name],
+                    )
+                )

-            with gr.TabItem("
+            with gr.TabItem("π Python", elem_id="python_subtab", id=1, elem_classes="subtab"):
                 CURRENT_TEXT = """
                 # Coming soon!
                 """
                 gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

-            with gr.TabItem("
+            with gr.TabItem("β Java", elem_id="java_subtab", id=2, elem_classes="subtab"):
                 CURRENT_TEXT = """
                 # Coming soon!
                 """

@@ -395,6 +425,7 @@ with demo:



+
         with gr.TabItem("π About", elem_id="llm-benchmark-tab-table", id=6):
             ABOUT_TEXT = """
             # About Us
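The new C++/Python/Java entries follow the same nested-TabItem layout as the existing dimensions. As a rough, self-contained illustration of that pattern only: the "Coding" parent tab label and the plain gr.Dataframe below are stand-ins for the repository's overall_leaderboard and get_model_leaderboard_df helpers, which are not shown in this diff.

```python
# Sketch of the nested-tab layout used above. gr.Dataframe stands in for the
# project's overall_leaderboard(get_model_leaderboard_df(...)) call, and the
# "Coding" parent tab label is an assumption for illustration only.
import gradio as gr
import pandas as pd

toy_df = pd.DataFrame(
    {"Rank (C++)": [1, 2], "Model": ["model-a", "model-b"], "Score (C++)": ["52.31", "48.07"]}
)

with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("Coding", id=0):
            with gr.Tabs():
                with gr.TabItem("C++", elem_id="cpp_subtab", id=0, elem_classes="subtab"):
                    gr.Dataframe(value=toy_df)
                with gr.TabItem("Python", elem_id="python_subtab", id=1, elem_classes="subtab"):
                    gr.Markdown("# Coming soon!", elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()
```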
src/display/utils.py
CHANGED
@@ -89,6 +89,10 @@ auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_fa
 auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
 auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])

+auto_eval_column_dict.append(["score_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Score (C++)", "number", True))])
+auto_eval_column_dict.append(["sd_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (C++)", "number", True))])
+auto_eval_column_dict.append(["rank_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (C++)", "number", True))])
+
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
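Each appended triple is (attribute name, annotation type, dataclasses field spec). How these entries are turned into the AutoEvalColumn object referenced in app.py is not part of this diff; a plausible, self-contained sketch, assuming the usual dataclasses.make_dataclass pattern, is:

```python
# Sketch only: ColumnContent and the make_dataclass step are assumptions about
# how auto_eval_column_dict is consumed; the diff itself only appends entries.
from dataclasses import dataclass, field, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = [
    ["score_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Score (C++)", "number", True))],
    ["rank_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (C++)", "number", True))],
]

# Build the dataclass dynamically and read the display names off an instance.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
cols = AutoEvalColumn()
print(cols.rank_cpp.name)  # -> "Rank (C++)"
```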
src/leaderboard/read_evals.py
CHANGED
@@ -189,6 +189,10 @@ class ModelResult:
             AutoEvalColumn.sd_chemistry.name: self.results.get("Chemistry").get("Standard Deviation", None) if self.results.get("Chemistry") else None,
             AutoEvalColumn.rank_chemistry.name: self.results.get("Chemistry").get("Rank", None) if self.results.get("Chemistry") else None,

+            AutoEvalColumn.score_cpp.name: self.results.get("CPP").get("Average Score", None) if self.results.get("CPP") else None,
+            AutoEvalColumn.sd_cpp.name: self.results.get("CPP").get("Standard Deviation", None) if self.results.get("CPP") else None,
+            AutoEvalColumn.rank_cpp.name: self.results.get("CPP").get("Rank", None) if self.results.get("CPP") else None,
+
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.organization.name: self.org,
             AutoEvalColumn.knowledge_cutoff.name: self.knowledge_cutoff,
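The guarded lookups above imply a per-model results structure with a "CPP" section holding "Average Score", "Standard Deviation", and "Rank". The snippet below is a sketch of that shape and of the "get the section, else None" guard; the exact layout of the results JSON is otherwise an assumption, and the numbers are placeholders.

```python
# Sketch of the per-model results shape implied by the lookups above, plus a
# helper mirroring the "section missing -> None" guard. Values are placeholders.
from typing import Any, Optional

example_result = {
    "CPP": {"Average Score": 47.3, "Standard Deviation": 1.2, "Rank": 5},
    "Chemistry": {"Average Score": 0.61, "Standard Deviation": 0.02, "Rank": 9},
}

def get_metric(results: dict, section: str, metric: str) -> Optional[Any]:
    """Return results[section][metric], or None if the section is missing."""
    block = results.get(section)
    return block.get(metric, None) if block else None

print(get_metric(example_result, "CPP", "Rank"))      # 5
print(get_metric(example_result, "Biology", "Rank"))  # None (missing section)
```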
src/populate.py
CHANGED
@@ -24,7 +24,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
     if rank_col: # if there is one col in rank_col, sort by that column and remove NaN values
         df = df.dropna(subset=benchmark_cols)
         df = df.sort_values(by=[rank_col[0]], ascending=True)
-        # print(rank_col)
+        # print(rank_col, benchmark_cols)
     else:
         # when rank_col, the first in benchmark_cols is empty, sort by averaging all the benchmarks, except the first one
         avg_rank = df.iloc[:, 1:].mean(axis=1)

@@ -43,7 +43,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
         # print(col)
         # if 'Std dev' in col or 'Score' in col:
         if 'Std dev' in col or 'Score' in col:
-            if "Chemistry" in col:
+            if "Chemistry" in col or "C++" in col:
                 df[col] = (df[col]).map('{:.2f}'.format)
             else:
                 df[col] = (df[col]*100).map('{:.2f}'.format)
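The second hunk means C++ scores, like Chemistry, are formatted on their native scale, while other "Score"/"Std dev" columns are multiplied by 100 before formatting. A self-contained pandas sketch of the two behaviors touched here (column names follow the diff; the toy data is illustrative only):

```python
# Sketch of the behaviors above: (1) with a rank column given, rows missing any
# benchmark column are dropped and the frame is sorted by that rank; (2) Score/
# Std dev columns get two decimals, with Chemistry and C++ left on their native
# scale while other dimensions are multiplied by 100. Toy data only.
import pandas as pd

df = pd.DataFrame(
    {
        "Rank (C++)": [2, 1, 3],
        "Model": ["model-a", "model-b", "model-c"],
        "Score (C++)": [48.1, 52.3, None],      # already on its display scale
        "Score (Algebra)": [0.71, 0.78, 0.65],  # 0-1, rescaled to 0-100 below
    }
)

benchmark_cols = ["Rank (C++)", "Model", "Score (C++)", "Score (Algebra)"]
rank_col = ["Rank (C++)"]

if rank_col:
    df = df.dropna(subset=benchmark_cols)
    df = df.sort_values(by=[rank_col[0]], ascending=True)

for col in df.columns:
    if "Std dev" in col or "Score" in col:
        if "Chemistry" in col or "C++" in col:
            df[col] = df[col].map("{:.2f}".format)
        else:
            df[col] = (df[col] * 100).map("{:.2f}".format)

print(df)  # model-c dropped (missing C++ score); rows ordered by Rank (C++)
```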