Clémentine committed • Commit 294422e • 1 Parent(s): 388bfbd
added plots back
Files changed:
- app.py +21 -21
- src/display/utils.py +1 -0
- src/populate.py +3 -3
- src/tools/plots.py +5 -9
app.py
CHANGED
@@ -135,9 +135,9 @@ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queu
 
 
 # Data processing for plots now only on demand in the respective Gradio tab
-
-
-
+def load_and_create_plots():
+    plot_df = create_plot_df(create_scores_df(leaderboard_df))
+    return plot_df
 
 def init_leaderboard(dataframe):
     return Leaderboard(
@@ -182,24 +182,24 @@ with demo:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(leaderboard_df)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
+            with gr.Row():
+                with gr.Column():
+                    plot_df = load_and_create_plots()
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        [AutoEvalColumn.average.name],
+                        title="Average of Top Scores and Human Baseline Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
+                with gr.Column():
+                    plot_df = load_and_create_plots()
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        BENCHMARK_COLS,
+                        title="Top Scores and Human Baseline Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
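For context, the sketch below shows the general pattern this change uses: a Gradio tab whose plot dataframe is built only when the UI is constructed, mirroring load_and_create_plots() above. It is a minimal, self-contained illustration, not the leaderboard's code; the toy dataframe and the plotly express call stand in for create_plot_df, create_scores_df, and create_metric_plot_obj.

# Minimal sketch (assumed stand-in, not the real leaderboard code): a tab
# whose plot data is produced on demand when the Blocks layout is built.
import gradio as gr
import pandas as pd
import plotly.express as px


def load_and_create_plots():
    # Stand-in for create_plot_df(create_scores_df(leaderboard_df)).
    return pd.DataFrame(
        {
            "date": ["2023-06-01", "2023-09-01", "2024-01-01"],
            "score": [55.0, 61.3, 64.8],
            "task": ["Average", "Average", "Average"],
        }
    )


with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Metrics through time"):
            with gr.Row():
                with gr.Column():
                    plot_df = load_and_create_plots()
                    chart = px.line(plot_df, x="date", y="score", color="task")
                    gr.Plot(value=chart, min_width=500)

if __name__ == "__main__":
    demo.launch()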
src/display/utils.py
CHANGED
@@ -93,6 +93,7 @@ auto_eval_column_dict.append(
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
+auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
 
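A sketch of how an entry like the new hidden "date" column is consumed. The ColumnContent fields below are inferred from the append(...) calls above, and the make_dataclass step is an assumption about how the list becomes the AutoEvalColumn accessor used elsewhere in the diff; treat both as illustrative, not the file's exact code.

# Sketch under assumptions: ColumnContent's fields are guessed from the calls
# above, and the column list is assumed to be turned into a dataclass whose
# class attributes hold each ColumnContent default.
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False


auto_eval_column_dict = [
    ["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)],
    ["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)],
    ["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)],
]

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.date.name)    # "date"
print(AutoEvalColumn.date.hidden)  # True: kept out of the displayed table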
src/populate.py
CHANGED
@@ -43,10 +43,10 @@ def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols:
     """Retrieve and process leaderboard data."""
     all_data_json = leaderboard_dataset.to_dict()
     num_items = leaderboard_dataset.num_rows
-
-    filter_models_flags(
+    all_data_json_list = [{k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)]
+    filter_models_flags(all_data_json_list)
 
-    df = pd.DataFrame.from_records(
+    df = pd.DataFrame.from_records(all_data_json_list)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
     df = df[has_no_nan_values(df, benchmark_cols)]
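The new list comprehension is needed because datasets.Dataset.to_dict() returns columnar data (one list per column), while filter_models_flags is now called on a row list and pd.DataFrame.from_records expects one mapping per row. A toy illustration of the reshape (the data below is made up):

# Toy illustration of the columnar -> row-wise reshape added above.
all_data_json = {"fullname": ["org/model-a", "org/model-b"], "average": [61.2, 58.4]}
num_items = 2

all_data_json_list = [
    {k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)
]
print(all_data_json_list)
# [{'fullname': 'org/model-a', 'average': 61.2},
#  {'fullname': 'org/model-b', 'average': 58.4}]

Iterating the Dataset directly would likely yield the same per-row dicts, but building the list explicitly keeps the in-place filter_models_flags call and the from_records call working on the same object.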
src/tools/plots.py
CHANGED
@@ -28,22 +28,18 @@ def create_scores_df(results_df: list[dict]) -> pd.DataFrame:
         last_date = ""
         column = task.col_name
         for _, row in results_df.iterrows():
-            current_model = row[
+            current_model = row[AutoEvalColumn.fullname.name]
             # We ignore models that are flagged/no longer on the hub/not finished
             to_ignore = (
-                not row[
-                or not row[
+                not row[AutoEvalColumn.still_on_hub.name]
+                or not row[AutoEvalColumn.not_flagged.name]
                 or current_model in FLAGGED_MODELS
-                or row["status"] != "FINISHED"
             )
             if to_ignore:
                 continue
 
-            current_date = row[
-
-                current_score = np.mean(list(row["results"].values()))
-            else:
-                current_score = row["results"][task.benchmark]
+            current_date = row[AutoEvalColumn.date.name]
+            current_score = row[task.col_name]
 
             if current_score > current_max:
                 if current_date == last_date and len(scores[column]) > 0: