do not display incomplete models for now
app.py CHANGED
@@ -93,6 +93,21 @@ if not IS_PUBLIC:
 EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
 EVAL_TYPES = ["markdown", "str", "bool", "bool", "bool", "str"]
 
+BENCHMARK_COLS = [
+    "ARC (25-shot) ⬆️",
+    "HellaSwag (10-shot) ⬆️",
+    "MMLU (5-shot) ⬆️",
+    "TruthfulQA (0-shot) ⬆️",
+]
+
+
+def has_no_nan_values(df, columns):
+    return df[columns].notna().all(axis=1)
+
+
+def has_nan_values(df, columns):
+    return df[columns].isna().any(axis=1)
+
 
 def get_leaderboard():
     if repo:
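The two new helpers are plain pandas row masks over the benchmark columns. A minimal sketch of how they behave, with a toy frame and shortened column names standing in for the real leaderboard data and BENCHMARK_COLS:

import pandas as pd

df = pd.DataFrame(
    {
        "Model": ["a", "b", "c"],
        "ARC": [0.5, None, 0.7],
        "MMLU": [0.4, 0.6, None],
    }
)
cols = ["ARC", "MMLU"]

# has_nan_values: True for rows missing at least one benchmark score
print(df[cols].isna().any(axis=1).tolist())   # [False, True, True]

# has_no_nan_values: True only for rows with every score present
print(df[cols].notna().all(axis=1).tolist())  # [True, False, False]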
@@ -125,11 +140,22 @@ def get_leaderboard():
     }
     all_data.append(gpt35_values)
 
-
-
-
-
-
+    df = pd.DataFrame.from_records(all_data)
+    df = df.sort_values(by=["Average ⬆️"], ascending=False)
+    df = df[COLS]
+
+    # get incomplete models
+    incomplete_models = df[has_nan_values(df, BENCHMARK_COLS)]["Model"].tolist()
+    print(
+        [
+            model.split(" style")[0].split("https://huggingface.co/")[1]
+            for model in incomplete_models
+        ]
+    )
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, BENCHMARK_COLS)]
+    return df
 
 
 def get_eval_table():
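The debug print recovers a plain repo id from the clickable Model cell. A sketch assuming make_clickable_model (defined elsewhere in app.py, not shown in this diff) wraps the name in an anchor tag shaped roughly like the hypothetical one below:

# Hypothetical markup; the real make_clickable_model output may differ.
cell = (
    '<a target="_blank" href="https://huggingface.co/org/model"'
    ' style="text-decoration: underline">org/model</a>'
)

# Everything before " style", then everything after the URL prefix.
print(cell.split(" style")[0].split("https://huggingface.co/")[1])
# -> org/model"  (a stray quote may remain; fine for a debug log)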
@@ -144,7 +170,7 @@ def get_eval_table():
     all_evals = []
 
     for entry in entries:
-        print(entry)
+        # print(entry)
         if ".json" in entry:
             file_path = os.path.join("evals/eval_requests", entry)
             with open(file_path) as fp:
@@ -171,12 +197,17 @@
         data["model"] = make_clickable_model(data["model"])
         all_evals.append(data)
 
-
-
+    pending_list = [e for e in all_evals if e["status"] == "PENDING"]
+    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
+    df_pending = pd.DataFrame.from_records(pending_list)
+    df_running = pd.DataFrame.from_records(running_list)
+    df_finished = pd.DataFrame.from_records(finished_list)
+    return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
 
 
 leaderboard = get_leaderboard()
-
+finished_eval_queue, running_eval_queue, pending_eval_queue = get_eval_table()
 
 
 def is_model_on_hub(model_name, revision) -> bool:
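One caveat with the three-way split: indexing with EVAL_COLS assumes each status bucket is non-empty, because pd.DataFrame.from_records([]) yields a frame with no columns and df[EVAL_COLS] then raises a KeyError. A hedged sketch of a defensive variant (not what this commit does):

import pandas as pd

EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]

def records_to_eval_df(records):
    # reindex(columns=...) tolerates an empty record list, creating
    # the expected columns with zero rows instead of raising.
    return pd.DataFrame.from_records(records).reindex(columns=EVAL_COLS)

print(records_to_eval_df([]).shape)  # (0, 6)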
@@ -237,7 +268,7 @@ def add_new_eval(
     if out_path.lower() in requested_models:
         duplicate_request_message = "This model has been already submitted."
         return f"<p style='color: orange; font-size: 20px; text-align: center;'>{duplicate_request_message}</p>"
-
+
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
     LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
@@ -256,7 +287,9 @@ def add_new_eval(
 
 
 def refresh():
-
+    leaderboard = get_leaderboard()
+    finished_eval_queue, running_eval_queue, pending_eval_queue = get_eval_table()
+    return leaderboard, finished_eval_queue, running_eval_queue, pending_eval_queue
 
 
 block = gr.Blocks()
@@ -289,16 +322,43 @@ We chose these benchmarks as they test a variety of reasoning and general knowledge
 
     """
     )
-    with gr.Accordion("
+    with gr.Accordion("Finished Evaluations", open=False):
+        with gr.Row():
+            finished_eval_table = gr.components.Dataframe(
+                value=finished_eval_queue,
+                headers=EVAL_COLS,
+                datatype=EVAL_TYPES,
+                max_rows=5,
+            )
+    with gr.Accordion("Running Evaluation Queue", open=False):
         with gr.Row():
-
-            value=
+            running_eval_table = gr.components.Dataframe(
+                value=running_eval_queue,
+                headers=EVAL_COLS,
+                datatype=EVAL_TYPES,
+                max_rows=5,
+            )
+
+    with gr.Accordion("Running & Pending Evaluation Queue", open=False):
+        with gr.Row():
+            pending_eval_table = gr.components.Dataframe(
+                value=pending_eval_queue,
+                headers=EVAL_COLS,
+                datatype=EVAL_TYPES,
+                max_rows=5,
             )
 
     with gr.Row():
         refresh_button = gr.Button("Refresh")
         refresh_button.click(
-            refresh,
+            refresh,
+            inputs=[],
+            outputs=[
+                leaderboard_table,
+                finished_eval_table,
+                running_eval_table,
+                pending_eval_table,
+            ],
         )
 
     with gr.Accordion("Submit a new model for evaluation"):
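refresh returns four values and Gradio assigns them positionally to the four components in outputs, so the return order (leaderboard, finished, running, pending) has to match the list above. A minimal self-contained sketch of the same click-to-refresh pattern, with toy data in place of the real queue tables:

import gradio as gr
import pandas as pd

def refresh():
    # Stand-in for get_leaderboard() / get_eval_table().
    return pd.DataFrame({"model": ["org/model"], "status": ["FINISHED"]})

with gr.Blocks() as demo:
    table = gr.components.Dataframe(value=refresh(), headers=["model", "status"])
    button = gr.Button("Refresh")
    # One output component per returned value, matched by position.
    button.click(refresh, inputs=[], outputs=[table])

demo.launch()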
@@ -332,5 +392,14 @@ We chose these benchmarks as they test a variety of reasoning and general knowledge
         submission_result,
     )
 
-    block.load(
+    block.load(
+        refresh,
+        inputs=[],
+        outputs=[
+            leaderboard_table,
+            finished_eval_table,
+            running_eval_table,
+            pending_eval_table,
+        ],
+    )
 block.launch()
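block.load registers the same refresh callback to run on every page load, so each visitor sees freshly read queue files and leaderboard data rather than the values computed when the script first started; the Refresh button covers updates mid-session.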