Spaces:
Runtime error
Runtime error
Add a baseline
Browse files
app.py
CHANGED
|
@@ -1,13 +1,11 @@
|
|
| 1 |
import os
|
| 2 |
-
import shutil
|
| 3 |
import numpy as np
|
| 4 |
import gradio as gr
|
| 5 |
from huggingface_hub import Repository, HfApi
|
| 6 |
from transformers import AutoConfig
|
| 7 |
import json
|
| 8 |
-
from apscheduler.schedulers.background import BackgroundScheduler
|
| 9 |
import pandas as pd
|
| 10 |
-
import
|
| 11 |
from utils import get_eval_results_dicts, make_clickable_model
|
| 12 |
|
| 13 |
# clone / pull the lmeh eval data
|
|
@@ -140,6 +138,19 @@ def get_leaderboard():
|
|
| 140 |
}
|
| 141 |
all_data.append(gpt35_values)
|
| 142 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
df = pd.DataFrame.from_records(all_data)
|
| 144 |
df = df.sort_values(by=["Average ⬆️"], ascending=False)
|
| 145 |
df = df[COLS]
|
|
@@ -323,7 +334,7 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
|
|
| 323 |
|
| 324 |
"""
|
| 325 |
)
|
| 326 |
-
with gr.Accordion("Finished Evaluations", open=False):
|
| 327 |
with gr.Row():
|
| 328 |
finished_eval_table = gr.components.Dataframe(
|
| 329 |
value=finished_eval_queue,
|
|
@@ -331,7 +342,7 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
|
|
| 331 |
datatype=EVAL_TYPES,
|
| 332 |
max_rows=5,
|
| 333 |
)
|
| 334 |
-
with gr.Accordion("Running Evaluation Queue", open=False):
|
| 335 |
with gr.Row():
|
| 336 |
running_eval_table = gr.components.Dataframe(
|
| 337 |
value=running_eval_queue,
|
|
@@ -340,7 +351,7 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
|
|
| 340 |
max_rows=5,
|
| 341 |
)
|
| 342 |
|
| 343 |
-
with gr.Accordion("Pending Evaluation Queue", open=False):
|
| 344 |
with gr.Row():
|
| 345 |
pending_eval_table = gr.components.Dataframe(
|
| 346 |
value=pending_eval_queue,
|
|
@@ -378,6 +389,7 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
|
|
| 378 |
|
| 379 |
with gr.Row():
|
| 380 |
submit_button = gr.Button("Submit Eval")
|
|
|
|
| 381 |
with gr.Row():
|
| 382 |
submission_result = gr.Markdown()
|
| 383 |
submit_button.click(
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import gradio as gr
|
| 4 |
from huggingface_hub import Repository, HfApi
|
| 5 |
from transformers import AutoConfig
|
| 6 |
import json
|
|
|
|
| 7 |
import pandas as pd
|
| 8 |
+
from content import CHANGELOG_TEXT
|
| 9 |
from utils import get_eval_results_dicts, make_clickable_model
|
| 10 |
|
| 11 |
# clone / pull the lmeh eval data
|
|
|
|
| 138 |
}
|
| 139 |
all_data.append(gpt35_values)
|
| 140 |
|
| 141 |
+
base_line = {
|
| 142 |
+
"Model": '<p>Baseline</p>',
|
| 143 |
+
"Revision": "N/A",
|
| 144 |
+
"8bit": None,
|
| 145 |
+
"Average ⬆️": 25.0,
|
| 146 |
+
"ARC (25-shot) ⬆️": 25.0,
|
| 147 |
+
"HellaSwag (10-shot) ⬆️": 25.0,
|
| 148 |
+
"MMLU (5-shot) ⬆️": 25.0,
|
| 149 |
+
"TruthfulQA (0-shot) ⬆️": 25.0,
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
all_data.append(base_line)
|
| 153 |
+
|
| 154 |
df = pd.DataFrame.from_records(all_data)
|
| 155 |
df = df.sort_values(by=["Average ⬆️"], ascending=False)
|
| 156 |
df = df[COLS]
|
|
|
|
| 334 |
|
| 335 |
"""
|
| 336 |
)
|
| 337 |
+
with gr.Accordion("✅ Finished Evaluations", open=False):
|
| 338 |
with gr.Row():
|
| 339 |
finished_eval_table = gr.components.Dataframe(
|
| 340 |
value=finished_eval_queue,
|
|
|
|
| 342 |
datatype=EVAL_TYPES,
|
| 343 |
max_rows=5,
|
| 344 |
)
|
| 345 |
+
with gr.Accordion("🔄 Running Evaluation Queue", open=False):
|
| 346 |
with gr.Row():
|
| 347 |
running_eval_table = gr.components.Dataframe(
|
| 348 |
value=running_eval_queue,
|
|
|
|
| 351 |
max_rows=5,
|
| 352 |
)
|
| 353 |
|
| 354 |
+
with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
|
| 355 |
with gr.Row():
|
| 356 |
pending_eval_table = gr.components.Dataframe(
|
| 357 |
value=pending_eval_queue,
|
|
|
|
| 389 |
|
| 390 |
with gr.Row():
|
| 391 |
submit_button = gr.Button("Submit Eval")
|
| 392 |
+
|
| 393 |
with gr.Row():
|
| 394 |
submission_result = gr.Markdown()
|
| 395 |
submit_button.click(
|