deep dive

Files changed:
- app.py +95 -4
- src/display/css_html_js.py +12 -2
- src/leaderboard/read_evals.py +23 -0
app.py CHANGED

@@ -3,6 +3,9 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns, SearchColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 #from huggingface_hub import snapshot_download
+import re
+
+
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -28,6 +31,9 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.leaderboard.read_evals import get_model_answers_html_file
+
+skills = ['MMLU', 'General Knowledge', 'Reasoning & Math', 'Translation (incl Dialects)', 'Trust & Safety', 'Writing (incl Dialects)', 'RAG QA', 'Reading Comprehension', 'Arabic Language & Grammar', 'Diacritization', 'Dialect Detection', 'Sentiment Analysis', 'Summarization', 'Instruction Following', 'Transliteration', 'Paraphrasing', 'Entity Extraction', 'Long Context', 'Coding', 'Hallucination', 'Function Calling', 'Structuring']
 
 
 def restart_space():
@@ -86,7 +92,8 @@ def init_leaderboard(dataframe):
         interactive=False,
         column_widths=[30, 50, 50, 150, 60, 60, 60],
         max_height=420,
-        elem_classes="leaderboard_col_style"
+        elem_classes="leaderboard_col_style",
+        show_search="search"
     )
 
 
@@ -95,7 +102,6 @@ def init_skill_leaderboard(dataframe):
 
 
     ## create selector for model skills, based on the selector filter the dataframe
-    skills = ['MMLU', 'General Knowledge', 'Reasoning & Math', 'Translation (incl Dialects)', 'Trust & Safety', 'Writing (incl Dialects)', 'RAG QA', 'Reading Comprehension', 'Arabic Language & Grammar', 'Diacritization', 'Dialect Detection', 'Sentiment Analysis', 'Summarization', 'Instruction Following', 'Transliteration', 'Paraphrasing', 'Entity Extraction', 'Long Context', 'Coding', 'Hallucination', 'Function Calling', 'Structuring']
 
     skills_dropdown = gr.Dropdown(choices=skills, label="Select Skill", value=skills[0])
 
@@ -153,6 +159,74 @@ def init_size_leaderboard(dataframe):
     sizes_dropdown.change(filter_dataframe, inputs=sizes_dropdown, outputs=leaderboard_by_skill)
     return leaderboard_by_skill
 
+def strip_html_tags(model_name):
+    return re.sub('<[^<]+?>', '', model_name)
+
+
+
+def get_model_info_blocks(chosen_model_name):
+
+    model_names = LEADERBOARD_DF["Model Name"].unique().tolist()
+    model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
+
+    model_name_full = model_names[model_names_clean.index(chosen_model_name)]
+    filtered_df = LEADERBOARD_DF[LEADERBOARD_DF["Model Name"] == model_name_full].reset_index(drop=True)
+    skills_bar_df = pd.DataFrame({
+        'Skills': skills,
+        'Scores': filtered_df[skills].values[0]
+    })
+
+    skills_bar_df = skills_bar_df.sort_values(by=['Scores'], ascending=False).reset_index(drop=True)
+
+
+    with gr.Accordion("Model Details"):
+
+        with gr.Row():
+            model_name = gr.Markdown("""<span class='deep-dive-metric'><b>Model Name:</b> {}</span>""".format(chosen_model_name))
+        with gr.Row():
+            benchmark_score = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Score:</b>{}/10</span>""".format(filtered_df["Benchmark Score"][0]))
+            rank = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Rank:</b>{}</span>""".format(filtered_df["Rank"][0]))
+            speed = gr.Markdown("""<span class='deep-dive-metric'><b>Speed:</b>{} words per second</span>""".format(filtered_df["Speed (words/sec)"][0]))
+            contamination = gr.Markdown("""<span class='deep-dive-metric'><b>Contamination Score:</b>{}</span>""".format(filtered_df["Contamination Score"][0]))
+            size = gr.Markdown("""<span class='deep-dive-metric'><b>Size Category:</b>{}</span>""".format(filtered_df["Category"][0]))
+
+        with gr.Row():
+            skills_bar = gr.BarPlot(
+                value=skills_bar_df,
+                x="Skills",
+                y="Scores",
+                width=500,
+                height=500,
+                x_label_angle=45,
+                color="Skills",
+                color_title=None,
+                label="Model Skills"
+            )
+
+
+        html_file_content = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
+
+        if html_file_content == "EMPTY":
+            answers_html = gr.Markdown("")
+        else:
+            with gr.Row():
+
+                ## strip style and script tags from html
+                html_file_content = re.sub('<style.*?>.*?</style>', '', html_file_content, flags=re.DOTALL)
+                html_file_content = re.sub('<script.*?>.*?</script>', '', html_file_content, flags=re.DOTALL)
+
+                answers_html = gr.HTML(html_file_content, max_height=500, show_label=True,
+                                       label="Model Responses", container=True, elem_classes="model_responses_container")
+
+
+    return model_name, benchmark_score, rank, speed, contamination, size, skills_bar, answers_html
+
+
+
+def init_compare_tab(dataframe):
+    pass
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE, elem_classes="abl_header")
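A quick sanity check of the tag-stripping helper added above. The input is hypothetical: the leaderboard's "Model Name" cells are assumed to wrap each name in an anchor tag, which is why the Deep Dive dropdown needs clean names.

```python
import re

def strip_html_tags(model_name):
    # Non-greedy match of anything between < and >, same regex as in app.py.
    return re.sub('<[^<]+?>', '', model_name)

# Hypothetical linked model name as it might appear in the dataframe.
linked = '<a href="https://huggingface.co/org/model" target="_blank">org/model</a>'
assert strip_html_tags(linked) == 'org/model'
```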
@@ -168,11 +242,28 @@ with demo:
         with gr.TabItem("🏅 Top by Skill", elem_id="llm-benchmark-tab-skills", id=2):
             leaderboard = init_skill_leaderboard(LEADERBOARD_DF)
 
+        with gr.TabItem("⚖️ Compare", elem_id="llm-benchmark-tab-compare", id=3):
+            init_compare_tab(LEADERBOARD_DF)
+
+        with gr.TabItem("🔬 Deep Dive", elem_id="llm-benchmark-tab-compare", id=4):
+
+
+            model_names = LEADERBOARD_DF["Model Name"].unique().tolist()
+            model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
+            with gr.Row():
+                models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Model", value=model_names_clean[0])
+
+
+            model_name, benchmark_score, rank, speed, contamination, size, skills_bar, answers_html = get_model_info_blocks(models_dropdown.value)
+
+            models_dropdown.change(get_model_info_blocks, inputs=models_dropdown, outputs=[model_name, benchmark_score, rank, speed, contamination, size, skills_bar, answers_html])
+
+
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-about", id=3):
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-about", id=5):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-submit", id=4):
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-submit", id=6):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
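The Deep Dive tab builds its components once at load time with the dropdown's initial value, then rebuilds them whenever the selection changes by returning fresh components from the handler. A minimal sketch of that pattern, assuming Gradio 4's support for returning component instances as output updates (names here are illustrative, not from the commit):

```python
import gradio as gr

def make_blocks(name):
    # Returning a new component instance updates the matching output in place.
    return gr.Markdown(f"**Selected:** {name}")

with gr.Blocks() as demo:
    dropdown = gr.Dropdown(choices=["model-a", "model-b"], value="model-a", label="Select Model")
    selected = make_blocks(dropdown.value)  # initial render at build time
    dropdown.change(make_blocks, inputs=dropdown, outputs=selected)  # re-render on change
```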
src/display/css_html_js.py CHANGED

@@ -118,10 +118,20 @@ border-radius: 10px;
 }
 
 .tabs{
-gap:0px !important;
+    gap:0px !important;
 }
 
-
+.deep-dive-metric{
+    font-size: 20px;
+    padding: 10px;
+    display: flex;
+    flex-direction: column;
+    align-items: normal;
+    max-height: 120px;
+}
+.model_responses_container td{
+    max-width: 180px;
+}
 
 """
 
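For context: the `.deep-dive-metric` rules match the inline `<span class='deep-dive-metric'>` markup that app.py puts inside its Markdown metrics, while `.model_responses_container td` matches the class Gradio adds to the component wrapper from `elem_classes="model_responses_container"`. A minimal sketch of both hooks, with hypothetical content:

```python
import gradio as gr

custom_css = """
.deep-dive-metric{ font-size: 20px; }
.model_responses_container td{ max-width: 180px; }
"""

with gr.Blocks(css=custom_css) as demo:
    # Inline class inside the Markdown HTML matches .deep-dive-metric directly.
    gr.Markdown("<span class='deep-dive-metric'><b>Speed:</b> 42 words per second</span>")
    # elem_classes puts the class on the component wrapper, so the td rule
    # applies to any table cells inside the rendered HTML.
    gr.HTML("<table><tr><td>hypothetical answer</td></tr></table>",
            elem_classes="model_responses_container")
```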
src/leaderboard/read_evals.py CHANGED

@@ -232,3 +232,26 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
     print(results)
     return results
+
+
+def get_model_answers_html_file(results_path, model_name):
+
+    model_org, model_name_only = model_name.split("/")
+    model_answers_prefix = f"{results_path}/{model_org}/"
+
+    html_file_content = "EMPTY"
+
+    for root, _, files in os.walk(model_answers_prefix):
+
+        for file_name in files:
+
+            if file_name.startswith(f"{model_name_only}_abb_benchmark_answers_"):
+
+                file_path = os.path.join(root, file_name)
+
+                with open(file_path, "r") as f:
+
+                    html_file_content = f.read()
+                break
+
+    return html_file_content
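A hedged usage sketch of the new helper. The directory layout is inferred from the code: answers are expected under `<results_path>/<org>/` in files named `<model>_abb_benchmark_answers_*`, and the helper relies on `os.walk`/`os.path.join`, so read_evals.py presumably already imports `os`. The `"EMPTY"` sentinel is what app.py checks before rendering. The model id below is hypothetical:

```python
from src.envs import EVAL_RESULTS_PATH
from src.leaderboard.read_evals import get_model_answers_html_file

# Hypothetical model id; the helper splits it on "/" into <org>/<model>.
html = get_model_answers_html_file(EVAL_RESULTS_PATH, "org/my-model")

if html == "EMPTY":
    print("no answers file found for this model")
else:
    print(f"loaded {len(html)} characters of answers HTML")
```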