deep dive

Files changed:
- app.py +95 -4
- src/display/css_html_js.py +12 -2
- src/leaderboard/read_evals.py +23 -0
app.py CHANGED

@@ -3,6 +3,9 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns, SearchColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 #from huggingface_hub import snapshot_download
+import re
+
+
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -28,6 +31,9 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.leaderboard.read_evals import get_model_answers_html_file
+
+skills = ['MMLU', 'General Knowledge', 'Reasoning & Math', 'Translation (incl Dialects)', 'Trust & Safety', 'Writing (incl Dialects)', 'RAG QA', 'Reading Comprehension', 'Arabic Language & Grammar', 'Diacritization', 'Dialect Detection', 'Sentiment Analysis', 'Summarization', 'Instruction Following', 'Transliteration', 'Paraphrasing', 'Entity Extraction', 'Long Context', 'Coding', 'Hallucination', 'Function Calling', 'Structuring']
 
 
 def restart_space():
@@ -86,7 +92,8 @@ def init_leaderboard(dataframe):
         interactive=False,
         column_widths=[30, 50, 50, 150, 60, 60, 60],
         max_height=420,
-        elem_classes="leaderboard_col_style"
+        elem_classes="leaderboard_col_style",
+        show_search="search"
     )
 
 
@@ -95,7 +102,6 @@ def init_skill_leaderboard(dataframe):
 
 
     ## create selector for model skills, based on the selector filter the dataframe
-    skills = ['MMLU', 'General Knowledge', 'Reasoning & Math', 'Translation (incl Dialects)', 'Trust & Safety', 'Writing (incl Dialects)', 'RAG QA', 'Reading Comprehension', 'Arabic Language & Grammar', 'Diacritization', 'Dialect Detection', 'Sentiment Analysis', 'Summarization', 'Instruction Following', 'Transliteration', 'Paraphrasing', 'Entity Extraction', 'Long Context', 'Coding', 'Hallucination', 'Function Calling', 'Structuring']
 
     skills_dropdown = gr.Dropdown(choices=skills, label="Select Skill", value=skills[0])
 
@@ -153,6 +159,74 @@ def init_size_leaderboard(dataframe):
     sizes_dropdown.change(filter_dataframe, inputs=sizes_dropdown, outputs=leaderboard_by_skill)
     return leaderboard_by_skill
 
+def strip_html_tags(model_name):
+    return re.sub('<[^<]+?>', '', model_name)
+
+
+
+def get_model_info_blocks(chosen_model_name):
+
+    model_names = LEADERBOARD_DF["Model Name"].unique().tolist()
+    model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
+
+    model_name_full = model_names[model_names_clean.index(chosen_model_name)]
+    filtered_df = LEADERBOARD_DF[LEADERBOARD_DF["Model Name"] == model_name_full].reset_index(drop=True)
+    skills_bar_df = pd.DataFrame({
+        'Skills': skills,
+        'Scores': filtered_df[skills].values[0]
+    })
+
+    skills_bar_df = skills_bar_df.sort_values(by=['Scores'], ascending=False).reset_index(drop=True)
+
+
+    with gr.Accordion("Model Details"):
+
+        with gr.Row():
+            model_name = gr.Markdown("""<span class='deep-dive-metric'><b>Model Name:</b> {}</span>""".format(chosen_model_name))
+        with gr.Row():
+            benchmark_score = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Score:</b>{}/10</span>""".format(filtered_df["Benchmark Score"][0]))
+            rank = gr.Markdown("""<span class='deep-dive-metric'><b>Benchmark Rank:</b>{}</span>""".format(filtered_df["Rank"][0]))
+            speed = gr.Markdown("""<span class='deep-dive-metric'><b>Speed:</b>{} words per second</span>""".format(filtered_df["Speed (words/sec)"][0]))
+            contamination = gr.Markdown("""<span class='deep-dive-metric'><b>Contamination Score:</b>{}</span>""".format(filtered_df["Contamination Score"][0]))
+            size = gr.Markdown("""<span class='deep-dive-metric'><b>Size Category:</b>{}</span>""".format(filtered_df["Category"][0]))
+
+        with gr.Row():
+            skills_bar = gr.BarPlot(
+                value=skills_bar_df,
+                x="Skills",
+                y="Scores",
+                width=500,
+                height=500,
+                x_label_angle=45,
+                color="Skills",
+                color_title=None,
+                label="Model Skills"
+            )
+
+
+        html_file_content = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
+
+        if html_file_content == "EMPTY":
+            answers_html = gr.Markdown("")
+        else:
+            with gr.Row():
+
+                ## strip style and script tags from html
+                html_file_content = re.sub('<style.*?>.*?</style>', '', html_file_content, flags=re.DOTALL)
+                html_file_content = re.sub('<script.*?>.*?</script>', '', html_file_content, flags=re.DOTALL)
+
+                answers_html = gr.HTML(html_file_content, max_height=500, show_label=True,
+                                       label="Model Responses", container=True, elem_classes="model_responses_container")
+
+
+    return model_name, benchmark_score, rank, speed, contamination, size, skills_bar, answers_html
+
+
+
+def init_compare_tab(dataframe):
+    pass
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE, elem_classes="abl_header")
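A quick sanity check of the tag-stripping helper added above. The input is hypothetical: the leaderboard's "Model Name" cells are assumed to wrap each name in an anchor tag, which is why the Deep Dive dropdown needs clean names.

```python
import re

def strip_html_tags(model_name):
    # Non-greedy match of anything between < and >, same regex as in app.py.
    return re.sub('<[^<]+?>', '', model_name)

# Hypothetical linked model name as it might appear in the dataframe.
linked = '<a href="https://huggingface.co/org/model" target="_blank">org/model</a>'
assert strip_html_tags(linked) == 'org/model'
```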
@@ -168,11 +242,28 @@ with demo:
         with gr.TabItem("🏅 Top by Skill", elem_id="llm-benchmark-tab-skills", id=2):
             leaderboard = init_skill_leaderboard(LEADERBOARD_DF)
 
+        with gr.TabItem("⚖️ Compare", elem_id="llm-benchmark-tab-compare", id=3):
+            init_compare_tab(LEADERBOARD_DF)
+
+        with gr.TabItem("🔬 Deep Dive", elem_id="llm-benchmark-tab-compare", id=4):
+
+
+            model_names = LEADERBOARD_DF["Model Name"].unique().tolist()
+            model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
+            with gr.Row():
+                models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Model", value=model_names_clean[0])
+
+
+            model_name, benchmark_score, rank, speed, contamination, size, skills_bar, answers_html = get_model_info_blocks(models_dropdown.value)
+
+            models_dropdown.change(get_model_info_blocks, inputs=models_dropdown, outputs=[model_name, benchmark_score, rank, speed, contamination, size, skills_bar, answers_html])
+
+
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-about", id=3):
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-about", id=5):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-submit", id=4):
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-submit", id=6):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
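The Deep Dive tab builds its components once at load time with the dropdown's initial value, then rebuilds them whenever the selection changes by returning fresh components from the handler. A minimal sketch of that pattern, assuming Gradio 4's support for returning component instances as output updates (names here are illustrative, not from the commit):

```python
import gradio as gr

def make_blocks(name):
    # Returning a new component instance updates the matching output in place.
    return gr.Markdown(f"**Selected:** {name}")

with gr.Blocks() as demo:
    dropdown = gr.Dropdown(choices=["model-a", "model-b"], value="model-a", label="Select Model")
    selected = make_blocks(dropdown.value)  # initial render at build time
    dropdown.change(make_blocks, inputs=dropdown, outputs=selected)  # re-render on change
```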
src/display/css_html_js.py CHANGED

@@ -118,10 +118,20 @@ border-radius: 10px;
 }
 
 .tabs{
-gap:0px !important;
+    gap:0px !important;
 }
 
-
+.deep-dive-metric{
+    font-size: 20px;
+    padding: 10px;
+    display: flex;
+    flex-direction: column;
+    align-items: normal;
+    max-height: 120px;
+}
+.model_responses_container td{
+    max-width: 180px;
+}
 
 """
 
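For context: the `.deep-dive-metric` rules match the inline `<span class='deep-dive-metric'>` markup that app.py puts inside its Markdown metrics, while `.model_responses_container td` matches the class Gradio adds to the component wrapper from `elem_classes="model_responses_container"`. A minimal sketch of both hooks, with hypothetical content:

```python
import gradio as gr

custom_css = """
.deep-dive-metric{ font-size: 20px; }
.model_responses_container td{ max-width: 180px; }
"""

with gr.Blocks(css=custom_css) as demo:
    # Inline class inside the Markdown HTML matches .deep-dive-metric directly.
    gr.Markdown("<span class='deep-dive-metric'><b>Speed:</b> 42 words per second</span>")
    # elem_classes puts the class on the component wrapper, so the td rule
    # applies to any table cells inside the rendered HTML.
    gr.HTML("<table><tr><td>hypothetical answer</td></tr></table>",
            elem_classes="model_responses_container")
```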
src/leaderboard/read_evals.py CHANGED

@@ -232,3 +232,26 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
     print(results)
     return results
+
+
+def get_model_answers_html_file(results_path, model_name):
+
+    model_org, model_name_only = model_name.split("/")
+    model_answers_prefix = f"{results_path}/{model_org}/"
+
+    html_file_content = "EMPTY"
+
+    for root, _, files in os.walk(model_answers_prefix):
+
+        for file_name in files:
+
+            if file_name.startswith(f"{model_name_only}_abb_benchmark_answers_"):
+
+                file_path = os.path.join(root, file_name)
+
+                with open(file_path, "r") as f:
+
+                    html_file_content = f.read()
+                break
+
+    return html_file_content
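A hedged usage sketch of the new helper. The directory layout is inferred from the code: answers are expected under `<results_path>/<org>/` in files named `<model>_abb_benchmark_answers_*`, and the helper relies on `os.walk`/`os.path.join`, so read_evals.py presumably already imports `os`. The `"EMPTY"` sentinel is what app.py checks before rendering. The model id below is hypothetical:

```python
from src.envs import EVAL_RESULTS_PATH
from src.leaderboard.read_evals import get_model_answers_html_file

# Hypothetical model id; the helper splits it on "/" into <org>/<model>.
html = get_model_answers_html_file(EVAL_RESULTS_PATH, "org/my-model")

if html == "EMPTY":
    print("no answers file found for this model")
else:
    print(f"loaded {len(html)} characters of answers HTML")
```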