Upload 5 files
- app.py +99 -0
- big (1).json +68 -0
- constants.py +26 -0
- requirements (1).txt +2 -0
- small (1).json +134 -0
app.py
ADDED
@@ -0,0 +1,99 @@
import gradio as gr
import pandas as pd

from constants import INTRODUCTION_TEXT, CITATION_TEXT


# Round numeric cells to two decimals; leave non-numeric cells (e.g. model names) untouched.
def formatter(x):
    try:
        return round(x, 2)
    except TypeError:
        return x


# Load the leaderboard data (the JSON files must sit next to app.py).
original_df = pd.read_json('big.json')
print(original_df)

Small_original_df = pd.read_json('small.json')
print(Small_original_df)

# Apply the formatter to every cell of both DataFrames.
original_df = original_df.applymap(formatter)
Small_original_df = Small_original_df.applymap(formatter)


# Column types for the Gradio Dataframe component: the model name, then eight numeric scores.
TYPES = ['str'] + ['number'] * 8


LAST_UPDATED = "May 10th 2024"

# CSS for styling
css = """
.markdown-text {font-size: 200pt}
.markdown-text-small {font-size: 13pt}
th {
    text-align: center;
}
td {
    font-size: 15px; /* Adjust the font size as needed */
    text-align: center;
}
#od-benchmark-tab-table-button {
    font-size: 15pt;
    font-weight: bold;
}
#Intro {
    font-size: 100pt;
}
"""


def build_demo(original_df, Small_original_df, TYPES):
    with gr.Blocks(css=css) as demo:
        gr.Markdown(INTRODUCTION_TEXT, elem_id="Intro")

        with gr.Tabs():
            with gr.TabItem("🏅 Leaderboard_Large", elem_id="od-benchmark-tab-table", id=0):
                leaderboard_table = gr.components.Dataframe(
                    value=original_df,
                    datatype=TYPES,
                    label="Leaderboard_Big",
                    height=1000,
                    wrap=False,
                    interactive=False,
                    visible=True,
                    min_width=60,
                )

            with gr.TabItem("🏅 Leaderboard_Small", elem_id="od-benchmark-tab-table", id=1):
                leaderboard_table = gr.components.Dataframe(
                    value=Small_original_df,
                    datatype=TYPES,
                    label="Leaderboard_small",
                    height=1000,
                    wrap=False,
                    interactive=False,
                    visible=True,
                    min_width=60,
                )

        gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text-small")

        with gr.Row():
            with gr.Accordion("📙 Citation", open=False):
                gr.Textbox(
                    value=CITATION_TEXT,
                    lines=18,
                    label="",
                    elem_id="citation-button",
                    show_copy_button=True,
                )

    return demo


demo = build_demo(original_df, Small_original_df, TYPES)
demo.launch(share=True)
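A quick illustration (not part of the commit) of how the cell-wise formatter behaves; the one-row DataFrame below is hypothetical and only mirrors the shape of a leaderboard entry:

import pandas as pd

# Hypothetical one-row frame standing in for a leaderboard entry.
row = pd.DataFrame({"model": ["GPT-4"], "MMLU": [74.8123]})

def formatter(x):
    try:
        return round(x, 2)   # numeric cells are rounded to two decimals
    except TypeError:
        return x             # string cells such as the model name pass through unchanged

print(row.applymap(formatter))   # model = "GPT-4", MMLU = 74.81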
big (1).json
ADDED
@@ -0,0 +1,68 @@
[
    {
        "model": "GPT-4",
        "Average": 65.94,
        "MMLU": 74.8,
        "WinoGrande": 66.2,
        "PiQA": 61.6,
        "CommonsenseQA": 63.0,
        "Race": 67.0,
        "MedMCQA": 51.8,
        "OpenBookQA": 60.3
    },
    {
        "model": "Claude-3 Opus",
        "Average": 62.64,
        "MMLU": 70.4,
        "WinoGrande": 63.5,
        "PiQA": 59.1,
        "CommonsenseQA": 63.7,
        "Race": 66.2,
        "MedMCQA": 49.1,
        "OpenBookQA": 54.0
    },
    {
        "model": "Mistral Large",
        "Average": 61.45,
        "MMLU": 67.8,
        "WinoGrande": 56.8,
        "PiQA": 61.2,
        "CommonsenseQA": 55.4,
        "Race": 70.1,
        "MedMCQA": 43.4,
        "OpenBookQA": 58.7
    },
    {
        "model": "GPT-3.5",
        "Average": 59.06,
        "MMLU": 65.4,
        "WinoGrande": 54.6,
        "PiQA": 54.9,
        "CommonsenseQA": 67.9,
        "Race": 60.1,
        "MedMCQA": 41.4,
        "OpenBookQA": 49.9
    },
    {
        "model": "Gemini Pro",
        "Average": 54.45,
        "MMLU": 57.7,
        "WinoGrande": 56.4,
        "PiQA": 47.7,
        "CommonsenseQA": 50.6,
        "Race": 61.0,
        "MedMCQA": 37.5,
        "OpenBookQA": 52.5
    },
    {
        "model": "Llama3-70b-instruct",
        "Average": 54.06,
        "MMLU": 64.67,
        "WinoGrande": 57.14,
        "PiQA": 43.1,
        "CommonsenseQA": 55.49,
        "Race": 58.21,
        "MedMCQA": 41.67,
        "OpenBookQA": 41.93
    }
]
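A minimal sketch of how app.py consumes this file, assuming it is saved as big.json (the name pd.read_json in app.py expects) alongside the script:

import pandas as pd

# Each JSON object in the array becomes one DataFrame row.
df = pd.read_json("big.json")

# Show the models ranked by their reported Average score.
print(df[["model", "Average"]].sort_values("Average", ascending=False))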
constants.py
ADDED
@@ -0,0 +1,26 @@
banner_url = "https://huggingface.co/spaces/WildEval/WildBench-Leaderboard/resolve/main/%E2%80%8Eleaderboard_logo_v2.png"  # the same repo here.
BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'

INTRODUCTION_TEXT = """
# OS Benchmark (Evaluating LLMs with OS and MCQ)
🔗 [Website](https://github.com/VILA-Lab/MBZUAI-LLM-Leaderboard) | 💻 [GitHub](https://github.com/VILA-Lab/MBZUAI-LLM-Leaderboard) | 📖 [Paper](#) | 🐦 [Tweet 1](#) | 🐦 [Tweet 2](#)

> ### MBZUAI-LLM-Leaderboard is a new framework for evaluating large language models (LLMs) by transitioning from multiple-choice questions (MCQs) to open-style questions.
This approach addresses the inherent biases and limitations of MCQs, such as selection bias and the effect of random guessing. By utilizing open-style questions,
the framework aims to provide a more accurate assessment of LLMs' abilities across various benchmarks and to ensure that the evaluation reflects true capabilities,
particularly in terms of language understanding and reasoning.
"""

CITATION_TEXT = """@article{..,
  title={MBZUAI-LLM-Leaderboard: From Multi-choice to Open-style Questions for LLMs Evaluation, Benchmark, and Arena},
  author={},
  year={2024},
  archivePrefix={arXiv}
}
"""
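Note that BANNER is defined here but never referenced by app.py. A minimal, hypothetical sketch of how it could be rendered with the standard gr.HTML component (this call is not part of the commit):

import gradio as gr
from constants import BANNER

# Hypothetical usage: render the banner HTML at the top of a Blocks page.
with gr.Blocks() as demo:
    gr.HTML(BANNER)
demo.launch()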
requirements (1).txt
ADDED
@@ -0,0 +1,2 @@
gradio
pandas
small (1).json
ADDED
@@ -0,0 +1,134 @@
[
    {
        "model": "OPT (1.3B)",
        "Average": 7.84,
        "MMLU": 7.4,
        "WinoGrande": 12.47,
        "PiQA": 4.45,
        "CommonsenseQA": 7.61,
        "Race": 13.61,
        "MedMCQA": 1.25,
        "OpenBookQA": 4.48
    },
    {
        "model": "SlimPajama",
        "Average": 9.54,
        "MMLU": 9.22,
        "WinoGrande": 14.76,
        "PiQA": 5.32,
        "CommonsenseQA": 9.01,
        "Race": 16.19,
        "MedMCQA": 1.68,
        "OpenBookQA": 5.7
    },
    {
        "model": "OLMo (1B)",
        "Average": 8.8,
        "MMLU": 8.54,
        "WinoGrande": 6.16,
        "PiQA": 8.05,
        "CommonsenseQA": 13.1,
        "Race": 13.61,
        "MedMCQA": 2.1,
        "OpenBookQA": 6.11
    },
    {
        "model": "GPT-Neo (1.3B)",
        "Average": 7.38,
        "MMLU": 6.94,
        "WinoGrande": 10.81,
        "PiQA": 4.31,
        "CommonsenseQA": 6.34,
        "Race": 13.75,
        "MedMCQA": 2.63,
        "OpenBookQA": 4.89
    },
    {
        "model": "Cerebras-GPT (1.3B)",
        "Average": 4.84,
        "MMLU": 5.37,
        "WinoGrande": 9.31,
        "PiQA": 2.16,
        "CommonsenseQA": 6.2,
        "Race": 6.9,
        "MedMCQA": 1.04,
        "OpenBookQA": 3.46
    },
    {
        "model": "RedPajama (1B)",
        "Average": 9.01,
        "MMLU": 9.21,
        "WinoGrande": 16.97,
        "PiQA": 1.39,
        "CommonsenseQA": 11.41,
        "Race": 14.35,
        "MedMCQA": 1.86,
        "OpenBookQA": 3.87
    },
    {
        "model": "Pythia (1.4B)",
        "Average": 8.73,
        "MMLU": 9.66,
        "WinoGrande": 11.52,
        "PiQA": 4.17,
        "CommonsenseQA": 9.01,
        "Race": 12.76,
        "MedMCQA": 3.19,
        "OpenBookQA": 5.3
    },
    {
        "model": "TinyLLama (1.1B)",
        "Average": 8.39,
        "MMLU": 8.94,
        "WinoGrande": 12.23,
        "PiQA": 3.59,
        "CommonsenseQA": 6.06,
        "Race": 16.7,
        "MedMCQA": 2.07,
        "OpenBookQA": 4.68
    },
    {
        "model": "OELM (1B)",
        "Average": 8.99,
        "MMLU": 9.03,
        "WinoGrande": 10.18,
        "PiQA": 9.05,
        "CommonsenseQA": 7.75,
        "Race": 12.78,
        "MedMCQA": 2.5,
        "OpenBookQA": 6.31
    },
    {
        "model": "Phi-3-mini-128k-instruct (3.8B)",
        "Average": 39.73,
        "MMLU": 36.97,
        "WinoGrande": 46.88,
        "PiQA": 32.04,
        "CommonsenseQA": 49.15,
        "Race": 37.81,
        "MedMCQA": 22.61,
        "OpenBookQA": 33.6
    },
    {
        "model": "Gemma (2B)",
        "Average": 17.37,
        "MMLU": 17.52,
        "WinoGrande": 22.68,
        "PiQA": 15.09,
        "CommonsenseQA": 27.46,
        "Race": 14.32,
        "MedMCQA": 4.57,
        "OpenBookQA": 14.26
    },
    {
        "model": "Qwen (1.8B)",
        "Average": 21.61,
        "MMLU": 10.0,
        "WinoGrande": 40.97,
        "PiQA": 15.52,
        "CommonsenseQA": 31.13,
        "Race": 34.91,
        "MedMCQA": 4.7,
        "OpenBookQA": 20.37
    }
]