Upload 5 files
- app.py +99 -0
- big (1).json +68 -0
- constants.py +26 -0
- requirements (1).txt +2 -0
- small (1).json +134 -0
app.py
ADDED
@@ -0,0 +1,99 @@
import gradio as gr
import pandas as pd

from constants import INTRODUCTION_TEXT, CITATION_TEXT


# Round numeric cells to two decimals; leave non-numeric cells (e.g. model names) untouched.
def formatter(x):
    try:
        return round(x, 2)
    except TypeError:
        return x


# Load the leaderboard data (the JSON files must sit next to app.py).
original_df = pd.read_json('big.json')
print(original_df)

Small_original_df = pd.read_json('small.json')
print(Small_original_df)

# Apply the formatter to every cell of both DataFrames.
original_df = original_df.applymap(formatter)
Small_original_df = Small_original_df.applymap(formatter)


# Column types for the Gradio Dataframe component: the model name, then eight numeric scores.
TYPES = ['str'] + ['number'] * 8


LAST_UPDATED = "May 10th 2024"

# CSS for styling
css = """
.markdown-text {font-size: 200pt}
.markdown-text-small {font-size: 13pt}
th {
    text-align: center;
}
td {
    font-size: 15px; /* Adjust the font size as needed */
    text-align: center;
}
#od-benchmark-tab-table-button {
    font-size: 15pt;
    font-weight: bold;
}
#Intro {
    font-size: 100pt;
}
"""


def build_demo(original_df, Small_original_df, TYPES):
    with gr.Blocks(css=css) as demo:
        gr.Markdown(INTRODUCTION_TEXT, elem_id="Intro")

        with gr.Tabs():
            with gr.TabItem("🏅 Leaderboard_Large", elem_id="od-benchmark-tab-table", id=0):
                leaderboard_table = gr.components.Dataframe(
                    value=original_df,
                    datatype=TYPES,
                    label="Leaderboard_Big",
                    height=1000,
                    wrap=False,
                    interactive=False,
                    visible=True,
                    min_width=60,
                )

            with gr.TabItem("🏅 Leaderboard_Small", elem_id="od-benchmark-tab-table", id=1):
                leaderboard_table = gr.components.Dataframe(
                    value=Small_original_df,
                    datatype=TYPES,
                    label="Leaderboard_small",
                    height=1000,
                    wrap=False,
                    interactive=False,
                    visible=True,
                    min_width=60,
                )

        gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text-small")

        with gr.Row():
            with gr.Accordion("📙 Citation", open=False):
                gr.Textbox(
                    value=CITATION_TEXT,
                    lines=18,
                    label="",
                    elem_id="citation-button",
                    show_copy_button=True,
                )

    return demo


demo = build_demo(original_df, Small_original_df, TYPES)
demo.launch(share=True)
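A quick illustration (not part of the commit) of how the cell-wise formatter behaves; the one-row DataFrame below is hypothetical and only mirrors the shape of a leaderboard entry:

import pandas as pd

# Hypothetical one-row frame standing in for a leaderboard entry.
row = pd.DataFrame({"model": ["GPT-4"], "MMLU": [74.8123]})

def formatter(x):
    try:
        return round(x, 2)   # numeric cells are rounded to two decimals
    except TypeError:
        return x             # string cells such as the model name pass through unchanged

print(row.applymap(formatter))   # model = "GPT-4", MMLU = 74.81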
big (1).json
ADDED
@@ -0,0 +1,68 @@
[
    {
        "model": "GPT-4",
        "Average": 65.94,
        "MMLU": 74.8,
        "WinoGrande": 66.2,
        "PiQA": 61.6,
        "CommonsenseQA": 63.0,
        "Race": 67.0,
        "MedMCQA": 51.8,
        "OpenBookQA": 60.3
    },
    {
        "model": "Claude-3 Opus",
        "Average": 62.64,
        "MMLU": 70.4,
        "WinoGrande": 63.5,
        "PiQA": 59.1,
        "CommonsenseQA": 63.7,
        "Race": 66.2,
        "MedMCQA": 49.1,
        "OpenBookQA": 54.0
    },
    {
        "model": "Mistral Large",
        "Average": 61.45,
        "MMLU": 67.8,
        "WinoGrande": 56.8,
        "PiQA": 61.2,
        "CommonsenseQA": 55.4,
        "Race": 70.1,
        "MedMCQA": 43.4,
        "OpenBookQA": 58.7
    },
    {
        "model": "GPT-3.5",
        "Average": 59.06,
        "MMLU": 65.4,
        "WinoGrande": 54.6,
        "PiQA": 54.9,
        "CommonsenseQA": 67.9,
        "Race": 60.1,
        "MedMCQA": 41.4,
        "OpenBookQA": 49.9
    },
    {
        "model": "Gemini Pro",
        "Average": 54.45,
        "MMLU": 57.7,
        "WinoGrande": 56.4,
        "PiQA": 47.7,
        "CommonsenseQA": 50.6,
        "Race": 61.0,
        "MedMCQA": 37.5,
        "OpenBookQA": 52.5
    },
    {
        "model": "Llama3-70b-instruct",
        "Average": 54.06,
        "MMLU": 64.67,
        "WinoGrande": 57.14,
        "PiQA": 43.1,
        "CommonsenseQA": 55.49,
        "Race": 58.21,
        "MedMCQA": 41.67,
        "OpenBookQA": 41.93
    }
]
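A minimal sketch of how app.py consumes this file, assuming it is saved as big.json (the name pd.read_json in app.py expects) alongside the script:

import pandas as pd

# Each JSON object in the array becomes one DataFrame row.
df = pd.read_json("big.json")

# Show the models ranked by their reported Average score.
print(df[["model", "Average"]].sort_values("Average", ascending=False))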
constants.py
ADDED
@@ -0,0 +1,26 @@
banner_url = "https://huggingface.co/spaces/WildEval/WildBench-Leaderboard/resolve/main/%E2%80%8Eleaderboard_logo_v2.png"  # the same repo here.
BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'

INTRODUCTION_TEXT = """
# OS Benchmark (Evaluating LLMs with OS and MCQ)
🔗 [Website](https://github.com/VILA-Lab/MBZUAI-LLM-Leaderboard) | 💻 [GitHub](https://github.com/VILA-Lab/MBZUAI-LLM-Leaderboard) | 📖 [Paper](#) | 🐦 [Tweet 1](#) | 🐦 [Tweet 2](#)

> ### MBZUAI-LLM-Leaderboard is a new framework for evaluating large language models (LLMs) by transitioning from multiple-choice questions (MCQs) to open-style questions.
This approach addresses the inherent biases and limitations of MCQs, such as selection bias and the effect of random guessing. By utilizing open-style questions,
the framework aims to provide a more accurate assessment of LLMs' abilities across various benchmarks and to ensure that the evaluation reflects true capabilities,
particularly in terms of language understanding and reasoning.
"""

CITATION_TEXT = """@article{..,
  title={MBZUAI-LLM-Leaderboard: From Multi-choice to Open-style Questions for LLMs Evaluation, Benchmark, and Arena},
  author={},
  year={2024},
  archivePrefix={arXiv}
}
"""
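Note that BANNER is defined here but never referenced by app.py. A minimal, hypothetical sketch of how it could be rendered with the standard gr.HTML component (this call is not part of the commit):

import gradio as gr
from constants import BANNER

# Hypothetical usage: render the banner HTML at the top of a Blocks page.
with gr.Blocks() as demo:
    gr.HTML(BANNER)
demo.launch()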
requirements (1).txt
ADDED
@@ -0,0 +1,2 @@
gradio
pandas
small (1).json
ADDED
@@ -0,0 +1,134 @@
[
    {
        "model": "OPT (1.3B)",
        "Average": 7.84,
        "MMLU": 7.4,
        "WinoGrande": 12.47,
        "PiQA": 4.45,
        "CommonsenseQA": 7.61,
        "Race": 13.61,
        "MedMCQA": 1.25,
        "OpenBookQA": 4.48
    },
    {
        "model": "SlimPajama",
        "Average": 9.54,
        "MMLU": 9.22,
        "WinoGrande": 14.76,
        "PiQA": 5.32,
        "CommonsenseQA": 9.01,
        "Race": 16.19,
        "MedMCQA": 1.68,
        "OpenBookQA": 5.7
    },
    {
        "model": "OLMo (1B)",
        "Average": 8.8,
        "MMLU": 8.54,
        "WinoGrande": 6.16,
        "PiQA": 8.05,
        "CommonsenseQA": 13.1,
        "Race": 13.61,
        "MedMCQA": 2.1,
        "OpenBookQA": 6.11
    },
    {
        "model": "GPT-Neo (1.3B)",
        "Average": 7.38,
        "MMLU": 6.94,
        "WinoGrande": 10.81,
        "PiQA": 4.31,
        "CommonsenseQA": 6.34,
        "Race": 13.75,
        "MedMCQA": 2.63,
        "OpenBookQA": 4.89
    },
    {
        "model": "Cerebras-GPT (1.3B)",
        "Average": 4.84,
        "MMLU": 5.37,
        "WinoGrande": 9.31,
        "PiQA": 2.16,
        "CommonsenseQA": 6.2,
        "Race": 6.9,
        "MedMCQA": 1.04,
        "OpenBookQA": 3.46
    },
    {
        "model": "RedPajama (1B)",
        "Average": 9.01,
        "MMLU": 9.21,
        "WinoGrande": 16.97,
        "PiQA": 1.39,
        "CommonsenseQA": 11.41,
        "Race": 14.35,
        "MedMCQA": 1.86,
        "OpenBookQA": 3.87
    },
    {
        "model": "Pythia (1.4B)",
        "Average": 8.73,
        "MMLU": 9.66,
        "WinoGrande": 11.52,
        "PiQA": 4.17,
        "CommonsenseQA": 9.01,
        "Race": 12.76,
        "MedMCQA": 3.19,
        "OpenBookQA": 5.3
    },
    {
        "model": "TinyLLama (1.1B)",
        "Average": 8.39,
        "MMLU": 8.94,
        "WinoGrande": 12.23,
        "PiQA": 3.59,
        "CommonsenseQA": 6.06,
        "Race": 16.7,
        "MedMCQA": 2.07,
        "OpenBookQA": 4.68
    },
    {
        "model": "OELM (1B)",
        "Average": 8.99,
        "MMLU": 9.03,
        "WinoGrande": 10.18,
        "PiQA": 9.05,
        "CommonsenseQA": 7.75,
        "Race": 12.78,
        "MedMCQA": 2.5,
        "OpenBookQA": 6.31
    },
    {
        "model": "Phi-3-mini-128k-instruct (3.8B)",
        "Average": 39.73,
        "MMLU": 36.97,
        "WinoGrande": 46.88,
        "PiQA": 32.04,
        "CommonsenseQA": 49.15,
        "Race": 37.81,
        "MedMCQA": 22.61,
        "OpenBookQA": 33.6
    },
    {
        "model": "Gemma (2B)",
        "Average": 17.37,
        "MMLU": 17.52,
        "WinoGrande": 22.68,
        "PiQA": 15.09,
        "CommonsenseQA": 27.46,
        "Race": 14.32,
        "MedMCQA": 4.57,
        "OpenBookQA": 14.26
    },
    {
        "model": "Qwen (1.8B)",
        "Average": 21.61,
        "MMLU": 10.0,
        "WinoGrande": 40.97,
        "PiQA": 15.52,
        "CommonsenseQA": 31.13,
        "Race": 34.91,
        "MedMCQA": 4.7,
        "OpenBookQA": 20.37
    }
]