import json
from urllib.error import URLError
from urllib.request import urlopen

import gradio as gr
import pandas as pd

CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
    author={OpenCompass Contributors},
    howpublished = {\url{https://github.com/open-compass/opencompass}},
    year={2023}
}"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

PREDICTIONS_BUTTON_LABEL = "All model predictions are listed here. Access this URL for more details."
PREDICTIONS_BUTTON_TEXT = "https://huggingface.co/datasets/opencompass/compass_academic_predictions"

head_style = """
<style>
@media (min-width: 1536px) {
    .gradio-container {
        min-width: var(--size-full) !important;
    }
}
</style>
"""

DATA_URL_BASE = "http://opencompass.oss-cn-shanghai.aliyuncs.com/dev-assets/hf-research/"

MAIN_LEADERBOARD_DESCRIPTION = """## Compass Academic Leaderboard (Full Version)

CompassAcademic currently focuses on the comprehensive reasoning abilities of LLMs.
- The datasets selected so far cover General Knowledge Reasoning (MMLU-Pro, GPQA-Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Completion (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
- The evaluation currently targets chat models; the latest community models are added at irregular intervals.
- Prompts and reproduction scripts are available in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)🏆.
"""

INITIAL_TITLE = 'Compass Academic Leaderboard'

MODEL_SIZE = ['<10B', '10B-70B', '>70B', 'Unknown']
MODEL_TYPE = ['API', 'OpenSource']

def findfile():
    """Fetch model metadata and aggregated results from the OpenCompass OSS bucket."""
    model_meta_info = 'model-meta-info'
    results_sum = 'hf-academic'

    try:
        with urlopen(f"{DATA_URL_BASE}{model_meta_info}.json") as response:
            model_info = json.loads(response.read().decode('utf-8'))
        with urlopen(f"{DATA_URL_BASE}{results_sum}.json") as response:
            results = json.loads(response.read().decode('utf-8'))
    except URLError as e:
        raise RuntimeError(f'Failed to fetch leaderboard data from {DATA_URL_BASE}: {e}') from e

    return model_info, results
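
# For reference, a minimal sketch of the shapes the two payloads above are
# assumed to have, inferred from how they are consumed below (the example
# values are illustrative, not confirmed):
#
#   model_info: list of dicts such as
#       {'abbr': 'model-abbr', 'display_name': 'Model', 'release_time': '2024-01',
#        'num_param': '7B', 'release_type': 'OpenSource'}
#   results: dict mapping dataset name -> {model abbr -> score or '-'}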

model_info, results = findfile()


def findfile_predictions():
    """Load cached model predictions from the local data directory."""
    with open('data/hf-academic-predictions.json', 'r') as file:
        predictions = json.load(file)
    return predictions
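
# The predictions file is assumed to be nested as
#   {dataset name: {model abbr: {'predictions': [record, ...]}}}
# matching the lookups in get_pre_df() below.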


def make_results_tab(model_info, results):
    """Build the leaderboard DataFrame from model metadata and per-dataset results."""
    models_list = list(model_info)
    datasets_list = list(results.keys())

    result_list = []
    index = 1
    for model in models_list:
        this_result = {
            'Index': index,
            'Model Name': model['display_name'],
            'Release Time': model['release_time'],
            'Parameters': model['num_param'],
            'OpenSource': model['release_type'],
        }
        has_any_result = False
        for dataset in datasets_list:
            score = results[dataset].get(model['abbr'], '-')
            if score != '-':
                has_any_result = True
            this_result[dataset] = score
        # Keep only models with at least one reported score.
        if has_any_result:
            result_list.append(this_result)
            index += 1

    df = pd.DataFrame(result_list)
    return df, models_list, datasets_list
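
# The resulting DataFrame has one row per model: the metadata columns
# ('Index', 'Model Name', 'Release Time', 'Parameters', 'OpenSource')
# followed by one score column per dataset.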


def calculate_column_widths(df):
    """Estimate a pixel width for each column from its header and cell lengths."""
    column_widths = []
    for column in df.columns:
        header_length = len(str(column))
        max_content_length = df[column].astype(str).map(len).max()
        # Weight headers more heavily than cell text, then clamp to [160, 400] px.
        width = max(header_length * 10, max_content_length * 8) + 20
        width = max(160, min(400, width))
        column_widths.append(width)
    return column_widths
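
# A quick worked example of the sizing rule above (hypothetical column):
# a 10-character header with cells up to 30 characters wide gives
# max(10 * 10, 30 * 8) + 20 = 260 px, which lies inside [160, 400] and is
# used as-is; narrower columns are padded up to 160 px, wider ones capped
# at 400 px.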


def show_results_tab(df):

    def filter_df(model_name, size_ranges, model_types):
        # Rebuild the full table, then narrow it down by each active filter.
        newdf, modellist, datasetlist = make_results_tab(model_info, results)

        # Name search: skip if the box is empty or still holds its placeholder.
        default_val = 'Input the Model Name'
        if model_name and model_name != default_val:
            # Display names may be wrapped in HTML links; strip the markup
            # before substring matching.
            method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in newdf['Model Name']]
            flag = [model_name.lower() in name for name in method_names]
            newdf = newdf[flag]

        # Parameter-count filter.
        if size_ranges:
            def get_size_in_B(param):
                if param == 'N/A':
                    return None
                try:
                    return float(param.replace('B', ''))
                except (ValueError, AttributeError):
                    return None

            newdf['size_in_B'] = newdf['Parameters'].apply(get_size_in_B)
            mask = pd.Series(False, index=newdf.index)

            for size_range in size_ranges:
                if size_range == '<10B':
                    mask |= (newdf['size_in_B'] < 10) & newdf['size_in_B'].notna()
                elif size_range == '10B-70B':
                    mask |= (newdf['size_in_B'] >= 10) & (newdf['size_in_B'] < 70)
                elif size_range == '>70B':
                    mask |= newdf['size_in_B'] >= 70
                elif size_range == 'Unknown':
                    mask |= newdf['size_in_B'].isna()

            newdf = newdf[mask]
            newdf.drop('size_in_B', axis=1, inplace=True)

        # Model-type filter (API vs. OpenSource).
        if model_types:
            newdf = newdf[newdf['OpenSource'].isin(model_types)]

        return newdf

    with gr.Row():
        with gr.Column():
            model_name = gr.Textbox(
                value='Input the Model Name',
                label='Search Model Name',
                interactive=True,
            )
        with gr.Column():
            size_filter = gr.CheckboxGroup(
                choices=MODEL_SIZE,
                value=MODEL_SIZE,
                label='Model Size',
                interactive=True,
            )
        with gr.Column():
            type_filter = gr.CheckboxGroup(
                choices=MODEL_TYPE,
                value=MODEL_TYPE,
                label='Model Type',
                interactive=True,
            )

    with gr.Column():
        table = gr.DataFrame(
            value=df,
            interactive=False,
            wrap=False,
            column_widths=calculate_column_widths(df),
        )

    # Re-filter the table whenever the search text is submitted or a filter changes.
    model_name.submit(
        fn=filter_df,
        inputs=[model_name, size_filter, type_filter],
        outputs=table,
    )
    size_filter.change(
        fn=filter_df,
        inputs=[model_name, size_filter, type_filter],
        outputs=table,
    )
    type_filter.change(
        fn=filter_df,
        inputs=[model_name, size_filter, type_filter],
        outputs=table,
    )

    with gr.Row():
        with gr.Accordion("Storage of Model Predictions", open=True):
            predictions_button = gr.Textbox(
                value=PREDICTIONS_BUTTON_TEXT,
                label=PREDICTIONS_BUTTON_LABEL,
                elem_id='predictions-button',
                lines=2,
                max_lines=4,
                show_copy_button=True,
            )

    with gr.Row():
        with gr.Accordion("Citation", open=True):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id='citation-button',
                lines=6,
                max_lines=8,
                show_copy_button=True,
            )


# Placeholder table shown when no predictions exist for the selected
# model/dataset pair.
ERROR_DF = {
    "Type": ['NoneType'],
    "Details": ["No predictions found for the combination of the two options above."],
}


def show_predictions_tab(model_list, dataset_list, predictions):

    def get_pre_df(model_name, dataset_name):
        # Fall back to the placeholder table when the pair has no predictions.
        if dataset_name not in predictions or model_name not in predictions[dataset_name]:
            return pd.DataFrame(ERROR_DF)

        this_predictions = predictions[dataset_name][model_name]['predictions']
        # Stringify nested fields so they render cleanly in the table.
        for record in this_predictions:
            record['origin_prompt'] = str(record['origin_prompt'])
            record['gold'] = str(record['gold'])
        return pd.DataFrame(this_predictions)
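
    # Each prediction record is assumed to carry at least the 'origin_prompt'
    # and 'gold' keys handled above, plus the model's output; any further
    # fields in the JSON pass through to the table unchanged.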

    model_list = [i['abbr'] for i in model_list]
    initial_predictions = get_pre_df('MiniMax-Text-01', 'IFEval')

    # Default both dropdowns to the pair used for the initial table above.
    with gr.Row():
        with gr.Column():
            model_drop = gr.Dropdown(
                label="Model Name",
                choices=model_list,
                value='MiniMax-Text-01',
                interactive=True,
            )
        with gr.Column():
            dataset_drop = gr.Dropdown(
                label="Dataset Name",
                choices=dataset_list,
                value='IFEval',
                interactive=True,
            )

    with gr.Column():
        table = gr.DataFrame(
            value=initial_predictions,
            interactive=False,
            wrap=False,
            max_height=1000,
            column_widths=calculate_column_widths(initial_predictions),
        )

    # Reload the table whenever either dropdown changes.
    model_drop.change(
        fn=get_pre_df,
        inputs=[model_drop, dataset_drop],
        outputs=table,
    )
    dataset_drop.change(
        fn=get_pre_df,
        inputs=[model_drop, dataset_drop],
        outputs=table,
    )

    with gr.Row():
        with gr.Accordion("Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id='citation-button',
                lines=6,
                max_lines=8,
                show_copy_button=True,
            )


def create_interface():
    df, model_list, dataset_list = make_results_tab(model_info, results)
    predictions = findfile_predictions()

    # Apply the page title and the wide-screen CSS defined at the top of the file.
    with gr.Blocks(title=INITIAL_TITLE, head=head_style) as demo:
        gr.Markdown(MAIN_LEADERBOARD_DESCRIPTION)
        with gr.Tabs(elem_classes='tab-buttons') as tabs:
            with gr.TabItem('Results', elem_id='main', id=0):
                show_results_tab(df)
            # Wire in the predictions browser so the loaded predictions are used.
            with gr.TabItem('Predictions', elem_id='predictions', id=1):
                show_predictions_tab(model_list, dataset_list, predictions)

    return demo


if __name__ == '__main__':
    demo = create_interface()
    demo.queue()
    demo.launch(server_name='0.0.0.0')