Spaces:

opencompass
/

CompassAcademic-Leaderboard-Full-Version

Running

CompassAcademic-Leaderboard-Full-Version

File size: 11,585 Bytes

import gradio as gr
import json
import pandas as pd
from urllib.request import urlopen
from urllib.error import URLError
import re
from datetime import datetime

# BibTeX entry offered in the "Citation" accordion. The raw string keeps the
# backslash of ``\url`` literal.
CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
    author={OpenCompass Contributors},
    howpublished = {\url{https://github.com/open-compass/opencompass}},
    year={2023}
}"""
# Label shown on the citation textbox.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"


# Label of the textbox that points at the published model predictions.
Predictions_BUTTON_LABEL = "All model predictions are listed here. Access this URL for more details."

# URL displayed (and copyable) inside that textbox.
Predictions_BUTTON_TEXT = "https://huggingface.co/datasets/opencompass/compass_academic_predictions"


# CSS snippet that lets the Gradio container use the full width on viewports
# of at least 1536px.
# NOTE(review): no live code in this file as shown references `head_style`;
# confirm whether it was meant to be passed to gr.Blocks.
head_style = """
<style>
@media (min-width: 1536px)
{
    .gradio-container {
        min-width: var(--size-full) !important;
    }
}
</style>
"""

# Prefix of the remote JSON files downloaded by findfile().
# NOTE(review): this is a plain-http URL; consider https if the bucket
# serves it — confirm before changing.
DATA_URL_BASE = "http://opencompass.oss-cn-shanghai.aliyuncs.com/dev-assets/hf-research/"

# Markdown text rendered through gr.Markdown at the top of the page, above
# the tabs.
MAIN_LEADERBOARD_DESCRIPTION = """## Compass Academic Leaderboard (Full Version)
The CompassAcademic currently focuses on the comprehensive reasoning abilities of LLMs.
- The datasets selected so far include General Knowledge Reasoning (MMLU-Pro/GPQA-Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Completion (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
- Currently, the evaluation primarily targets chat models, with updates featuring the latest community models at irregular intervals. 
- Prompts and reproduction scripts can be found in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)🏆.

"""
# Page title string.
# NOTE(review): appears unused by live code in this file as shown.
Initial_title = 'Compass Academic Leaderboard'

# Choices (and default selections) of the "Model Size" checkbox group; the
# labels are matched literally by the size filter in show_results_tab.
MODEL_SIZE = ['<10B', '10B-70B', '>70B', 'Unknown']
# Choices (and default selections) of the "Model Type" checkbox group; compared
# against the 'OpenSource' column of the results table.
MODEL_TYPE = ['API', 'OpenSource']



def findfile(base_url=None, timeout=30):
    """Download the model metadata and the aggregated results as parsed JSON.

    Args:
        base_url: URL prefix to which ``model-meta-info.json`` and
            ``hf-academic.json`` are appended. When ``None`` (the default)
            the module-level ``DATA_URL_BASE`` is used.
        timeout: Seconds passed to ``urlopen`` for each request, so a stalled
            connection cannot block start-up forever.

    Returns:
        A tuple ``(model_info, results)`` holding the decoded content of the
        two JSON files, in that order.

    Raises:
        urllib.error.URLError: If a file cannot be retrieved.
        json.JSONDecodeError: If a payload is not valid JSON.
    """
    if base_url is None:
        base_url = DATA_URL_BASE

    def _fetch_json(name):
        # The context manager closes the HTTP response even when reading or
        # decoding fails.
        with urlopen(f"{base_url}{name}.json", timeout=timeout) as response:
            return json.loads(response.read().decode('utf-8'))

    model_info = _fetch_json('model-meta-info')
    results = _fetch_json('hf-academic')
    return model_info, results

# Fetch the leaderboard data once at import time; the filter callback in
# show_results_tab and create_interface read these module-level objects.
model_info, results = findfile()


def findfile_predictions(path='data/hf-academic-predictions.json'):
    """Load the locally stored model predictions.

    Args:
        path: Location of the predictions JSON file. Defaults to the file
            bundled under ``data/``.

    Returns:
        The decoded JSON content of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit UTF-8 keeps decoding independent of the platform locale; the
    # ``with`` block already closes the file, so no manual close is needed.
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)



def make_results_tab(model_info, results):
    """Build the leaderboard table from model metadata and per-dataset scores.

    Args:
        model_info: Iterable of model records. Each record is indexed with the
            keys ``'abbr'``, ``'display_name'``, ``'release_time'``,
            ``'num_param'`` and ``'release_type'``.
        results: Mapping ``dataset name -> {model abbr -> score}``. The string
            ``'-'`` marks a missing score.

    Returns:
        A tuple ``(df, models_list, datasets_list)``: ``df`` has one row per
        model with at least one real score, numbered consecutively from 1 in
        the ``'Index'`` column; ``models_list`` is ``model_info`` as a list and
        ``datasets_list`` lists the dataset names in ``results`` order.
    """
    models_list = list(model_info)
    datasets_list = list(results)
    meta_columns = ['Index', 'Model Name', 'Release Time', 'Parameters', 'OpenSource']

    result_list = []
    for model in models_list:
        abbr = model['abbr']
        # A model absent from a dataset is treated like an explicit '-'
        # instead of aborting the whole table with a KeyError.
        scores = {dataset: results[dataset].get(abbr, '-') for dataset in datasets_list}
        if all(score == '-' for score in scores.values()):
            # Models without any score are left out of the table.
            continue
        result_list.append({
            'Index': len(result_list) + 1,
            'Model Name': model['display_name'],
            'Release Time': model['release_time'],
            'Parameters': model['num_param'],
            'OpenSource': model['release_type'],
            **scores,
        })

    # Passing the columns explicitly keeps the schema stable even when no
    # model qualifies, so later column lookups do not fail on an empty frame.
    df = pd.DataFrame(result_list, columns=meta_columns + datasets_list)
    return df, models_list, datasets_list



def calculate_column_widths(df):
    """Return one pixel width per column of ``df``, in column order.

    A column's width is the larger of 10px per header character and 8px per
    character of its longest stringified cell, plus 20px, limited to the
    range 160..400.
    """
    def _width_for(column):
        header_px = len(str(column)) * 10
        content_px = df[column].astype(str).map(len).max() * 8
        return max(160, min(400, max(header_px, content_px) + 20))

    return [_width_for(column) for column in df.columns]



def show_results_tab(df):
    """Build the "Results" tab: search box, filters, table and footer boxes.

    The components are created in the currently active Gradio layout context
    (``create_interface`` calls this inside ``gr.Blocks``/``gr.TabItem``).

    Args:
        df: Leaderboard table shown before any filter is applied.
    """
    # The textbox is pre-filled with this text; while it is unchanged, no
    # name search is applied.
    default_val = 'Input the Model Name'

    def get_size_in_B(param):
        """Parse a label such as ``'7B'`` into a float; NaN when unparsable."""
        if param == 'N/A':
            return float('nan')
        try:
            return float(param.replace('B', ''))
        except (AttributeError, TypeError, ValueError):
            # Non-string or non-numeric labels count as unknown size.
            return float('nan')

    def filter_df(model_name, size_ranges, model_types):
        """Return the leaderboard rows matching the three filter widgets.

        Args:
            model_name: Search text; matched case-insensitively as a substring
                of the visible model name.
            size_ranges: Selected entries of the "Model Size" group. An empty
                selection disables the size filter.
            model_types: Selected entries of the "Model Type" group. An empty
                selection disables the type filter.

        Returns:
            The filtered table. The 'Index' column is not renumbered.
        """
        newdf, _, _ = make_results_tab(model_info, results)
        if newdf.empty:
            return newdf

        # All criteria are combined into one boolean mask, so the table is
        # never mutated through temporary helper columns on a filtered slice.
        keep = pd.Series(True, index=newdf.index)

        # search model name
        if model_name and model_name != default_val:
            query = model_name.lower()
            # 'Model Name' cells may be HTML links; compare against the link
            # text only (text before '</a>' and after the last '>').
            matches = [
                query in cell.split('</a>')[0].split('>')[-1].lower()
                for cell in newdf['Model Name']
            ]
            keep &= pd.Series(matches, index=newdf.index)

        # filter size
        if size_ranges:
            sizes = newdf['Parameters'].apply(get_size_in_B)
            size_mask = pd.Series(False, index=newdf.index)
            for size_range in size_ranges:
                if size_range == '<10B':
                    size_mask |= sizes < 10
                elif size_range == '10B-70B':
                    size_mask |= (sizes >= 10) & (sizes < 70)
                elif size_range == '>70B':
                    # Exactly 70 falls in this bucket.
                    size_mask |= sizes >= 70
                elif size_range == 'Unknown':
                    size_mask |= sizes.isna()
            keep &= size_mask

        # filter opensource
        if model_types:
            selected = [t for t in model_types if t in ('API', 'OpenSource')]
            keep &= newdf['OpenSource'].isin(selected)

        return newdf[keep]

    with gr.Row():
        with gr.Column():
            model_name = gr.Textbox(
                value=default_val,
                label='Search Model Name',
                interactive=True
            )
        with gr.Column():
            size_filter = gr.CheckboxGroup(
                choices=MODEL_SIZE,
                value=MODEL_SIZE,
                label='Model Size',
                interactive=True,
            )
        with gr.Column():
            type_filter = gr.CheckboxGroup(
                choices=MODEL_TYPE,
                value=MODEL_TYPE,
                label='Model Type',
                interactive=True,
            )

    with gr.Column():
        table = gr.DataFrame(
            value=df,
            interactive=False,
            wrap=False,
            column_widths=calculate_column_widths(df),
        )

    # Every control refreshes the same table with the same inputs: the search
    # box on submit, the checkbox groups on change.
    filter_inputs = [model_name, size_filter, type_filter]
    for register in (model_name.submit, size_filter.change, type_filter.change):
        register(fn=filter_df, inputs=filter_inputs, outputs=table)

    with gr.Row():
        with gr.Accordion("Storage of Model Predictions", open=True):
            gr.Textbox(
                value=Predictions_BUTTON_TEXT,
                label=Predictions_BUTTON_LABEL,
                elem_id='predictions-button',
                lines=2,  # initial number of visible lines
                max_lines=4,  # upper bound on visible lines
                show_copy_button=True  # copy button for convenience
            )

    with gr.Row():
        with gr.Accordion("Citation", open=True):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id='citation-button',
                lines=6,  # initial number of visible lines
                max_lines=8,  # upper bound on visible lines
                show_copy_button=True  # copy button for convenience
            )

# Column data for the one-row table that get_pre_df (in show_predictions_tab)
# returns when the selected dataset/model pair has no predictions.
ERROR_DF = {
    "Type": ['NoneType'],
    "Details": ["Do not find the combination predictions of the two options above."]
}

def show_predictions_tab(model_list, dataset_list, predictions):
    """Build the "Predictions" tab: two dropdowns and a predictions table.

    The components are created in the currently active Gradio layout context.

    Args:
        model_list: Model records; the ``'abbr'`` of each one becomes a choice
            of the model dropdown.
        dataset_list: Dataset names offered in the dataset dropdown.
        predictions: Nested mapping indexed as
            ``predictions[dataset][model]['predictions']``, which yields a
            list of per-sample records (dicts).
    """

    def get_pre_df(model_name, dataset_name):
        """Return the predictions of one model on one dataset as a DataFrame.

        When the combination is unknown, the one-row ``ERROR_DF`` table is
        returned instead.
        """
        if dataset_name not in predictions or model_name not in predictions[dataset_name]:
            return pd.DataFrame(ERROR_DF)

        rows = []
        for record in predictions[dataset_name][model_name]['predictions']:
            # Work on a copy so the shared `predictions` data is not modified
            # by rendering it.
            row = dict(record)
            for key in ('origin_prompt', 'gold'):
                if key in row:
                    # Stringify so nested values display as plain text.
                    row[key] = str(row[key])
            rows.append(row)
        return pd.DataFrame(rows)

    model_abbrs = [model['abbr'] for model in model_list]
    # Table content shown before the user selects anything.
    initial_predictions = get_pre_df('MiniMax-Text-01', 'IFEval')

    with gr.Row():
        with gr.Column():
            model_drop = gr.Dropdown(
                label="Model Name",
                choices=model_abbrs,  # abbreviations of all known models
                interactive=True
            )
        with gr.Column():
            dataset_drop = gr.Dropdown(
                label="Dataset Name",
                choices=dataset_list,  # all dataset names
                interactive=True
            )

    with gr.Column():
        table = gr.DataFrame(
            value=initial_predictions,
            interactive=False,
            wrap=False,
            max_height=1000,
            column_widths=calculate_column_widths(initial_predictions),
        )

    # Changing either dropdown reloads the table for the current pair.
    for dropdown in (model_drop, dataset_drop):
        dropdown.change(
            fn=get_pre_df,
            inputs=[model_drop, dataset_drop],
            outputs=table,
        )

    with gr.Row():
        with gr.Accordion("Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id='citation-button',
                lines=6,  # initial number of visible lines
                max_lines=8,  # upper bound on visible lines
                show_copy_button=True  # copy button for convenience
            )


def create_interface():
    """Assemble the Gradio app and return it without launching.

    Returns:
        The ``gr.Blocks`` instance containing the description and the
        "Results" tab.
    """
    df, _, _ = make_results_tab(model_info, results)

    with gr.Blocks() as demo:
        gr.Markdown(MAIN_LEADERBOARD_DESCRIPTION)
        with gr.Tabs(elem_classes='tab-buttons'):
            with gr.TabItem('Results', elem_id='main', id=0):
                show_results_tab(df)

            # NOTE: the 'Predictions' tab is disabled, so the predictions file
            # is no longer loaded here. To re-enable it, load the data with
            # findfile_predictions() and call show_predictions_tab(...) with
            # the model and dataset lists from make_results_tab inside a
            # gr.TabItem('Predictions', elem_id='notmain', id=1).

    return demo

# model_info, results = findfile()
# breakpoint()

if __name__ == '__main__':
    # Build the UI, enable Gradio's request queue, then start the server.
    demo = create_interface()
    demo.queue()
    # '0.0.0.0' binds to all network interfaces rather than localhost only.
    demo.launch(server_name='0.0.0.0')