import json
from urllib.error import URLError
from urllib.request import urlopen

import gradio as gr
import pandas as pd

CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
    author={OpenCompass Contributors},
    howpublished = {\url{https://github.com/open-compass/opencompass}},
    year={2023}
}"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
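
# The two constants above are presumably wired into a copyable citation
# widget. A minimal sketch of how this is commonly done in Gradio
# leaderboards; render_citation_accordion is a hypothetical helper, not part
# of the original file, and must be called inside a gr.Blocks context:
def render_citation_accordion():
    with gr.Accordion(CITATION_BUTTON_LABEL, open=False):
        gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            lines=7,
            label="BibTeX",
            show_copy_button=True,  # lets users copy the snippet in one click
        )
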
head_style = """
<style>
@media (min-width: 1536px)
{
.gradio-container {
min-width: var(--size-full) !important;
}
}
</style>
"""
DATA_URL_BASE = "http://opencompass.oss-cn-shanghai.aliyuncs.com/dev-assets/hf-research/"

def findfile():
    """Fetch model metadata and aggregated results from the OpenCompass OSS bucket."""
    def fetch_json(name):
        url = f"{DATA_URL_BASE}{name}.json"
        try:
            with urlopen(url) as response:
                return json.loads(response.read().decode('utf-8'))
        except URLError as err:
            raise RuntimeError(f'Failed to download {url}') from err

    model_info = fetch_json('model-meta-info')
    results = fetch_json('hf-academic')
    return model_info, results
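
# pandas is imported above, but the table-building code is not part of this
# snippet. A minimal sketch of how the fetched results might be turned into a
# leaderboard table, assuming `results` maps model names to per-benchmark
# scores; both the schema and build_main_table itself are assumptions for
# illustration, not the original implementation:
def build_main_table(results):
    df = pd.DataFrame.from_dict(results, orient='index')  # one row per model
    df.index.name = 'Model'
    # Rank models by their mean score across the benchmark columns.
    df['Average'] = df.mean(axis=1, numeric_only=True)
    return df.sort_values('Average', ascending=False).reset_index()
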
MAIN_LEADERBOARD_DESCRIPTION = """## Main Evaluation Results
The CompassAcademic currently focuses on the comprehensive reasoning abilities of LLMs.
- The datasets selected so far include General Knowledge Reasoning (MMLU-Pro/GPQA-Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Completion (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
- Currently, the evaluation primarily targets chat models, with updates featuring the latest community models at irregular intervals.
- Prompts and reproduction scripts can be found in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass) 🏆.
"""
def create_interface():
    model_info, results = findfile()
    with gr.Blocks(title="Math Leaderboard", head=head_style) as demo:
        with gr.Tabs(elem_classes='tab-buttons') as tabs:
            with gr.TabItem('Results', elem_id='main', id=0):
                # math_main_tab(results)
                pass
            with gr.TabItem('Predictions', elem_id='notmain', id=1):
                # dataset_tab(results, structs[i], dataset)
                pass
    return demo

if __name__ == '__main__':
    demo = create_interface()
    demo.queue()
    demo.launch(server_name='0.0.0.0')