import json
from urllib.error import URLError
from urllib.request import urlopen

import gradio as gr
import pandas as pd

CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
    author={OpenCompass Contributors},
    howpublished = {\url{https://github.com/open-compass/opencompass}},
    year={2023}
}"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

PREDICTIONS_BUTTON_LABEL = "All model predictions are listed here. Access this URL for more details."
PREDICTIONS_BUTTON_TEXT = "https://huggingface.co/datasets/opencompass/compass_academic_predictions"

head_style = """
<style>
@media (min-width: 1536px) {
    .gradio-container {
        min-width: var(--size-full) !important;
    }
}
</style>
"""

DATA_URL_BASE = "http://opencompass.oss-cn-shanghai.aliyuncs.com/dev-assets/hf-research/"

MAIN_LEADERBOARD_DESCRIPTION = """## Compass Academic Leaderboard (Full Version)

CompassAcademic currently focuses on the comprehensive reasoning abilities of LLMs.
- The datasets selected so far cover General Knowledge Reasoning (MMLU-Pro, GPQA-Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Completion (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
- The evaluation currently targets chat models; the latest community models are added at irregular intervals.
- Prompts and reproduction scripts are available in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)🏆.
"""

INITIAL_TITLE = 'Compass Academic Leaderboard'

MODEL_SIZE = ['<10B', '10B-70B', '>70B', 'Unknown']
MODEL_TYPE = ['API', 'OpenSource']

def findfile():
    """Fetch model metadata and aggregated results from the OpenCompass OSS bucket."""
    model_meta_info = 'model-meta-info'
    results_sum = 'hf-academic'

    try:
        with urlopen(f"{DATA_URL_BASE}{model_meta_info}.json") as response:
            model_info = json.loads(response.read().decode('utf-8'))
        with urlopen(f"{DATA_URL_BASE}{results_sum}.json") as response:
            results = json.loads(response.read().decode('utf-8'))
    except URLError as e:
        raise RuntimeError(f'Failed to fetch leaderboard data from {DATA_URL_BASE}: {e}') from e

    return model_info, results
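
# For reference, a minimal sketch of the shapes the two payloads above are
# assumed to have, inferred from how they are consumed below (the example
# values are illustrative, not confirmed):
#
#   model_info: list of dicts such as
#       {'abbr': 'model-abbr', 'display_name': 'Model', 'release_time': '2024-01',
#        'num_param': '7B', 'release_type': 'OpenSource'}
#   results: dict mapping dataset name -> {model abbr -> score or '-'}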

model_info, results = findfile()


def findfile_predictions():
    """Load cached model predictions from the local data directory."""
    with open('data/hf-academic-predictions.json', 'r') as file:
        predictions = json.load(file)
    return predictions
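
# The predictions file is assumed to be nested as
#   {dataset name: {model abbr: {'predictions': [record, ...]}}}
# matching the lookups in get_pre_df() below.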


def make_results_tab(model_info, results):
    """Build the leaderboard DataFrame from model metadata and per-dataset results."""
    models_list = list(model_info)
    datasets_list = list(results.keys())

    result_list = []
    index = 1
    for model in models_list:
        this_result = {
            'Index': index,
            'Model Name': model['display_name'],
            'Release Time': model['release_time'],
            'Parameters': model['num_param'],
            'OpenSource': model['release_type'],
        }
        has_any_result = False
        for dataset in datasets_list:
            score = results[dataset].get(model['abbr'], '-')
            if score != '-':
                has_any_result = True
            this_result[dataset] = score
        # Keep only models with at least one reported score.
        if has_any_result:
            result_list.append(this_result)
            index += 1

    df = pd.DataFrame(result_list)
    return df, models_list, datasets_list
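
# The resulting DataFrame has one row per model: the metadata columns
# ('Index', 'Model Name', 'Release Time', 'Parameters', 'OpenSource')
# followed by one score column per dataset.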


def calculate_column_widths(df):
    """Estimate a pixel width for each column from its header and cell lengths."""
    column_widths = []
    for column in df.columns:
        header_length = len(str(column))
        max_content_length = df[column].astype(str).map(len).max()
        # Weight headers more heavily than cell text, then clamp to [160, 400] px.
        width = max(header_length * 10, max_content_length * 8) + 20
        width = max(160, min(400, width))
        column_widths.append(width)
    return column_widths
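
# A quick worked example of the sizing rule above (hypothetical column):
# a 10-character header with cells up to 30 characters wide gives
# max(10 * 10, 30 * 8) + 20 = 260 px, which lies inside [160, 400] and is
# used as-is; narrower columns are padded up to 160 px, wider ones capped
# at 400 px.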


def show_results_tab(df):

    def filter_df(model_name, size_ranges, model_types):
        # Rebuild the full table, then narrow it down by each active filter.
        newdf, modellist, datasetlist = make_results_tab(model_info, results)

        # Name search: skip if the box is empty or still holds its placeholder.
        default_val = 'Input the Model Name'
        if model_name and model_name != default_val:
            # Display names may be wrapped in HTML links; strip the markup
            # before substring matching.
            method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in newdf['Model Name']]
            flag = [model_name.lower() in name for name in method_names]
            newdf = newdf[flag]

        # Parameter-count filter.
        if size_ranges:
            def get_size_in_B(param):
                if param == 'N/A':
                    return None
                try:
                    return float(param.replace('B', ''))
                except (ValueError, AttributeError):
                    return None

            newdf['size_in_B'] = newdf['Parameters'].apply(get_size_in_B)
            mask = pd.Series(False, index=newdf.index)

            for size_range in size_ranges:
                if size_range == '<10B':
                    mask |= (newdf['size_in_B'] < 10) & newdf['size_in_B'].notna()
                elif size_range == '10B-70B':
                    mask |= (newdf['size_in_B'] >= 10) & (newdf['size_in_B'] < 70)
                elif size_range == '>70B':
                    mask |= newdf['size_in_B'] >= 70
                elif size_range == 'Unknown':
                    mask |= newdf['size_in_B'].isna()

            newdf = newdf[mask]
            newdf.drop('size_in_B', axis=1, inplace=True)

        # Model-type filter (API vs. OpenSource).
        if model_types:
            newdf = newdf[newdf['OpenSource'].isin(model_types)]

        return newdf

    with gr.Row():
        with gr.Column():
            model_name = gr.Textbox(
                value='Input the Model Name',
                label='Search Model Name',
                interactive=True,
            )
        with gr.Column():
            size_filter = gr.CheckboxGroup(
                choices=MODEL_SIZE,
                value=MODEL_SIZE,
                label='Model Size',
                interactive=True,
            )
        with gr.Column():
            type_filter = gr.CheckboxGroup(
                choices=MODEL_TYPE,
                value=MODEL_TYPE,
                label='Model Type',
                interactive=True,
            )

    with gr.Column():
        table = gr.DataFrame(
            value=df,
            interactive=False,
            wrap=False,
            column_widths=calculate_column_widths(df),
        )

    # Re-filter the table whenever the search text is submitted or a filter changes.
    model_name.submit(
        fn=filter_df,
        inputs=[model_name, size_filter, type_filter],
        outputs=table,
    )
    size_filter.change(
        fn=filter_df,
        inputs=[model_name, size_filter, type_filter],
        outputs=table,
    )
    type_filter.change(
        fn=filter_df,
        inputs=[model_name, size_filter, type_filter],
        outputs=table,
    )

    with gr.Row():
        with gr.Accordion("Storage of Model Predictions", open=True):
            predictions_button = gr.Textbox(
                value=PREDICTIONS_BUTTON_TEXT,
                label=PREDICTIONS_BUTTON_LABEL,
                elem_id='predictions-button',
                lines=2,
                max_lines=4,
                show_copy_button=True,
            )

    with gr.Row():
        with gr.Accordion("Citation", open=True):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id='citation-button',
                lines=6,
                max_lines=8,
                show_copy_button=True,
            )


# Placeholder table shown when no predictions exist for the selected
# model/dataset pair.
ERROR_DF = {
    "Type": ['NoneType'],
    "Details": ["No predictions found for the combination of the two options above."],
}


def show_predictions_tab(model_list, dataset_list, predictions):

    def get_pre_df(model_name, dataset_name):
        # Fall back to the placeholder table when the pair has no predictions.
        if dataset_name not in predictions or model_name not in predictions[dataset_name]:
            return pd.DataFrame(ERROR_DF)

        this_predictions = predictions[dataset_name][model_name]['predictions']
        # Stringify nested fields so they render cleanly in the table.
        for record in this_predictions:
            record['origin_prompt'] = str(record['origin_prompt'])
            record['gold'] = str(record['gold'])
        return pd.DataFrame(this_predictions)
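
    # Each prediction record is assumed to carry at least the 'origin_prompt'
    # and 'gold' keys handled above, plus the model's output; any further
    # fields in the JSON pass through to the table unchanged.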

    model_list = [i['abbr'] for i in model_list]
    initial_predictions = get_pre_df('MiniMax-Text-01', 'IFEval')

    # Default both dropdowns to the pair used for the initial table above.
    with gr.Row():
        with gr.Column():
            model_drop = gr.Dropdown(
                label="Model Name",
                choices=model_list,
                value='MiniMax-Text-01',
                interactive=True,
            )
        with gr.Column():
            dataset_drop = gr.Dropdown(
                label="Dataset Name",
                choices=dataset_list,
                value='IFEval',
                interactive=True,
            )

    with gr.Column():
        table = gr.DataFrame(
            value=initial_predictions,
            interactive=False,
            wrap=False,
            max_height=1000,
            column_widths=calculate_column_widths(initial_predictions),
        )

    # Reload the table whenever either dropdown changes.
    model_drop.change(
        fn=get_pre_df,
        inputs=[model_drop, dataset_drop],
        outputs=table,
    )
    dataset_drop.change(
        fn=get_pre_df,
        inputs=[model_drop, dataset_drop],
        outputs=table,
    )

    with gr.Row():
        with gr.Accordion("Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id='citation-button',
                lines=6,
                max_lines=8,
                show_copy_button=True,
            )


def create_interface():
    df, model_list, dataset_list = make_results_tab(model_info, results)
    predictions = findfile_predictions()

    # Apply the page title and the wide-screen CSS defined at the top of the file.
    with gr.Blocks(title=INITIAL_TITLE, head=head_style) as demo:
        gr.Markdown(MAIN_LEADERBOARD_DESCRIPTION)
        with gr.Tabs(elem_classes='tab-buttons') as tabs:
            with gr.TabItem('Results', elem_id='main', id=0):
                show_results_tab(df)
            # Wire in the predictions browser so the loaded predictions are used.
            with gr.TabItem('Predictions', elem_id='predictions', id=1):
                show_predictions_tab(model_list, dataset_list, predictions)

    return demo


if __name__ == '__main__':
    demo = create_interface()
    demo.queue()
    demo.launch(server_name='0.0.0.0')