# File header recovered from the Hugging Face file viewer (non-code residue):
# author Jerry0723, commit ba75603 ("update three instruct models"), 8.14 kB.
from typing import List
import gradio as gr
import numpy as np
import pandas as pd
from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", sep='\t') # space separated values
ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", sep='\t') #
ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", sep=',') #
ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", sep=',')
METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
#SPLITS = ["Overall", "Subclass"]
SPLITS = ["Overall", "Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
CLASSIFICATION = {
"model_size": [
">65B",
"~30B",
"10B~20B",
"5B~10B",
"API",
]
}
# _BIBTEX = """ Waiting for paper ... """
_BIBTEX = """
@misc{zhang2024chinesesafechinesebenchmarkevaluating,
title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
year={2024},
eprint={2410.18491},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2410.18491},
}
"""
_LAST_UPDATED = "November 24, 2024"
banner_url = "./assets/logo.png"
_BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>' # noqa
def retrieve_array_from_text(text):
return np.fromstring(text.replace("[", "").replace("]", ""), dtype=float, sep=",")
def format_csv_numbers(text):
return text.split('/')[0]
def format_csv_numbers_second(text):
return text.split()
def format_number(x):
return float(f"{x:.3}")
def get_dataset_csv(
model_size: List[str],
):
df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
df = df.drop(columns="Size")
leaderboard_table = gr.components.Dataframe(
value=df,
interactive=False,
visible=True,
)
return leaderboard_table
def get_dataset_csv_per(
model_size: List[str],
):
df = ORIGINAL_DF_PER[ORIGINAL_DF_PER['Size'].isin(model_size)]
df = df.drop(columns="Size")
leaderboard_table = gr.components.Dataframe(
value=df,
interactive=False,
visible=True,
)
return leaderboard_table
# this is a sub function for csv table
def get_dataset_csv_sub_gen(
model_size: List[str],
subclass_choice: List[str],
):
df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
df = df.drop(columns="Size")
# get subclass
subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
df = df[subclass_choice_label]
leaderboard_table = gr.components.Dataframe(
value=df,
interactive=False,
visible=True,
)
return leaderboard_table
# this is a sub function for csv table
def get_dataset_csv_sub_per(
model_size: List[str],
subclass_choice: List[str],
):
df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
df = df.drop(columns="Size")
# get subclass
subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
df = df[subclass_choice_label]
leaderboard_table = gr.components.Dataframe(
value=df,
interactive=False,
visible=True,
)
return leaderboard_table
def get_dataset_classfier_gen(
model_size: List[str],
main_choice: List[str],
):
if main_choice == "Overall":
leaderboard_table = get_dataset_csv(model_size)
elif main_choice != "Subclass":
subclass_choice = main_choice
leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
return leaderboard_table
def get_dataset_classfier_per(
model_size: List[str],
main_choice: List[str],
):
if main_choice == "Overall":
leaderboard_table = get_dataset_csv_per(model_size)
elif main_choice != "Overall":
subclass_choice = main_choice
leaderboard_table = get_dataset_csv_sub_per(model_size, subclass_choice)
return leaderboard_table
with gr.Blocks() as demo:
gr.Markdown("<center><h1>ChineseSafe Leaderboard</h1></center>", elem_classes="markdown-text")
with gr.Row():
#gr.Image(banner_url, height=160, scale=1) # πŸ‘‰ this part is for image
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
# gr.Textbox(_INTRODUCTION_TEXT, scale=5)
with gr.Row():
gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
with gr.Row():
gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
with gr.Row():
with gr.Column(scale=0.8):
main_choice = gr.Dropdown(
choices=SPLITS,
value="Overall",
label="Type",
info="Please choose the type to display.",
)
with gr.Column(scale=10):
model_choice = gr.CheckboxGroup(
choices=CLASSIFICATION["model_size"],
value=CLASSIFICATION["model_size"], # all be choosed
label="Model Size",
info="Please choose the model size to display.",
)
#πŸ‘‰ this part is for csv table generatived
with gr.Tabs(elem_classes="tab-buttons") as tabs:
# ----------------- modify text -----------------
with gr.TabItem("πŸ… Generation", elem_id="od-benchmark-tab-table", id=6):
dataframe_all_gen = gr.components.Dataframe(
elem_id="leaderboard-table",
)
with gr.TabItem("πŸ… Perplexity", elem_id="od-benchmark-tab-table", id=5):
dataframe_all_per = gr.components.Dataframe(
elem_id="leaderboard-table",
)
# ----------------- modify text -----------------
with gr.Row():
gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
with gr.Row():
gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
# πŸ‘‰ this part is for citation
with gr.Row():
with gr.Accordion("πŸ“™ Citation", open=True):
gr.Textbox(
value=_BIBTEX,
lines=7,
label="Copy the BibTeX snippet to cite this source",
elem_id="citation-button",
show_copy_button=True
)
gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
# --------------------------- all --------------------------------
# this is all result Perplexity
main_choice.change(
get_dataset_classfier_per,
inputs=[model_choice, main_choice],
outputs=dataframe_all_per,
)
model_choice.change(
get_dataset_classfier_per,
inputs=[model_choice, main_choice],
outputs=dataframe_all_per,
)
demo.load(
fn=get_dataset_classfier_per,
inputs=[model_choice, main_choice],
outputs=dataframe_all_per,
)
# this is all result generatived
main_choice.change(
get_dataset_classfier_gen,
inputs=[model_choice, main_choice],
outputs=dataframe_all_gen,
)
model_choice.change(
get_dataset_classfier_gen,
inputs=[model_choice, main_choice],
outputs=dataframe_all_gen,
)
demo.load(
fn=get_dataset_classfier_gen,
inputs=[model_choice, main_choice],
outputs=dataframe_all_gen,
)
demo.launch(share=True)