Spaces:
Running
Running
Commit
·
17b190b
1
Parent(s):
7e588e5
Add application file
Browse files
app.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from gradio_leaderboard import Leaderboard
|
3 |
+
from pathlib import Path
|
4 |
+
import pandas as pd
|
5 |
+
from collections import defaultdict
|
6 |
+
|
7 |
+
|
8 |
+
# Directory containing this file; data files such as results.jsonl are
# resolved relative to it rather than to the current working directory.
abs_path = Path(__file__).parent
|
9 |
+
|
10 |
+
|
11 |
+
def overall_leaderboard(df: pd.DataFrame, sort_column: str = "f1_macro"):
    """Build the overall ranking table from the aggregated rows.

    Only rows whose ``language`` equals ``"all"`` are kept. The result holds
    the model, temperature and the three metric columns, ordered by
    ``sort_column`` (best first) with a 1-based ``Rank`` column in front.

    Args:
        df: Results table with ``language``, ``model``, ``temperature``,
            ``f1_macro``, ``weighted_f1`` and ``accuracy`` columns.
        sort_column: Metric column used for ordering, descending.

    Returns:
        A new DataFrame with temperature rounded to 1 decimal and the metric
        columns rounded to 4 decimals.
    """
    metric_columns = ["f1_macro", "weighted_f1", "accuracy"]

    # Select the aggregated rows and the displayed columns in one step.
    aggregated = df.loc[
        df["language"] == "all",
        ["model", "temperature", *metric_columns],
    ]

    ranked = aggregated.sort_values(by=sort_column, ascending=False)
    ranked.insert(0, "Rank", range(1, len(ranked) + 1))

    # Rounding is applied after ranking so ordering uses full precision.
    ranked["temperature"] = ranked["temperature"].round(1)
    ranked[metric_columns] = ranked[metric_columns].round(4)

    return ranked
|
23 |
+
|
24 |
+
|
25 |
+
def build_lang_dict(df: pd.DataFrame):
    """Group metric values per model and temperature, keyed by language.

    Args:
        df: Results table with ``model``, ``temperature``, ``language``,
            ``f1_macro``, ``weighted_f1`` and ``accuracy`` columns.

    Returns:
        A nested ``defaultdict``: ``result[model][temperature]`` maps
        ``"<language>.<metric>"`` to the metric value of that row.
    """
    metric_names = ("f1_macro", "weighted_f1", "accuracy")
    per_model = defaultdict(lambda: defaultdict(dict))

    for record in df.itertuples():
        bucket = per_model[record.model][record.temperature]
        bucket.update(
            {
                f"{record.language}.{name}": getattr(record, name)
                for name in metric_names
            }
        )

    return per_model
|
32 |
+
|
33 |
+
|
34 |
+
def build_ds_dict(df: pd.DataFrame):
    """Group metric values per model and temperature, keyed by dataset.

    Args:
        df: Results table with ``model``, ``temperature``, ``dataset``,
            ``f1_macro``, ``weighted_f1`` and ``accuracy`` columns.

    Returns:
        A nested ``defaultdict``: ``result[model][temperature]`` maps
        ``"<dataset>.<metric>"`` to the metric value of that row.
    """
    metric_names = ("f1_macro", "weighted_f1", "accuracy")
    per_model = defaultdict(lambda: defaultdict(dict))

    for record in df.itertuples():
        bucket = per_model[record.model][record.temperature]
        bucket.update(
            {
                f"{record.dataset}.{name}": getattr(record, name)
                for name in metric_names
            }
        )

    return per_model
|
41 |
+
|
42 |
+
|
43 |
+
def build_emo_dict(df: pd.DataFrame):
    """Collect per-emotion F1 scores for each model and temperature.

    Only rows whose ``language`` equals ``"all"`` are used. Each row's
    ``metrics_per_label`` mapping is read label by label; the summary
    entries ``"accuracy"``, ``"macro avg"`` and ``"weighted avg"`` are
    skipped. The row's ``f1_macro`` is stored under the key ``"all"``.

    Labels are taken from every row individually rather than from the first
    row only, so an empty selection yields an empty mapping and rows with
    differing label sets no longer raise.

    Args:
        df: Results table with ``language``, ``model``, ``temperature``,
            ``f1_macro`` and ``metrics_per_label`` columns.

    Returns:
        A nested ``defaultdict``: ``result[model][temperature]`` maps each
        emotion label to its ``"f1-score"`` (``None`` when absent) plus
        ``"all"`` to the row's macro F1.
    """
    summary_keys = {"accuracy", "macro avg", "weighted avg"}
    emo_data = defaultdict(lambda: defaultdict(dict))

    aggregated = df[df["language"] == "all"]
    for row in aggregated.itertuples():
        scores = emo_data[row.model][row.temperature]
        for label, stats in row.metrics_per_label.items():
            if label in summary_keys:
                continue
            scores[label] = stats.get("f1-score")
        scores["all"] = row.f1_macro

    return emo_data
|
52 |
+
|
53 |
+
|
54 |
+
def leaderboard_per_group(lang_dict, metric: str = "f1_macro"):
    """Turn a nested per-model mapping into a ranked, wide leaderboard table.

    ``lang_dict[model][temperature]`` maps keys to scores. A key of the form
    ``"<group>.<metric>"`` contributes a ``<group>`` column when its metric
    part equals ``metric``; a key without a dot is used as a column name
    unchanged. The key is split on its last dot and the metric is compared
    exactly, so group names that contain dots or that happen to contain a
    metric name are kept intact.

    Args:
        lang_dict: Mapping ``model -> temperature -> {key: score}``.
        metric: Metric whose values populate the group columns.

    Returns:
        A DataFrame with columns ``Rank``, ``model``, ``temperature``,
        ``all`` and the remaining group columns in sorted order, ordered by
        ``all`` descending. Temperature is rounded to 1 decimal and scores
        to 4 decimals. Empty input gives an empty table with the four fixed
        columns.

    Raises:
        KeyError: If entries exist but none provides an ``"all"`` score.
    """
    rows = []
    for model, by_temperature in lang_dict.items():
        for temperature, metrics in by_temperature.items():
            entry = {"model": model, "temperature": temperature}
            for key, value in metrics.items():
                # Split on the LAST dot so dotted group names survive.
                group, sep, key_metric = key.rpartition(".")
                if not sep:
                    entry[key] = value
                elif key_metric == metric:
                    entry[group] = value
            rows.append(entry)

    if not rows:
        # Nothing to rank; keep the fixed columns so callers get a stable shape.
        return pd.DataFrame(columns=["Rank", "model", "temperature", "all"])

    table = pd.DataFrame(rows)
    table["temperature"] = table["temperature"].round(1)

    for col in table.columns.difference(["model", "temperature"]):
        table[col] = table[col].round(4)

    group_columns = sorted(table.columns.difference(["model", "temperature", "all"]))
    table = table[["model", "temperature", "all"] + group_columns]
    table = table.sort_values(by="all", ascending=False)
    table.insert(0, "Rank", range(1, len(table) + 1))

    return table
|
78 |
+
|
79 |
+
|
80 |
+
def app():
    """Build the Gradio Blocks UI for the leaderboard viewer.

    The UI has four tabs (overall, per language, per dataset, per emotion),
    each holding one table, plus a state component that keeps the raw
    results frame. All tables are filled when the page loads.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks() as demo:
        gr.Markdown("# 🏆 Leaderboard Viewer")

        with gr.Tabs():
            with gr.Tab("Overall Results"):
                overall_table = gr.Dataframe()

            with gr.Tab("Results per Language"):
                lang_table = gr.Dataframe()

            with gr.Tab("Results per Dataset"):
                dataset_table = gr.Dataframe()

            with gr.Tab("Results per Emotion"):
                emotion_table = gr.Dataframe()

        df_state = gr.State()

        def update_leaderboards(select_lang_metric="f1_macro", select_ds_metric="f1_macro"):
            """Load results.jsonl and compute every leaderboard table.

            Returns exactly one value per component listed in ``outputs``
            below, in the same order, ending with the raw results frame
            for ``df_state``.
            """
            df = pd.read_json(str(abs_path / "results.jsonl"), lines=True)
            lang_dict = build_lang_dict(df)
            ds_dict = build_ds_dict(df)
            emo_dict = build_emo_dict(df)
            overall = overall_leaderboard(df)
            by_lang = leaderboard_per_group(lang_dict, metric=select_lang_metric)
            by_dataset = leaderboard_per_group(ds_dict, metric=select_ds_metric)
            by_emotion = leaderboard_per_group(emo_dict)
            # The return arity must match the five outputs wired below; an
            # extra status string here would land in the df_state slot.
            return overall, by_lang, by_dataset, by_emotion, df

        demo.load(
            update_leaderboards,
            inputs=[],
            outputs=[overall_table, lang_table, dataset_table, emotion_table, df_state],
        )

    return demo
|
117 |
+
|
118 |
+
if __name__ == "__main__":
    # Build the UI and start serving it only when run as a script.
    demo = app()
    demo.launch()
|