cameo-leaderboard / src /process_data.py
iwonachristop's picture
Update descriptions
8b5f20b
import pandas as pd
from collections import defaultdict
def overall_leaderboard(df: pd.DataFrame, sort_column: str = "f1_macro"):
    """Build the overall leaderboard table from the ``language == "All"`` rows.

    Only the ``model``, ``temperature``, ``f1_macro``, ``weighted_f1`` and
    ``accuracy`` columns are kept. Rows are ordered by ``sort_column`` from
    highest to lowest; afterwards ``temperature`` is rounded to 1 decimal and
    the three score columns to 4 decimals.

    Args:
        df: Results table containing at least ``language`` plus the five
            columns listed above.
        sort_column: One of the kept columns to rank by (descending).

    Returns:
        A new DataFrame; the original row index labels are preserved.
    """
    kept_columns = ["model", "temperature", "f1_macro", "weighted_f1", "accuracy"]
    decimals = {"temperature": 1, "f1_macro": 4, "weighted_f1": 4, "accuracy": 4}

    is_all_row = df["language"] == "All"
    board = df.loc[is_all_row, kept_columns]
    board = board.sort_values(by=sort_column, ascending=False)

    # Rounding happens after sorting so the ranking uses full-precision scores.
    for column, digits in decimals.items():
        board[column] = board[column].round(digits)
    return board
def build_lang_dict(df: pd.DataFrame):
    """Index per-language scores by model and temperature.

    Every row contributes three entries keyed ``"<language>.<metric>"`` for
    the metrics ``f1_macro``, ``weighted_f1`` and ``accuracy``.

    Args:
        df: Table with ``model``, ``temperature``, ``language`` and the three
            metric columns.

    Returns:
        Nested defaultdict shaped ``{model: {temperature: {key: value}}}``.
    """
    metric_names = ("f1_macro", "weighted_f1", "accuracy")
    nested = defaultdict(lambda: defaultdict(dict))
    for record in df.itertuples():
        nested[record.model][record.temperature].update(
            {
                f"{record.language}.{name}": getattr(record, name)
                for name in metric_names
            }
        )
    return nested
def build_ds_dict(df: pd.DataFrame):
    """Index per-dataset scores by model and temperature.

    Every row contributes three entries keyed ``"<dataset>.<metric>"`` for
    the metrics ``f1_macro``, ``weighted_f1`` and ``accuracy``.

    Args:
        df: Table with ``model``, ``temperature``, ``dataset`` and the three
            metric columns.

    Returns:
        Nested defaultdict shaped ``{model: {temperature: {key: value}}}``.
    """
    tracked = ("f1_macro", "weighted_f1", "accuracy")
    by_model = defaultdict(lambda: defaultdict(dict))
    for record in df.itertuples():
        # Resolve the innermost dict once per row, then fill it in.
        bucket = by_model[record.model][record.temperature]
        prefix = record.dataset
        for name in tracked:
            bucket[f"{prefix}.{name}"] = getattr(record, name)
    return by_model
def build_emo_dict(df: pd.DataFrame):
    """Index per-emotion F1 scores by model and temperature.

    Only rows with ``language == "All"`` are used. The emotion labels are
    taken from the first such row's ``metrics_per_label`` mapping, minus the
    summary entries ``"accuracy"``, ``"macro avg"`` and ``"weighted avg"``.
    For every row the ``"f1-score"`` of each label is stored under the label
    name, and the row's ``f1_macro`` is stored under ``"All"``.

    Args:
        df: Table with ``model``, ``temperature``, ``language``, ``f1_macro``
            and ``metrics_per_label`` (a mapping ``label -> {"f1-score": ...}``;
            presumably a scikit-learn style classification report — confirm
            against the data loader).

    Returns:
        Nested defaultdict shaped ``{model: {temperature: {label: value}}}``.
        It is empty when ``df`` has no ``language == "All"`` rows.

    Raises:
        KeyError: If a later row lacks a label that the first row has.
    """
    df = df[df["language"] == "All"]
    emo_data = defaultdict(lambda: defaultdict(dict))
    if df.empty:
        # Nothing to read labels from; ``iloc[0]`` would raise IndexError.
        return emo_data

    summary_keys = {"accuracy", "macro avg", "weighted avg"}
    first_report = df["metrics_per_label"].iloc[0]
    # Sorted so the key order is deterministic instead of following set
    # iteration order, which varies between interpreter runs.
    emotions = sorted(first_report.keys() - summary_keys)

    for row in df.itertuples():
        scores = emo_data[row.model][row.temperature]
        for emotion in emotions:
            scores[emotion] = row.metrics_per_label[emotion].get("f1-score")
        scores["All"] = row.f1_macro
    return emo_data
def leaderboard_per_group(lang_dict, use_cols, metric: str = "f1_macro"):
    """Flatten a nested score dict into a leaderboard with one column per group.

    ``lang_dict`` is shaped ``{model: {temperature: {key: value}}}``. Keys of
    the form ``"<group>.<metric>"`` contribute a ``<group>`` column when their
    suffix equals ``metric`` exactly; keys without a dot are copied through
    under their own name. The split is made on the *last* dot, so group names
    may themselves contain dots.

    Args:
        lang_dict: Nested mapping as described above.
        use_cols: Group columns to keep; they are emitted in sorted order
            after ``model`` and ``temperature``.
        metric: Metric suffix to select from dotted keys.

    Returns:
        DataFrame with ``temperature`` rounded to 1 decimal and score columns
        to 4 decimals, sorted by ``"All"`` (descending) when that column is
        requested. An empty ``lang_dict`` yields an empty frame with the
        expected columns.

    Raises:
        KeyError: If a name in ``use_cols`` is not produced by any entry.
    """
    rows = []
    for model, inner in lang_dict.items():
        for temperature, metrics in inner.items():
            entry = {"model": model, "temperature": temperature}
            for k, v in metrics.items():
                group, sep, suffix = k.rpartition(".")
                if not sep:
                    # Un-namespaced key (e.g. an emotion label or "All").
                    entry[k] = v
                elif suffix == metric:
                    # Exact suffix match: a substring test would also pick up
                    # other metrics whenever the group name contains `metric`.
                    entry[group] = v
            rows.append(entry)

    ordered_cols = ["model", "temperature"] + sorted(use_cols)
    if not rows:
        # pd.DataFrame([]) has no columns, so the lookups below would fail.
        return pd.DataFrame(columns=ordered_cols)

    df = pd.DataFrame(rows)
    df["temperature"] = df["temperature"].round(1)
    for col in df.columns.difference(["model", "temperature"]):
        df[col] = df[col].round(4)
    df = df[ordered_cols]
    if "All" in use_cols:
        df = df.sort_values(by="All", ascending=False)
    return df