Fix elo ratings model links

- app.py +1 -1
- elo_utils.py +59 -11
app.py CHANGED
@@ -205,7 +205,7 @@ def get_leaderboard_df():
 def get_evaluation_queue_df():
     if repo:
         print("Pulling changes for the evaluation queue.")
-
+        repo.git_pull()
 
     entries = [
         entry
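Note: `repo` here is presumably a `huggingface_hub.Repository` handle on a local clone of the queue repo; without the `repo.git_pull()` call, `get_evaluation_queue_df` read a stale clone. A minimal sketch of that pattern, with placeholder directory and repo names that are not taken from this commit:

    from huggingface_hub import Repository

    # Placeholder names for illustration only.
    repo = Repository(
        local_dir="eval_queue",
        clone_from="H4/eval_queue",
        repo_type="dataset",
    )
    repo.git_pull()  # sync the local clone before reading queue entries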
elo_utils.py CHANGED
@@ -8,10 +8,37 @@ from datasets import load_dataset
 
 from content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE
 from utils import make_clickable_model
-from visualizations import (
-    ...
-    ...
-)
+from visualizations import (
+    get_bootstrap_result,
+    switch_model_a_b,
+    visualize_battle_count,
+    visualize_bootstrap_scores,
+    visualize_pairwise_win_fraction,
+    visualize_rating_count,
+)
+
+
+KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
+VICUNA_LINK = "https://huggingface.co/HuggingFaceH4/stable-vicuna-13b-2904"
+OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
+DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
+MODEL_PAGE = "https://huggingface.co/models"
+
+
+def make_clickable_model_elo(model_name):
+    link = ""
+    if model_name == "dolly-12b":
+        link = DOLLY_LINK
+    elif model_name == "vicuna-13b":
+        link = VICUNA_LINK
+    elif model_name == "koala-13b":
+        link = KOALA_LINK
+    elif model_name == "oasst-12b":
+        link = OASST_LINK
+    else:
+        link = MODEL_PAGE
+
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
 @dataclass
@@ -26,7 +53,7 @@ class EloEvalResult:
     def to_dict(self):
        base_model = f"{self.model}"
        data_dict = {}
-        data_dict["Model"] = make_clickable_model(base_model)
+        data_dict["Model"] = make_clickable_model_elo(base_model)
        data_dict["GPT-4 (all)"] = self.gpt_4_all
        data_dict["Human (all)"] = self.human_all
        data_dict["Human (instruct)"] = self.human_instruct
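With `to_dict` now routing through `make_clickable_model_elo`, each known Elo model name renders as a dotted-underline link to its Hub repo, and unrecognized names fall back to the generic `MODEL_PAGE`. A quick usage sketch, assuming `elo_utils` is importable and with an illustrative DataFrame:

    import pandas as pd

    from elo_utils import make_clickable_model_elo

    df = pd.DataFrame({"Model": ["koala-13b", "dolly-12b", "some-new-model"]})
    df["Model"] = df["Model"].apply(make_clickable_model_elo)
    # "some-new-model" gets the MODEL_PAGE fallback link; the anchors only
    # render as links in a component that accepts raw HTML.
    print(df.loc[2, "Model"])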
@@ -61,7 +88,13 @@ def create_eval_df(df, tie_allowed):
         }
 
         if tie_allowed:
-            response["win"] = "model_a" if response["rating"] < 4 else "model_b" if response["rating"] > 5 else "tie"
+            response["win"] = (
+                "model_a"
+                if response["rating"] < 4
+                else "model_b"
+                if response["rating"] > 5
+                else "tie"
+            )
         else:
             response["win"] = "model_a" if response["rating"] < 5 else "model_b"
 
@@ -84,7 +117,13 @@ def create_eval_df_for_gpt(df, tie_allowed):
         }
 
         if tie_allowed:
-            response["win"] = "model_a" if response["rating"] < 4 else "model_b" if response["rating"] > 5 else "tie"
+            response["win"] = (
+                "model_a"
+                if response["rating"] < 4
+                else "model_b"
+                if response["rating"] > 5
+                else "tie"
+            )
         else:
             response["win"] = "model_a" if response["rating"] < 5 else "model_b"
 
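The two hunks above apply the same rating-to-winner rule in `create_eval_df` and `create_eval_df_for_gpt`: with ties allowed, a rating below 4 is a win for model_a, above 5 a win for model_b, and anything in between a tie; without ties, 5 is the single cutoff. Restated as a standalone helper (the function name is illustrative, not part of the module):

    def rating_to_win(rating, tie_allowed):
        # Same branch logic as the diff above.
        if tie_allowed:
            return "model_a" if rating < 4 else "model_b" if rating > 5 else "tie"
        return "model_a" if rating < 5 else "model_b"

    assert rating_to_win(2, tie_allowed=True) == "model_a"
    assert rating_to_win(4, tie_allowed=True) == "tie"
    assert rating_to_win(7, tie_allowed=True) == "model_b"
    assert rating_to_win(5, tie_allowed=False) == "model_b"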
@@ -124,13 +163,20 @@ def get_elo_results(df_instruct, df_code_instruct, tie_allowed):
     df_all = pd.concat([df_instruct, df_code_instruct])
 
     df_gpt_4 = load_dataset(
-        "gpt_4_evals/data/", ...
+        "gpt_4_evals/data/",
+        split="train",
+        revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846",
     ).to_pandas()
 
     dfs = [df_instruct, df_code_instruct, df_all]
-    elo_ratings = [convert_rating_from_float_to_int(create_eval_df(df, tie_allowed=tie_allowed)) for df in dfs]
+    elo_ratings = [
+        convert_rating_from_float_to_int(create_eval_df(df, tie_allowed=tie_allowed))
+        for df in dfs
+    ]
 
-    gpt_4_elo_ratings = convert_rating_from_float_to_int(create_eval_df_for_gpt(df_gpt_4, tie_allowed=tie_allowed))
+    gpt_4_elo_ratings = convert_rating_from_float_to_int(
+        create_eval_df_for_gpt(df_gpt_4, tie_allowed=tie_allowed)
+    )
     elo_ratings.append(gpt_4_elo_ratings)
 
     results = [
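Pinning `load_dataset` to an explicit `revision` (a commit sha here, though a branch or tag also works) keeps the GPT-4 eval results reproducible even if the dataset repo gains new commits. The call in isolation, as it appears in the new code:

    from datasets import load_dataset

    # Load a fixed snapshot of the GPT-4 evals and convert to pandas.
    df_gpt_4 = load_dataset(
        "gpt_4_evals/data/",
        split="train",
        revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846",
    ).to_pandas()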
@@ -166,7 +212,9 @@ def get_elo_plots(df_instruct, df_code_instruct, tie_allowed):
 
     BOOTSTRAP_ROUNDS = 1000
     if "bootstrap_elo_lu" not in globals():
-        bootstrap_elo_lu = get_bootstrap_result(game_switch, compute_elo, BOOTSTRAP_ROUNDS)
+        bootstrap_elo_lu = get_bootstrap_result(
+            game_switch, compute_elo, BOOTSTRAP_ROUNDS
+        )
 
     plot_3 = visualize_bootstrap_scores(bootstrap_elo_lu, PLOT_3_TITLE)
 
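`get_bootstrap_result` (now imported explicitly in the first hunk) resamples the battles and recomputes Elo once per round, so `bootstrap_elo_lu` holds a distribution of ratings per model; the `globals()` guard just avoids repeating the 1000-round loop on every re-render. A rough sketch of that kind of bootstrap, assuming `compute_elo_fn` maps a battles DataFrame to per-model ratings (names here are illustrative, not the `visualizations` API):

    import pandas as pd

    def bootstrap_elo(battles, compute_elo_fn, rounds=1000):
        # Resample battles with replacement and recompute Elo each round;
        # the spread of each model's column estimates rating uncertainty.
        samples = [
            compute_elo_fn(battles.sample(frac=1.0, replace=True))
            for _ in range(rounds)
        ]
        return pd.DataFrame(samples)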