Commit b323764 (parent: 217b585) · committed by Clémentine
Added icons for types + fixed pending queue
Files changed:
- app.py (+9 -10)
- src/assets/hardcoded_evals.py (+3 -0)
- src/assets/text_content.py (+7 -0)
- src/auto_leaderboard/load_results.py (+5 -1)
- src/auto_leaderboard/model_metadata_type.py (+19 -16)
- src/utils_display.py (+2 -2)
app.py  CHANGED
@@ -99,7 +99,6 @@ def get_leaderboard_df():
 
 
 def get_evaluation_queue_df():
-    # todo @saylortwift: replace the repo by the one you created for the eval queue
     if eval_queue:
         print("Pulling changes for the evaluation queue.")
         eval_queue.git_pull()
@@ -141,7 +140,7 @@ def get_evaluation_queue_df():
         data["model"] = make_clickable_model(data["model"])
         all_evals.append(data)
 
-    pending_list = [e for e in all_evals if e["status"] == "PENDING"]
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
    df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
@@ -388,6 +387,14 @@ with demo:
                 private = gr.Checkbox(
                     False, label="Private", visible=not IS_PUBLIC
                 )
+                model_type = gr.Dropdown(
+                    choices=["pretrained", "fine-tuned", "with RL"],
+                    label="Model type",
+                    multiselect=False,
+                    value="pretrained",
+                    max_choices=1,
+                    interactive=True,
+                )
 
             with gr.Column():
                 precision = gr.Dropdown(
@@ -398,14 +405,6 @@ with demo:
                     max_choices=1,
                     interactive=True,
                 )
-                model_type = gr.Dropdown(
-                    choices=["pretrained", "fine-tuned", "with RL"],
-                    label="Model type",
-                    multiselect=False,
-                    value="pretrained",
-                    max_choices=1,
-                    interactive=True,
-                )
                 weight_type = gr.Dropdown(
                     choices=["Original", "Delta", "Adapter"],
                     label="Weights type",
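The queue change above is the "fixed pending queue" half of the commit: requests whose status is RERUN previously matched none of the three buckets and silently disappeared from the UI. A minimal sketch of the bucketing behavior after the fix; the entry shape follows the diff, but the sample rows and the exact FINISHED status string are illustrative, not taken from the repo:

# Sketch of the status bucketing after the fix (sample data is hypothetical).
import pandas as pd

EVAL_COLS = ["model", "status"]

all_evals = [
    {"model": "org/model-a", "status": "PENDING"},
    {"model": "org/model-b", "status": "RERUN"},    # used to fall through every bucket
    {"model": "org/model-c", "status": "RUNNING"},
    {"model": "org/model-d", "status": "FINISHED"},
]

pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]

df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
print(df_pending)  # the RERUN row now shows up alongside PENDING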
src/assets/hardcoded_evals.py  CHANGED
@@ -10,6 +10,7 @@ gpt4_values = {
     AutoEvalColumn.mmlu.name: 86.4,
     AutoEvalColumn.truthfulqa.name: 59.0,
     AutoEvalColumn.dummy.name: "GPT-4",
+    AutoEvalColumn.model_type.name: "",
 }
 
 gpt35_values = {
@@ -22,6 +23,7 @@ gpt35_values = {
     AutoEvalColumn.mmlu.name: 70.0,
     AutoEvalColumn.truthfulqa.name: 47.0,
     AutoEvalColumn.dummy.name: "GPT-3.5",
+    AutoEvalColumn.model_type.name: "",
 }
 
 baseline = {
@@ -34,5 +36,6 @@ baseline = {
     AutoEvalColumn.mmlu.name: 25.0,
     AutoEvalColumn.truthfulqa.name: 25.0,
     AutoEvalColumn.dummy.name: "baseline",
+    AutoEvalColumn.model_type.name: "",
 }
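The hardcoded GPT-4 / GPT-3.5 / baseline rows gain an empty model_type entry so that every record feeding the leaderboard carries the same keys as the new "Type" column. A toy illustration of why; the column names here are stand-ins for the real AutoEvalColumn.*.name values:

# Illustrative only: rows missing a key for a declared column render as NaN.
import pandas as pd

with_key = pd.DataFrame.from_records(
    [{"Model": "GPT-4", "Type": ""}], columns=["Model", "Type"]
)
without_key = pd.DataFrame.from_records(
    [{"Model": "GPT-4"}], columns=["Model", "Type"]
)
print(with_key)     # Type shows as an empty string
print(without_key)  # Type shows as NaN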
src/assets/text_content.py  CHANGED
@@ -128,6 +128,13 @@ To get more information about quantization, see:
 - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
 - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
 
+### Icons
+🟢 means that the model is pretrained,
+🔶 that it is finetuned,
+🟦 that it was trained with RL.
+If there is no icon, we have not uploaded the model's information yet; feel free to open an issue with the model information!
+
+
 # In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
src/auto_leaderboard/load_results.py  CHANGED
@@ -26,6 +26,8 @@ class EvalResult:
     revision: str
     results: dict
     precision: str = "16bit"
+    model_type: str = ""
+    weight_type: str = ""
 
     def to_dict(self):
         if self.org is not None:
@@ -35,7 +37,9 @@ class EvalResult:
         data_dict = {}
 
         data_dict["eval_name"] = self.eval_name  # not a column, just a save name
+        data_dict["weight_type"] = self.weight_type  # not a column, just a save name
         data_dict[AutoEvalColumn.precision.name] = self.precision
+        data_dict[AutoEvalColumn.model_type.name] = self.model_type
         data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
         data_dict[AutoEvalColumn.dummy.name] = base_model
         data_dict[AutoEvalColumn.revision.name] = self.revision
@@ -92,7 +96,7 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
             continue
         mean_acc = round(np.mean(accs) * 100.0, 1)
         eval_results.append(EvalResult(
-            result_key, org, model, model_sha, {benchmark: mean_acc}
+            eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc}, #todo model_type=, weight_type=
         ))
 
     return result_key, eval_results
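With two new defaulted fields on EvalResult, the constructor call in parse_eval_result switches from positional to keyword arguments, which keeps the call unambiguous as the dataclass grows. A self-contained sketch of the shape after this commit; only the fields visible in the diff are reproduced, and the real class may carry more:

from dataclasses import dataclass

@dataclass
class EvalResult:
    eval_name: str
    org: str
    model: str
    revision: str
    results: dict
    precision: str = "16bit"
    model_type: str = ""   # new: to be filled once requests are connected with results
    weight_type: str = ""  # new: e.g. "Original" / "Delta" / "Adapter"

res = EvalResult(
    eval_name="org__model", org="org", model="model",
    revision="abc123", results={"ARC": 45.0},
)
print(res.model_type, res.weight_type)  # both default to ""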
src/auto_leaderboard/model_metadata_type.py  CHANGED
@@ -2,6 +2,8 @@ from dataclasses import dataclass
 from enum import Enum
 from typing import Dict, List
 
+from ..utils_display import AutoEvalColumn
+
 @dataclass
 class ModelInfo:
     name: str
@@ -167,23 +169,24 @@ TYPE_METADATA: Dict[str, ModelType] = {
 
 def get_model_type(leaderboard_data: List[dict]):
     for model_data in leaderboard_data:
-        # …
-        model_data["…
-        model_data["Type"] = ""
-
+        # Todo @clefourrier once requests are connected with results
+        is_delta = False # (model_data["weight_type"] != "Original")
         # Stored information
         if model_data["model_name_for_query"] in TYPE_METADATA:
-            model_data[…
-            model_data[…
-            …
+            model_data[AutoEvalColumn.model_type.name] = TYPE_METADATA[model_data["model_name_for_query"]].value.name
+            model_data[AutoEvalColumn.model_type_symbol.name] = TYPE_METADATA[model_data["model_name_for_query"]].value.symbol + ("🔺" if is_delta else "")
+        # Inferred from the name or the selected type
+        elif model_data[AutoEvalColumn.model_type.name] == "pretrained" or any([i in model_data["model_name_for_query"] for i in ["pretrained"]]):
+            model_data[AutoEvalColumn.model_type.name] = ModelType.PT.value.name
+            model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.PT.value.symbol + ("🔺" if is_delta else "")
+        elif model_data[AutoEvalColumn.model_type.name] == "finetuned" or any([i in model_data["model_name_for_query"] for i in ["finetuned", "-ft-"]]):
+            model_data[AutoEvalColumn.model_type.name] = ModelType.SFT.value.name
+            model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.SFT.value.symbol + ("🔺" if is_delta else "")
+        elif model_data[AutoEvalColumn.model_type.name] == "with RL" or any([i in model_data["model_name_for_query"] for i in ["-rl-", "-rlhf-"]]):
+            model_data[AutoEvalColumn.model_type.name] = ModelType.RL.value.name
+            model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.RL.value.symbol + ("🔺" if is_delta else "")
+        else:
+            model_data[AutoEvalColumn.model_type.name] = "N/A"
+            model_data[AutoEvalColumn.model_type_symbol.name] = ("🔺" if is_delta else "")
 
 
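For readers without the full file: ModelType and TYPE_METADATA live in the unchanged part of this module. The sketch below shows plausible definitions consistent with the .value.name / .value.symbol accesses above and with the icon legend added in text_content.py; the exact member names and symbols in the repo may differ:

from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)
class ModelInfo:
    name: str
    symbol: str  # emoji rendered in the leaderboard's "T" column

class ModelType(Enum):
    PT = ModelInfo(name="pretrained", symbol="🟢")
    SFT = ModelInfo(name="finetuned", symbol="🔶")
    RL = ModelInfo(name="with RL", symbol="🟦")

# TYPE_METADATA maps model names to one of these members, e.g.:
TYPE_METADATA = {"example-org/example-model": ModelType.SFT}

# A delta-weights finetune would then render as "🔶🔺":
print(TYPE_METADATA["example-org/example-model"].value.symbol + "🔺")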
src/utils_display.py  CHANGED
@@ -14,14 +14,14 @@ def fields(raw_class):
 
 @dataclass(frozen=True)
 class AutoEvalColumn: # Auto evals column
-    model_type_symbol = ColumnContent("…
+    model_type_symbol = ColumnContent("T", "str", True)
     model = ColumnContent("Model", "markdown", True)
     average = ColumnContent("Average ⬆️", "number", True)
     arc = ColumnContent("ARC", "number", True)
     hellaswag = ColumnContent("HellaSwag", "number", True)
     mmlu = ColumnContent("MMLU", "number", True)
     truthfulqa = ColumnContent("TruthfulQA (MC) ⬆️", "number", True)
-    model_type = ColumnContent("Type…
+    model_type = ColumnContent("Type", "str", False)
     precision = ColumnContent("Precision", "str", False, True)
     license = ColumnContent("Hub License", "str", False)
     params = ColumnContent("#Params (B)", "number", False)
|