Fix TruthQA typo
app.py
CHANGED
@@ -43,11 +43,11 @@ def load_results(model, benchmark, metric):
     with open(file_path) as fp:
         data = json.load(fp)
     accs = np.array([v[metric] for k, v in data["results"].items()])
-    mean_acc = np.mean(accs)
+    mean_acc = np.mean(accs)
     return mean_acc, data["config"]["model_args"]
 
 
-COLS = ["Model", "Revision", "Average ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthQA (0-shot) ⬆️"]
+COLS = ["Model", "Revision", "Average ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthfulQA (0-shot) ⬆️"]
 TYPES = ["markdown","str", "number", "number", "number", "number", "number", ]
 
 if not IS_PUBLIC:
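For context, `load_results` averages a single metric across every task in a results file and returns that per-benchmark score along with the `model_args` used for the run. A minimal sketch of the payload shape it expects (contents invented for illustration):

```python
import numpy as np

# Hypothetical results payload in the shape load_results reads:
# {"results": {<task>: {<metric>: value, ...}, ...}, "config": {"model_args": ...}}
data = {
    "results": {
        "hendrycksTest-abstract_algebra": {"acc": 0.55},
        "hendrycksTest-anatomy": {"acc": 0.79},
    },
    "config": {"model_args": "pretrained=some-org/some-model"},
}

accs = np.array([v["acc"] for k, v in data["results"].items()])
print(np.mean(accs))  # 0.67 -> the per-benchmark score shown on the board
```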
@@ -57,36 +57,36 @@ if not IS_PUBLIC:
 EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
 EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
 def get_leaderboard():
-    if repo:
+    if repo:
         print("pulling changes")
         repo.git_pull()
-
+
     all_data = get_eval_results_dicts(IS_PUBLIC)
-
+
     if not IS_PUBLIC:
         gpt4_values = {
-            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
-            "Revision":"tech report",
+            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
+            "Revision":"tech report",
             "8bit":None,
             "Average ⬆️":84.3,
             "ARC (25-shot) ⬆️":96.3,
             "HellaSwag (10-shot) ⬆️":95.3,
             "MMLU (5-shot) ⬆️":86.4,
-            "TruthQA (0-shot) ⬆️":59.0,
+            "TruthfulQA (0-shot) ⬆️":59.0,
         }
         all_data.append(gpt4_values)
         gpt35_values = {
-            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
-            "Revision":"tech report",
+            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
+            "Revision":"tech report",
             "8bit":None,
             "Average ⬆️":71.9,
             "ARC (25-shot) ⬆️":85.2,
             "HellaSwag (10-shot) ⬆️":85.5,
             "MMLU (5-shot) ⬆️":70.0,
-            "TruthQA (0-shot) ⬆️":47.0,
+            "TruthfulQA (0-shot) ⬆️":47.0,
         }
         all_data.append(gpt35_values)
-
+
     dataframe = pd.DataFrame.from_records(all_data)
     dataframe = dataframe.sort_values(by=['Average ⬆️'], ascending=False)
     print(dataframe)
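The exact spelling of the column key matters more than it looks: `pd.DataFrame.from_records` builds one column per distinct dict key, so a reference row still carrying the old "TruthQA" spelling would silently split scores across two half-empty columns rather than raise an error. A small demonstration, reusing the values from the GPT entries above:

```python
import pandas as pd

rows = [
    {"Model": "gpt4", "TruthQA (0-shot) ⬆️": 59.0},        # pre-fix spelling
    {"Model": "gpt3.5", "TruthfulQA (0-shot) ⬆️": 47.0},   # post-fix spelling
]
print(pd.DataFrame.from_records(rows))
#     Model  TruthQA (0-shot) ⬆️  TruthfulQA (0-shot) ⬆️
# 0    gpt4                 59.0                     NaN
# 1  gpt3.5                  NaN                    47.0
```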
@@ -94,38 +94,38 @@ def get_leaderboard():
     return dataframe
 
 def get_eval_table():
-    if repo:
+    if repo:
         print("pulling changes for eval")
         repo.git_pull()
-    entries = [entry for entry in os.listdir("evals/eval_requests") if not entry.startswith('.')]
+    entries = [entry for entry in os.listdir("evals/eval_requests") if not entry.startswith('.')]
     all_evals = []
-
+
     for entry in entries:
         print(entry)
         if ".json"in entry:
             file_path = os.path.join("evals/eval_requests", entry)
             with open(file_path) as fp:
                 data = json.load(fp)
-
+
             data["# params"] = "unknown"
             data["model"] = make_clickable_model(data["model"])
             data["revision"] = data.get("revision", "main")
-
+
 
             all_evals.append(data)
         else:
             # this is a folder
-            sub_entries = [e for e in os.listdir(f"evals/eval_requests/{entry}") if not e.startswith('.')]
+            sub_entries = [e for e in os.listdir(f"evals/eval_requests/{entry}") if not e.startswith('.')]
             for sub_entry in sub_entries:
                 file_path = os.path.join("evals/eval_requests", entry, sub_entry)
                 with open(file_path) as fp:
                     data = json.load(fp)
-
+
                 #data["# params"] = get_n_params(data["model"])
                 data["model"] = make_clickable_model(data["model"])
                 all_evals.append(data)
 
-
+
     dataframe = pd.DataFrame.from_records(all_evals)
     return dataframe[EVAL_COLS]
 
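`get_eval_table` handles two layouts under `evals/eval_requests`: request JSONs at the top level and request JSONs one level down inside per-user folders. A roughly equivalent traversal using `os.walk`, shown for illustration only (this is not the app's code):

```python
import json
import os

def iter_eval_requests(root="evals/eval_requests"):
    # Yield every eval-request dict, whether the JSON sits directly under
    # root or inside a per-user subfolder.
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            if name.endswith(".json") and not name.startswith("."):
                with open(os.path.join(dirpath, name)) as fp:
                    yield json.load(fp)
```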
@@ -137,12 +137,12 @@ def is_model_on_hub(model_name, revision) -> bool:
     try:
         config = AutoConfig.from_pretrained(model_name, revision=revision)
         return True
-
+
     except Exception as e:
         print("Could not get the model config from the hub")
         print(e)
         return False
-
+
 
 
 def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
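`is_model_on_hub` leans on the fact that fetching only the config is a cheap existence check: `AutoConfig.from_pretrained` raises when the repo or revision cannot be resolved, and no weights are downloaded. A standalone usage sketch:

```python
from transformers import AutoConfig

try:
    AutoConfig.from_pretrained("gpt2", revision="main")  # config only, no weights
    print("on hub")
except Exception as e:
    print("not on hub:", e)
```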
@@ -152,12 +152,12 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
     if is_delta_weight and not is_model_on_hub(base_model, revision):
         print(base_model, "base model not found on hub")
         return
-
+
     if not is_model_on_hub(model, revision):
         print(model, "not found on hub")
         return
     print("adding new eval")
-
+
     eval_entry = {
         "model" : model,
         "base_model" : base_model,
@@ -166,22 +166,22 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
         "8bit_eval" : is_8_bit_eval,
         "is_delta_weight" : is_delta_weight,
         "status" : "PENDING"
-    }
-
+    }
+
     user_name = ""
     model_path = model
     if "/" in model:
         user_name = model.split("/")[0]
         model_path = model.split("/")[1]
-
+
     OUT_DIR=f"eval_requests/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
     out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
-
+
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
     LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
-
+
     api = HfApi()
     api.upload_file(
         path_or_fileobj=out_path,
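For a hypothetical submission `some-user/some-model` with all three flags off, the request file lands at `eval_requests/some-user/some-model_eval_request_False_False_False.json` locally before being mirrored to the `HuggingFaceH4/lmeh_evaluations` dataset repo. A sketch of the path construction:

```python
model = "some-user/some-model"  # hypothetical submission
private = is_8_bit_eval = is_delta_weight = False

user_name, model_path = model.split("/") if "/" in model else ("", model)
out_path = (f"eval_requests/{user_name}/"
            f"{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json")
print(out_path)  # eval_requests/some-user/some-model_eval_request_False_False_False.json
```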
@@ -191,14 +191,14 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
         repo_type="dataset",
     )
 
-
+
 def refresh():
     return get_leaderboard(), get_eval_table()
-
+
 
 
 block = gr.Blocks()
-with block:
+with block:
     with gr.Row():
         gr.Markdown(f"""
 # 🤗 Open LLM Leaderboard
@@ -208,49 +208,47 @@ Evaluation is performed against 4 popular benchmarks:
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
-- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthQA </a> (0-shot) - a benchmark to measure whether a language model is truthful in generating answers to questions.
+- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a benchmark to measure whether a language model is truthful in generating answers to questions.
 
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings. </font>
 """)
-
+
     with gr.Row():
         leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
                                                     datatype=TYPES, max_rows=5)
 
-
-
+
+
     with gr.Row():
         gr.Markdown(f"""
 # Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
-
+
 """)
     with gr.Accordion("Evaluation Queue", open=False):
         with gr.Row():
             eval_table = gr.components.Dataframe(value=eval_queue, headers=EVAL_COLS,
-                                                 datatype=EVAL_TYPES, max_rows=5)
-
+                                                 datatype=EVAL_TYPES, max_rows=5)
+
     with gr.Row():
         refresh_button = gr.Button("Refresh")
-        refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
-
+        refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
+
     with gr.Accordion("Submit a new model for evaluation"):
-        # with gr.Row():
-        #     gr.Markdown(f"""# Submit a new model for evaluation""")
         with gr.Row():
             with gr.Column():
                 model_name_textbox = gr.Textbox(label="Model name")
                 revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
-
+
             with gr.Column():
                 is_8bit_toggle = gr.Checkbox(False, label="8 bit eval", visible=not IS_PUBLIC)
                 private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
                 is_delta_weight = gr.Checkbox(False, label="Delta weights")
                 base_model_name_textbox = gr.Textbox(label="base model (for delta)")
-
+
         with gr.Row():
             submit_button = gr.Button("Submit Eval")
             submit_button.click(add_new_eval, [model_name_textbox, base_model_name_textbox, revision_name_textbox, is_8bit_toggle, private, is_delta_weight])
-
+
 
 block.load(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
 block.launch()
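Both `block.load` (fired when the page opens) and the Refresh button route through the same `refresh` callback, and Gradio maps the returned tuple onto the two table outputs in order. A self-contained sketch of that pattern with stand-in tables (not the app's real data):

```python
import gradio as gr
import pandas as pd

def refresh():
    # Stand-ins for get_leaderboard() / get_eval_table(); the real app
    # re-reads result and request files from disk here.
    return pd.DataFrame({"Model": ["demo"]}), pd.DataFrame({"model": ["demo"]})

block = gr.Blocks()
with block:
    leaderboard_table = gr.Dataframe()
    eval_table = gr.Dataframe()
    refresh_button = gr.Button("Refresh")
    # The same callback serves first render and manual refresh:
    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
    block.load(refresh, inputs=[], outputs=[leaderboard_table, eval_table])

block.launch()
```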
utils.py
CHANGED
@@ -21,7 +21,7 @@ BENCH_TO_NAME = {
     "arc_challenge":"ARC (25-shot) ⬆️",
     "hellaswag":"HellaSwag (10-shot) ⬆️",
     "hendrycks":"MMLU (5-shot) ⬆️",
-    "truthfulqa_mc":"TruthQA (0-shot) ⬆️",
+    "truthfulqa_mc":"TruthfulQA (0-shot) ⬆️",
 }
 def make_clickable_model(model_name):
     LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
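`BENCH_TO_NAME` maps lm-eval-harness task names to the display-column names, so this value had to change in lockstep with `COLS` and the GPT reference rows in app.py. A hypothetical sanity check, with both definitions inlined for a self-contained run:

```python
BENCH_TO_NAME = {
    "arc_challenge": "ARC (25-shot) ⬆️",
    "hellaswag": "HellaSwag (10-shot) ⬆️",
    "hendrycks": "MMLU (5-shot) ⬆️",
    "truthfulqa_mc": "TruthfulQA (0-shot) ⬆️",
}
COLS = ["Model", "Revision", "Average ⬆️", "ARC (25-shot) ⬆️",
        "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthfulQA (0-shot) ⬆️"]

# Every display name utils.py can emit must be a declared app.py column;
# otherwise scores silently land in a stray DataFrame column.
assert set(BENCH_TO_NAME.values()) <= set(COLS)
```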