explore data

- .gitignore +1 -0
- _header.md +2 -2
- app.py +48 -9
- constants.py +14 -9
- data_utils.py +93 -5
- eval_utils.py +217 -0
- update_data.sh +3 -2
- zebra_banner.png +0 -0
.gitignore
CHANGED
@@ -1,3 +1,4 @@
 
 *.pyc
 *.DS_Store
+ZeroEval-main/result_dirs/zebra-grid/
_header.md
CHANGED
@@ -1,5 +1,5 @@
 <br/>
 
-# 
-[📑 
+# 🦓 ZebraLogic Bench: Testing the Limits of LLMs in Logical Reasoning
+[📑 Blog](https://allenai.github.io/WildBench/WildBench_paper.pdf) | [💻 GitHub](https://github.com/allenai/WildBench) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/wildbench-65e8f2fa9c1260a85a933627) | [🐦 X]() | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
 
app.py
CHANGED
@@ -18,7 +18,7 @@ import os, uuid
 from utils_display import model_info
 from constants import column_names, LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN
 import pytz
-from data_utils import post_processing
+from data_utils import post_processing, get_random_item
 
 # get the last updated time from the elo_ranks.all.jsonl file
 LAST_UPDATED = None
@@ -34,6 +34,7 @@ with open("_header.md", "r") as f:
 with open("_metrics.md", "r") as f:
     METRICS_MD = f.read()
 
+raw_data = None
 original_df = None
 # available_models = [] # to be filled in later
 available_models = list(model_info.keys())
@@ -89,7 +90,44 @@ def _tab_leaderboard():
     mode_selection_radio.change(fn=df_filters, inputs=[mode_selection_radio, _gstr("")], outputs=[leaderboard_table])
 
 
-
+def sample_explore_item(model_name, size_H, size_W, greedy_or_sample):
+    print(model_name, size_H, size_W, greedy_or_sample)
+    explore_item = get_random_item(model_name, size_H, size_W)
+    if explore_item is None:
+        return "No item found", "No item found", "No item found", "No item found"
+    model_name = explore_item['Model']
+    example_id = explore_item['id']
+    puzzle_md = f"### Puzzle [{example_id}]:\n\n" + explore_item['puzzle'].replace("## Clues", "### Clues").replace("\n", "<br>")
+    model_reasoning_md = f"### {model_name}'s Reasoning:\n\n {explore_item['reasoning']}"
+    model_prediction_md = f"### {model_name}'s Prediction:\n\n {explore_item['solution']}" + "\n\n" + explore_item['solution_table_md']
+    puzzle_solved = explore_item['correct_cells'] == explore_item['total_cells']
+    cell_acc = explore_item["correct_cells"] / explore_item["total_cells"] * 100
+    model_eval_md = f"### Evaluation:\n\n **Total Cells**: {explore_item['total_cells']} | **Correct Cells**: {explore_item['correct_cells']} | **Puzzle solved**: {puzzle_solved} | **Cell Acc**: {cell_acc:.2f}%"
+    return puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md
+
+
+def _tab_explore():
+    global raw_data
+    model_names = [item["Model"] for item in raw_data]
+    with gr.Row():
+        model_selection = gr.Dropdown(choices = ["random"] + model_names, label="Model: ", elem_id="select-models", value="random", interactive=True)
+        size_H_selection = gr.Dropdown(choices = ["random"] + [f"{i}" for i in range(2,7)], label="Num of Houses", elem_id="select-H", value="random", interactive=True)
+        size_W_selection = gr.Dropdown(choices = ["random"] + [f"{i}" for i in range(2,7)], label="Num of Features", elem_id="select-W", value="random", interactive=True)
+        with gr.Column(scale=1):
+            greedy_or_sample = gr.Radio(["greedy", "sampling"], show_label=False, elem_id="greedy-or-sample", value="greedy", interactive=True)
+            explore_button = gr.Button("Sample", elem_id="explore-button")
+
+    puzzle_md = gr.Markdown("\n\nTo be loaded", elem_id="puzzle-md", elem_classes="box_md")
+    model_reasoning_md = gr.Markdown("\n\nTo be loaded", elem_id="model-reasoning-md", elem_classes="box_md")
+    model_prediction_md = gr.Markdown("\n\nTo be loaded", elem_id="model-prediction-md", elem_classes="box_md")
+    model_eval_md = gr.Markdown("\n\nTo be loaded", elem_id="model-eval-md", elem_classes="box_md")
+
+    explore_button.click(fn=sample_explore_item,
+                         inputs=[model_selection, size_H_selection, size_W_selection, greedy_or_sample],
+                         outputs=[puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md])
+
+
+
 def _tab_submit():
     pass
 
@@ -101,13 +139,14 @@ def build_demo():
         gr.HTML(BANNER, elem_id="banner")
         # convert LAST_UPDATED to the PDT time
        LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
-
-
+        header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
+        gr.Markdown(header_md_text, elem_classes="markdown-text")
 
         with gr.Tabs(elem_classes="tab-buttons") as tabs:
             with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
                 _tab_leaderboard()
-
+            with gr.TabItem("🔍 Explore", elem_id="od-benchmark-tab-table", id=1):
+                _tab_explore()
             with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
                 _tab_submit()
 
@@ -129,7 +168,7 @@ def build_demo():
 
 
 def data_load(result_file):
-    global original_df
+    global raw_data, original_df
    print(f"Loading {result_file}")
    column_names_main = column_names.copy()
    # column_names_main.update({})
@@ -137,15 +176,15 @@ def data_load(result_file):
    click_url = True
    # read json file from the result_file
    with open(result_file, "r") as f:
-
+        raw_data = json.load(f)
    # floatify the data, if possible
-    for d in
+    for d in raw_data:
        for k, v in d.items():
            try:
                d[k] = float(v)
            except:
                pass
-    original_df = pd.DataFrame(
+    original_df = pd.DataFrame(raw_data)
    original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
    # print(original_df.columns)
 
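> Note on the Gradio wiring introduced in `_tab_explore` above: the button's `click` event maps the dropdown/radio values to the handler's arguments and routes its four return values into the four Markdown panes. Below is a minimal, self-contained sketch of the same pattern; the `get_demo_item` handler and its choices are hypothetical placeholders, not part of this commit.

```python
import random
import gradio as gr

# Hypothetical stand-in for data_utils.get_random_item: returns one Markdown string.
def get_demo_item(model_name, size_H, size_W, mode):
    return f"### Puzzle\n\nModel={model_name}, size={size_H}*{size_W}, mode={mode}, roll={random.randint(1, 6)}"

with gr.Blocks() as demo:
    model_selection = gr.Dropdown(choices=["random", "model-a", "model-b"], value="random", label="Model")
    size_H = gr.Dropdown(choices=["random"] + [str(i) for i in range(2, 7)], value="random", label="Num of Houses")
    size_W = gr.Dropdown(choices=["random"] + [str(i) for i in range(2, 7)], value="random", label="Num of Features")
    mode = gr.Radio(["greedy", "sampling"], value="greedy", show_label=False)
    button = gr.Button("Sample")
    output_md = gr.Markdown("To be loaded")
    # Same wiring pattern as _tab_explore: input component values flow into the
    # handler, and its return value refreshes the Markdown component.
    button.click(fn=get_demo_item, inputs=[model_selection, size_H, size_W, mode], outputs=[output_md])

demo.launch()
```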
constants.py
CHANGED
@@ -5,20 +5,17 @@ DEFAULT_K = "∞"
 # DEFAULT_K = "1500"
 
 banner_url = "https://github.com/yuchenlin/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
-BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 
+BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
 
 TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
 
 WINRATE_HEATMAP = "<div><img src='https://github.com/WildEval/WildBench-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true' style='width:100%;'></div>"
 
-CITATION_TEXT = """@article{
-  title={
-  author={
-
-
-  archivePrefix={arXiv},
-  primaryClass={cs.CL},
-  url={https://arxiv.org/abs/2406.04770}
+CITATION_TEXT = """@article{tbd,
+  title={tbd},
+  author={tbd},
+  journal={tbd},
+  year={2024}
 }
 """
 
@@ -279,5 +276,13 @@ button.selected[role="tab"][aria-selected="true"] {
     font-size: 12pt;
     font-decoration: bold;
 }
+
+.box_md{
+    border: 1px solid #000000;
+    border-radius: 10px;
+    padding: 5px;
+    font-size: 12pt;
+    margin: 5px;
+}
 """
 
data_utils.py
CHANGED
@@ -11,12 +11,13 @@ import math
 import json
 from tqdm import tqdm
 import numpy as np
+import os
 
-
-
-
-eval_results = None
-score_eval_results = None
+from eval_utils import *
+
+summary_file = "ZeroEval-main/result_dirs/zebra-grid.summary.json"
+result_dir = "ZeroEval-main/result_dirs/zebra-grid/"
+results_by_model = {}
 
 # Formats the columns
 def formatter(x):
@@ -41,3 +42,90 @@ def post_processing(df, column_names, rank_column=RANKING_COLUMN, ordered_columns
     df.sort_values(by=rank_column, inplace=True, ascending=False)
     return df
 
+
+def load_all_data():
+    global summary_file, result_dir
+    with open(summary_file, "r") as f:
+        model_summary = json.load(f)
+    model_names = [model["Model"] for model in model_summary]
+    for model_name in model_names:
+        download_url = f"https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid/{model_name}.json"
+        output_file = os.path.join(result_dir, f"{model_name}.json")
+        # mkdir -p result_dir if not exists
+        os.makedirs(result_dir, exist_ok=True)
+        if not os.path.exists(output_file):
+            os.system(f"wget {download_url} -O {output_file}")
+            print(f"Downloaded {model_name}.json")
+        with open(output_file, "r") as f:
+            print(f"Loading {output_file}")
+            results_by_model[model_name] = json.load(f)
+
+def get_random_item(model_name="random", size_H="random", size_W="random"):
+    global summary_file, result_dir, results_by_model
+    if results_by_model is None or len(results_by_model) == 0:
+        load_all_data()
+    if model_name == "random":
+        model_name = random.choice(list(results_by_model.keys()))
+    data = results_by_model[model_name]
+    random.shuffle(data)
+    selected_item = None
+    prediction_table = None
+    prediction_reasoning = None
+    id_to_item = {}
+    for item in data:
+        id_to_item[item["id"]] = item
+
+    if size_H == "random":
+        size_H_choice = random.choice(list(range(2, 7)))
+    else:
+        size_H_choice = size_H
+    if size_W == "random":
+        size_W_choice = random.choice(list(range(2, 7)))
+    else:
+        size_W_choice = size_W
+    ok_ids = [id for id in id_to_item if id_to_item[id]["size"].startswith(f"{size_H_choice}*{size_W_choice}")]
+    for ok_id in ok_ids:
+        item = id_to_item[ok_id]
+        prediction_str = item["output"][0]
+        prediction_json = extract_last_complete_json(prediction_str)
+        if prediction_json is None or "solution" not in prediction_json:
+            continue
+        prediction_reasoning = prediction_json.get("reasoning", "")
+        prediction_table = prediction_json["solution"]
+        if prediction_table is not None:
+            selected_item = item
+            break
+
+    if selected_item is None:
+        # selected_item = random.choice(data)
+        print("No item found!")
+        return None
+
+    explore_item = {}
+    explore_item["id"] = selected_item["id"]
+    explore_item["Model"] = model_name
+    explore_item["size"] = selected_item["size"]
+    explore_item["puzzle"] = selected_item["puzzle"]
+    explore_item["solution"] = prediction_table
+    explore_item["reasoning"] = prediction_reasoning
+    headers = ["Houses"] + list(prediction_table["House 1"].keys())
+    rows = []
+    for row_id in range(len(prediction_table)):
+        row = [row_id+1]
+        for feature in headers[1:]:
+            row.append(prediction_table[f"House {row_id+1}"][feature])
+        rows.append(row)
+    table_md = tabulate(rows, headers=headers, tablefmt="github")
+    explore_item["solution_table_md"] = table_md
+
+    this_total_cells, this_correct_cells = eval_each_puzzle(explore_item["id"], prediction_table)
+    # print(table_md)
+    explore_item["correct_cells"] = this_correct_cells
+    explore_item["total_cells"] = this_total_cells
+    return explore_item
+
+
+if __name__ == "__main__":
+    load_all_data()
+    print("All data downloaded!")
+    print(json.dumps(get_random_item(model_name="gemini-1.5-pro", size_H="2", size_W="5"), indent=2))
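> Note on the `solution_table_md` field built in `get_random_item` above: it relies on `tabulate` to turn the predicted `{"House N": {feature: value}}` dict into a GitHub-flavored Markdown table that renders inside `gr.Markdown`. A standalone sketch of that conversion, with a hypothetical two-house prediction:

```python
from tabulate import tabulate

# Hypothetical prediction in the shape get_random_item expects:
# a dict keyed by "House 1".."House N", each mapping feature -> value.
prediction_table = {
    "House 1": {"Name": "Arnold", "Drink": "tea"},
    "House 2": {"Name": "Eric", "Drink": "water"},
}

# First column is the house index; remaining columns follow House 1's features.
headers = ["Houses"] + list(prediction_table["House 1"].keys())
rows = []
for row_id in range(len(prediction_table)):
    row = [row_id + 1]
    for feature in headers[1:]:
        row.append(prediction_table[f"House {row_id + 1}"][feature])
    rows.append(row)

# tablefmt="github" emits pipe-delimited Markdown, e.g.
# | Houses | Name   | Drink |
# |--------|--------|-------|
print(tabulate(rows, headers=headers, tablefmt="github"))
```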
eval_utils.py
ADDED
@@ -0,0 +1,217 @@
+import json
+from collections import defaultdict
+import os
+from tabulate import tabulate
+from datasets import load_dataset
+
+private_solutions = {}
+
+def load_private_solutions():
+    global private_solutions
+    private_zebra_data = load_dataset("allenai/ZebraLogicBench-private", "grid_mode", split="test")
+    for item in private_zebra_data:
+        private_solutions[item["id"]] = item["solution"]
+    return
+
+def load_model_results(run_name_folders):
+    model_results = {}
+    for run_name, folder in run_name_folders.items():
+        # iterate all json files under the folder
+        for filename in os.listdir(folder):
+            filepath = os.path.join(folder, filename)
+            if not filename.endswith(".json"):
+                continue
+            model_name = filename.replace(".json", "")
+            model_name = f"{model_name}%{run_name}"
+            model_results[model_name] = filepath
+    return model_results
+
+def extract_last_complete_json(s):
+    # Stack to keep track of opening and closing braces
+    stack = []
+    last_json_start = None
+    last_json_str = None
+
+    for i, char in enumerate(s):
+        if char == '{':
+            stack.append(i)
+            if last_json_start is None:
+                last_json_start = i
+        elif char == '}':
+            if stack:
+                start = stack.pop()
+                if not stack:
+                    # Complete JSON object found
+                    last_json_str = s[last_json_start:i+1]
+                    last_json_start = None
+
+    # Load the last JSON object
+    if last_json_str:
+        try:
+            return json.loads(last_json_str.replace("\n", ""))
+        except json.JSONDecodeError:
+            pass
+
+    return None
+
+def eval_each_puzzle(id, prediction_table):
+    global private_solutions
+    if not private_solutions:
+        load_private_solutions()
+    solution = private_solutions[id]
+    solution_table = {}
+    num_houses = len(solution["rows"])
+    columns = solution["header"]
+    assert columns[0] == "House"
+    solution_table = {}
+    this_total_cells = 0
+    for i in range(num_houses):
+        solution_table[f'House {i+1}'] = {columns[j]: solution["rows"][i][j] for j in range(1, len(columns))}
+        this_total_cells += len(columns) - 1
+
+    this_correct_cells = 0 # number in the solution_table
+    for house in solution_table:
+        for column in solution_table[house]:
+            # if prediction_table[house][column] not exist then pass
+            if house in prediction_table and column in prediction_table[house]:
+                truth_cell = solution_table[house][column].lower().strip()
+                if prediction_table[house][column] is None or len(prediction_table[house][column]) == 0:
+                    continue
+                if type(prediction_table[house][column]) == list:
+                    predicted_cell = prediction_table[house][column][0].lower().strip()
+                elif type(prediction_table[house][column]) == str:
+                    predicted_cell = prediction_table[house][column].lower().strip()
+                if truth_cell == predicted_cell:
+                    this_correct_cells += 1
+    return this_total_cells, this_correct_cells
+
+def eval_model(model, filepath):
+    global private_solutions
+    with open(filepath, "r") as f:
+        print(f"Processing {filepath}")
+        data = json.load(f)
+
+    solved_puzzles = 0
+    num_total_puzzles = len(data)
+    correct_cells = 0
+    total_cells = 0
+    no_answer = 0
+
+    num_total_puzzles_by_size = defaultdict(int)
+    solved_puzzles_by_size = defaultdict(int)
+    reason_lens = []
+    for item in data:
+        # solution = item["solution"]
+        solution = private_solutions[item["id"]]
+        size = item["size"]
+        num_total_puzzles_by_size[size] += 1
+
+        # Process the solution
+        solution_table = {}
+        num_houses = len(solution["rows"])
+        columns = solution["header"]
+        assert columns[0] == "House"
+        solution_table = {}
+        this_total_cells = 0
+        for i in range(num_houses):
+            solution_table[f'House {i+1}'] = {columns[j]: solution["rows"][i][j] for j in range(1, len(columns))}
+            this_total_cells += len(columns) - 1
+        total_cells += this_total_cells
+
+        # Read and Parse the prediction from model output
+        prediction_str = item["output"][0]
+        prediction_json = extract_last_complete_json(prediction_str)
+        if prediction_json is None or "solution" not in prediction_json:
+            # print("-"*100)
+            # prediction_str = prediction_str.replace("\n", "")
+            # print([prediction_str])
+            # json.loads(prediction_str)
+            no_answer += 1
+            # print(item["id"])
+            continue
+        reason = prediction_json.get("reasoning", "")
+        prediction_table = prediction_json["solution"]
+
+        reason_lens.append(len(reason))
+
+        this_correct_cells = 0 # number in the solution_table
+        for house in solution_table:
+            for column in solution_table[house]:
+                # if prediction_table[house][column] not exist then pass
+                if house in prediction_table and column in prediction_table[house]:
+                    truth_cell = solution_table[house][column].lower().strip()
+                    if prediction_table[house][column] is None or len(prediction_table[house][column]) == 0:
+                        continue
+                    if type(prediction_table[house][column]) == list:
+                        predicted_cell = prediction_table[house][column][0].lower().strip()
+                    elif type(prediction_table[house][column]) == str:
+                        predicted_cell = prediction_table[house][column].lower().strip()
+                    else:
+                        raise ValueError(f"Unknown type: {type(prediction_table[house][column])}")
+                    if truth_cell == predicted_cell:
+                        this_correct_cells += 1
+        correct_cells += this_correct_cells
+
+        # compute puzzle success rate
+        if this_correct_cells == this_total_cells:
+            solved_puzzles += 1
+            solved_puzzles_by_size[size] += 1
+
+
+
+
+    # # print the success rate by size; order the dict by size first
+    sizes = sorted(num_total_puzzles_by_size.keys())
+    easy_sizes = ['2*2', '2*3', '2*4', '2*5', '2*6', '3*2', '3*3',]
+    hard_sizes = ['3*4', '3*5', '4*2', '3*6', '4*3', '4*4', '5*2', '6*2', '4*5', '4*6', '5*3', '5*4', '5*5', '5*6', '6*3', '6*4', '6*5', '6*6']
+
+    easy_solved_puzzles = sum([solved_puzzles_by_size[size] for size in easy_sizes])
+    easy_total_puzzles = sum([num_total_puzzles_by_size[size] for size in easy_sizes])
+    hard_solved_puzzles = sum([solved_puzzles_by_size[size] for size in hard_sizes])
+    hard_total_puzzles = sum([num_total_puzzles_by_size[size] for size in hard_sizes])
+
+    # for size in sizes:
+    #     print(f"Size {size}: {solved_puzzles_by_size[size]}/{num_total_puzzles_by_size[size]} -> {solved_puzzles_by_size[size]/num_total_puzzles_by_size[size]*100:.2f}%")
+
+    result = {}
+    result["Model"] = model.split("%")[0]
+    result["Mode"] = model.split("%")[1]
+    result["Puzzle Acc"] = f"{solved_puzzles/num_total_puzzles*100:.2f}"
+    result["Cell Acc"] = f"{correct_cells/total_cells*100:.2f}"
+    result["No answer"] = f"{no_answer/num_total_puzzles*100:.2f}"
+    result["Easy Puzzle Acc"] = f"{easy_solved_puzzles/easy_total_puzzles*100:.2f}"
+    result["Hard Puzzle Acc"] = f"{hard_solved_puzzles/hard_total_puzzles*100:.2f}"
+    result["Total Puzzles"] = num_total_puzzles
+    result["Reason Lens"] = f"{sum(reason_lens)/len(reason_lens):.2f}"
+    return result
+
+
+def gen_results(run_name_folders):
+    model_results = load_model_results(run_name_folders)
+
+    columns = ["Model", "Mode", "Puzzle Acc", "Cell Acc", "No answer", "Easy Puzzle Acc", "Hard Puzzle Acc", "Total Puzzles", "Reason Lens"]
+    rows = []
+    for model_name, filepath in model_results.items():
+        result = eval_model(model_name, filepath)
+        rows.append(result)
+
+    # sort the rows by puzzle accuracy
+    rows = sorted(rows, key=lambda x: -float(x["Puzzle Acc"]))
+    # Convert rows to the expected format for tabulate
+    table_data = [[row[col] for col in columns] for row in rows]
+
+    print(tabulate(table_data, headers=columns, tablefmt="fancy_outline", stralign="center", numalign="center"))
+    # print(tabulate(rows, headers=columns, tablefmt="github"))
+
+    # write to json file
+    with open("result_dirs/zebra-grid.summary.json", "w") as f:
+        json.dump(rows, f, indent=2)
+
+
+if __name__ == "__main__":
+    run_name_folders = {
+        "greedy": "result_dirs/zebra-grid",
+        "sampling": "result_dirs/zebra-grid/sampling",
+    }
+    load_private_solutions()
+    gen_results(run_name_folders)
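> Note on `extract_last_complete_json` above: it scans the model output with a brace stack and keeps the last balanced `{...}` span, so stray braces in the free-form reasoning text don't break parsing. A usage sketch, assuming `eval_utils.py` is on the import path; the `raw_output` string is a made-up example:

```python
from eval_utils import extract_last_complete_json

# A typical model output: free-form reasoning followed by a JSON answer.
# The "{braces}" earlier in the text are distractors that a naive
# first-brace-to-last-brace slice would trip over.
raw_output = (
    "Let me think step by step... the {braces} here are distractors. "
    'Final answer: {"reasoning": "By clue 1, Arnold is in House 1.", '
    '"solution": {"House 1": {"Name": "Arnold"}, "House 2": {"Name": "Eric"}}}'
)

parsed = extract_last_complete_json(raw_output)
print(parsed["solution"])
# {'House 1': {'Name': 'Arnold'}, 'House 2': {'Name': 'Eric'}}
```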
update_data.sh
CHANGED
@@ -1,4 +1,5 @@
 # download the file from https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json
 # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
-mkdir -p ZeroEval-main/result_dirs
-wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
+mkdir -p ZeroEval-main/result_dirs/zebra-grid/
+wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
+wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid/deepseek-chat.json -O ZeroEval-main/result_dirs/zebra-grid/deepseek-chat.json
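> A minimal Python equivalent of this script, in case `wget` is unavailable in the runtime (an assumption; the shell script above is the canonical path):

```python
import os
import urllib.request

# Fetch the summary plus one per-model result file into the same layout
# update_data.sh creates. Paths mirror the script; nothing else is assumed.
BASE = "https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs"
TARGET_DIR = "ZeroEval-main/result_dirs"

os.makedirs(os.path.join(TARGET_DIR, "zebra-grid"), exist_ok=True)
for rel_path in ["zebra-grid.summary.json", "zebra-grid/deepseek-chat.json"]:
    urllib.request.urlretrieve(f"{BASE}/{rel_path}", os.path.join(TARGET_DIR, rel_path))
```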
zebra_banner.png
CHANGED