Spaces:
Sleeping
Sleeping
initial commit
Browse files- .gitattributes +1 -0
- app.py +322 -0
- data/mwoz_leaderboard_results.json +3 -0
- data/tau_leaderboard_results.json +3 -0
- process_submissions.py +76 -0
- requirements.txt +2 -0
- submissions/20250130_140218-4o.json +3 -0
- submissions/20250130_140439-4omini.json +3 -0
- submissions/20250130_145202-gpt35.json +3 -0
- submissions/20250130_183030-claude.json +3 -0
- submissions/20250130_184905-mistrallarge.json +3 -0
- submissions/20250131_010143-o1mini.json +3 -0
- submissions/20250131_012338-llama405.json +3 -0
- submissions/20250131_012449-llama70.json +3 -0
- submissions/20250131_013711-qwen72b.json +3 -0
- submissions/20250131_152226-tau-4o-mini-airline.json +3 -0
- submissions/20250131_152338-tau-4o-mini-retail.json +3 -0
- submissions/20250131_152422-tau-4o-retail.json +3 -0
- submissions/20250131_152503-tau-4o-airline.json +3 -0
- submissions/20250131_152610-tau-gpt35-retail.json +3 -0
- submissions/20250131_152708-tau-gpt35-airline.json +3 -0
- submissions/20250131_152807-tau-sonnet-retail.json +3 -0
- submissions/20250202_112945-qwen72b-airline.json +3 -0
- submissions/20250202_140527-qwen72b-retail.json +3 -0
- submissions/20250204_144222-tau-llama-405b-airline.json +3 -0
- submissions/20250205_024823-tau-mistrallarge-airline.json +3 -0
- submissions/20250205_030422-tau-sonnet-airline.json +3 -0
- submissions/20250205_033820-tau-llama405b-retail.json +3 -0
- submissions/20250205_044403-tau-mistrallarge-retail.json +3 -0
- submissions/20250208_024344-tau-llama70b-airline.json +3 -0
- submissions/20250208_030407-tau-llama70b-retail.json +3 -0
- submissions/20250214_142736-tau-o1-mini-retail.json +3 -0
- submissions/20250214_180731-tau-o1-mini-airline.json +3 -0
- submissions/20250214_193236-o1.json +3 -0
- submissions/20250215_115156-tau-o1-airline.json +3 -0
- submissions/20250215_121147-tau-o1-retail.json +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
|
6 |
+
def strip_timestamp(name):
    """Remove a leading ``YYYYMMDD_HHMMSS-`` timestamp from a model name.

    Args:
        name: Submission identifier such as ``"20250214_193236-o1"``.

    Returns:
        The text after the first ``-`` when the segment before it has the
        timestamp shape (8 digits, underscore, 6 digits); otherwise ``name``
        unchanged, so hyphenated names without a timestamp are not truncated.
    """
    prefix, dash, remainder = name.partition('-')
    date_part, underscore, time_part = prefix.partition('_')
    looks_like_timestamp = (
        underscore == '_'
        and len(date_part) == 8
        and len(time_part) == 6
        and (date_part + time_part).isdigit()
    )
    # Only drop the prefix when it really is a timestamp; "gpt-4o" must stay intact.
    if dash and looks_like_timestamp:
        return remainder
    return name
|
10 |
+
|
11 |
+
# Static grouping of the 10 general submissions. Each row lists the
# submission identifiers of one model for, in order, the "mwoz",
# "tau_airline" and "tau_retail" variants.
_GROUP_FIELDS = ("mwoz", "tau_airline", "tau_retail")
_GROUP_ROWS = (
    ("20250214_193236-o1", "20250215_115156-tau-o1-airline", "20250215_121147-tau-o1-retail"),
    ("20250131_012338-llama405", "20250204_144222-tau-llama-405b-airline", "20250205_033820-tau-llama405b-retail"),
    ("20250130_140218-4o", "20250131_152503-tau-4o-airline", "20250131_152422-tau-4o-retail"),
    ("20250130_183030-claude", "20250205_030422-tau-sonnet-airline", "20250131_152807-tau-sonnet-retail"),
    ("20250131_012449-llama70", "20250208_024344-tau-llama70b-airline", "20250208_030407-tau-llama70b-retail"),
    ("20250131_013711-qwen72b", "20250202_112945-qwen72b-airline", "20250202_140527-qwen72b-retail"),
    ("20250130_184905-mistrallarge", "20250205_024823-tau-mistrallarge-airline", "20250205_044403-tau-mistrallarge-retail"),
    ("20250131_010143-o1mini", "20250214_180731-tau-o1-mini-airline", "20250214_142736-tau-o1-mini-retail"),
    ("20250130_140439-4omini", "20250131_152226-tau-4o-mini-airline", "20250131_152338-tau-4o-mini-retail"),
    ("20250130_145202-gpt35", "20250131_152708-tau-gpt35-airline", "20250131_152610-tau-gpt35-retail"),
)
# One dict per model, keyed by variant name.
GROUPS = [dict(zip(_GROUP_FIELDS, row)) for row in _GROUP_ROWS]
|
64 |
+
|
65 |
+
def load_mwoz_results():
    """Load mwoz results from data/mwoz_leaderboard_results.json.

    Returns:
        The parsed JSON content, or an empty list when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    path = os.path.join("data", "mwoz_leaderboard_results.json")
    # Open directly instead of checking existence first: avoids a second
    # filesystem call and the window where the file disappears in between.
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return []
|
72 |
+
|
73 |
+
def load_tau_results():
    """Load tau results from data/tau_leaderboard_results.json.

    Returns:
        The parsed JSON content, or an empty list when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    path = os.path.join("data", "tau_leaderboard_results.json")
    # Open directly instead of checking existence first: avoids a second
    # filesystem call and the window where the file disappears in between.
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return []
|
80 |
+
|
81 |
+
def create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state):
    """Create the aggregated leaderboard DataFrame.

    For every entry in GROUPS, the three metrics of each selected variant that
    has a result record are averaged, and an overall average of the three
    metric averages is computed.

    Args:
        selected_mwoz: Include the "mwoz" variant when truthy.
        selected_tau_airline: Include the "tau_airline" variant when truthy.
        selected_tau_retail: Include the "tau_retail" variant when truthy.
        sort_state: Dict with "sort_by" and "ascending" keys, or a falsy value
            for no sorting.

    Returns:
        A DataFrame with the columns "Model", "Average Score",
        "Conversation Consistency", "Backend Consistency",
        "Policy Completeness" and "Judge Model". Groups for which no selected
        variant has a record are omitted.
    """
    metric_keys = ("avg_conv_consistency", "avg_backend_consistency", "avg_policy_completeness")
    columns = [
        "Model",
        "Average Score",
        "Conversation Consistency",
        "Backend Consistency",
        "Policy Completeness",
        "Judge Model",
    ]

    # Ensure at least one variant is active.
    if not (selected_mwoz or selected_tau_airline or selected_tau_retail):
        selected_mwoz = True

    # Entries without a model_name cannot be matched to a group; ignore them
    # instead of failing the whole leaderboard.
    mwoz_lookup = {
        entry["model_name"]: entry
        for entry in load_mwoz_results()
        if isinstance(entry, dict) and "model_name" in entry
    }
    tau_lookup = {
        entry["model_name"]: entry
        for entry in load_tau_results()
        if isinstance(entry, dict) and "model_name" in entry
    }

    # (selected?, key inside a GROUPS entry, lookup table holding its records)
    variants = (
        (selected_mwoz, "mwoz", mwoz_lookup),
        (selected_tau_airline, "tau_airline", tau_lookup),
        (selected_tau_retail, "tau_retail", tau_lookup),
    )

    aggregated = []
    for group in GROUPS:
        totals = dict.fromkeys(metric_keys, 0)
        count = 0
        title_parts = []
        judge_model = ""
        for selected, variant, lookup in variants:
            if not selected:
                continue
            key = group[variant]
            record = lookup.get(key)
            if record is None:
                continue
            for metric in metric_keys:
                # A missing or null metric counts as 0.
                totals[metric] += record.get(metric) or 0
            count += 1
            title_parts.append(strip_timestamp(key))
            # The last contributing variant decides the displayed judge model.
            judge_model = record.get("judge_model", "")

        if count == 0:
            # Nothing to show for this model; a blank all-zero row would only
            # distort the ranking and the colour scale.
            continue

        avg_conv, avg_backend, avg_policy = (totals[metric] / count for metric in metric_keys)
        overall_avg = (avg_conv + avg_backend + avg_policy) / 3
        aggregated.append({
            "Model": " / ".join(title_parts),
            "Average Score": round(overall_avg, 4),
            "Conversation Consistency": round(avg_conv, 4),
            "Backend Consistency": round(avg_backend, 4),
            "Policy Completeness": round(avg_policy, 4),
            "Judge Model": judge_model,
        })

    # Explicit columns keep the frame well-formed (and sortable) when empty.
    df = pd.DataFrame(aggregated, columns=columns)

    # Sort if a valid column is provided.
    allowed_sort_cols = ["Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness"]
    sort_by = sort_state.get("sort_by") if sort_state else None
    ascending = bool(sort_state.get("ascending", True)) if sort_state else True
    if sort_by in allowed_sort_cols:
        df = df.sort_values(sort_by, ascending=ascending)
    return df
|
156 |
+
|
157 |
+
def update_sort_state(current_state, clicked_column):
    """Return the sort state that results from clicking a column button.

    If the clicked column is already the sort column, the sort order is
    toggled; otherwise the clicked column becomes the sort column with
    ascending order.

    Args:
        current_state: Dict with "sort_by" and "ascending" keys, or None.
        clicked_column: Name of the column whose button was clicked.

    Returns:
        A new dict; ``current_state`` itself is left unmodified. Any other
        keys present in ``current_state`` are carried over.
    """
    if current_state is None:
        return {"sort_by": clicked_column, "ascending": True}

    if current_state.get("sort_by") == clicked_column:
        ascending = not current_state.get("ascending", True)
    else:
        ascending = True
    # Build a fresh dict rather than mutating the caller's object.
    return {**current_state, "sort_by": clicked_column, "ascending": ascending}
|
172 |
+
|
173 |
+
def sort_by_avg(sort_state):
    """Return the sort state after a click on the "Average Score" button."""
    column = "Average Score"
    return update_sort_state(sort_state, column)
|
175 |
+
|
176 |
+
def sort_by_conv(sort_state):
    """Return the sort state after a click on the "Conversation Consistency" button."""
    column = "Conversation Consistency"
    return update_sort_state(sort_state, column)
|
178 |
+
|
179 |
+
def sort_by_backend(sort_state):
    """Return the sort state after a click on the "Backend Consistency" button."""
    column = "Backend Consistency"
    return update_sort_state(sort_state, column)
|
181 |
+
|
182 |
+
def sort_by_policy(sort_state):
    """Return the sort state after a click on the "Policy Completeness" button."""
    column = "Policy Completeness"
    return update_sort_state(sort_state, column)
|
184 |
+
|
185 |
+
def get_color_for_value(value, min_val, max_val):
    """Compute a hex color for a value based on its position in a range.

    Interpolates from red (lowest) through yellow (middle) to green (highest).

    Args:
        value: The value to colorize.
        min_val: Lower end of the range.
        max_val: Upper end of the range.

    Returns:
        A ``#RRGGBB`` string. When ``min_val == max_val`` the middle color
        (yellow) is returned.
    """
    if max_val == min_val:
        norm = 0.5
    else:
        norm = (value - min_val) / (max_val - min_val)
    # Clamp so a value outside [min_val, max_val] cannot push a channel
    # outside 0..255 and yield a malformed color string.
    norm = min(1.0, max(0.0, norm))

    if norm < 0.5:
        # Red -> yellow: green channel ramps up.
        r = 255
        g = int(255 * (norm / 0.5))
    else:
        # Yellow -> green: red channel ramps down.
        r = int(255 * (1 - (norm - 0.5) / 0.5))
        g = 255
    b = 0
    return f"#{r:02X}{g:02X}{b:02X}"
|
205 |
+
|
206 |
+
def generate_html_table(df):
    """Generate an HTML table from the DataFrame.

    Cells of the numeric score columns get a text color based on the value's
    position between that column's minimum and maximum. All header and cell
    text is HTML-escaped before being inserted into the markup.

    Args:
        df: DataFrame to render; every column becomes one table column.

    Returns:
        The table markup as a single string.
    """
    # Local import: the table is filled with text read from result files, so
    # it must be escaped before it is embedded in markup.
    from html import escape

    numeric_cols = ["Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness"]
    # Only look at score columns the frame actually has.
    present_numeric = [col for col in numeric_cols if col in df.columns]
    col_min = {col: (df[col].min() if not df.empty else 0) for col in present_numeric}
    col_max = {col: (df[col].max() if not df.empty else 0) for col in present_numeric}

    parts = ["<table border='1' style='border-collapse: collapse; text-align: center; width: 100%;'>"]

    # Header row
    parts.append("<tr>")
    parts.extend(f"<th style='padding: 8px;'>{escape(str(col))}</th>" for col in df.columns)
    parts.append("</tr>")

    # Data rows
    for _, row in df.iterrows():
        parts.append("<tr>")
        for col in df.columns:
            cell_value = row[col]
            cell_text = escape(str(cell_value))
            if col in col_min:
                color = get_color_for_value(cell_value, col_min[col], col_max[col])
                # The color is applied to the text (color property), not the background.
                parts.append(f"<td style='padding: 8px; color: {color};'>{cell_text}</td>")
            else:
                parts.append(f"<td style='padding: 8px;'>{cell_text}</td>")
        parts.append("</tr>")

    parts.append("</table>")
    return "".join(parts)
|
239 |
+
|
240 |
+
def update_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state):
    """Build the aggregated leaderboard and return it rendered as an HTML table."""
    return generate_html_table(
        create_grouped_leaderboard(
            selected_mwoz,
            selected_tau_airline,
            selected_tau_retail,
            sort_state,
        )
    )
|
247 |
+
|
248 |
+
with gr.Blocks(title="Rome Leaderboard") as demo:
    gr.Markdown("# 🏆 Rome Model Evaluation Leaderboard")
    gr.Markdown("""
    This leaderboard displays aggregated model performance across multiple evaluation metrics.

    **Variants:**
    - **mwoz:** Baseline variant.
    - **tau-airline:** Airline specialty variant.
    - **tau-retail:** Retail specialty variant.

    Use the checkboxes below to select which variants to include. At least one variant must be active.
    """)

    # Variant selection.
    with gr.Row():
        cb_mwoz = gr.Checkbox(label="mwoz", value=True)
        cb_tau_airline = gr.Checkbox(label="tau-airline", value=True)
        cb_tau_retail = gr.Checkbox(label="tau-retail", value=True)

    gr.Markdown("### Sort by (click a button to toggle ascending/descending):")
    with gr.Row():
        btn_avg = gr.Button("Average Score")
        btn_conv = gr.Button("Conversation Consistency")
        btn_backend = gr.Button("Backend Consistency")
        btn_policy = gr.Button("Policy Completeness")

    # Default ordering: Average Score, descending.
    sort_state = gr.State({"sort_by": "Average Score", "ascending": False})

    leaderboard_display = gr.HTML(label="Aggregated Model Rankings")

    refresh_btn = gr.Button("🔄 Refresh Leaderboard")

    # Every listener below re-renders the table from the same inputs.
    render = dict(
        fn=update_leaderboard,
        inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, sort_state],
        outputs=leaderboard_display,
    )

    # Sort buttons: first update the sort state, then re-render.
    sort_buttons = (
        (btn_avg, sort_by_avg),
        (btn_conv, sort_by_conv),
        (btn_backend, sort_by_backend),
        (btn_policy, sort_by_policy),
    )
    for button, sorter in sort_buttons:
        button.click(fn=sorter, inputs=[sort_state], outputs=[sort_state]).then(**render)

    # Manual refresh.
    refresh_btn.click(**render)

    # Re-render as soon as any variant checkbox changes.
    for checkbox in (cb_mwoz, cb_tau_airline, cb_tau_retail):
        checkbox.change(**render)

    # Initial render when the app is loaded.
    demo.load(**render)

if __name__ == "__main__":
    demo.launch()
|
data/mwoz_leaderboard_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:30f233c17f2a9e1068eb5313c1cc1c1e4b622593eb01a40a34b1a95be2824873
|
3 |
+
size 3052
|
data/tau_leaderboard_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6dd7553f04d89e492bfec22e2e8f9ab8d7afb269c0b8c027709aedce3ac63aa9
|
3 |
+
size 9396
|
process_submissions.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
from pathlib import Path
|
4 |
+
import argparse
|
5 |
+
|
6 |
+
def process_submissions(input_file, output_dir='submissions'):
    """
    Process a JSON file containing multiple model submissions and split it into
    individual JSON files in the submissions directory.

    Each array element that is an object with a non-empty string ``model_name``
    is written to ``<output_dir>/<model_name>.json`` (path separators in the
    name are replaced by ``_``). Other elements are skipped with a warning.
    Problems reading or processing the input are reported on stdout rather
    than raised.

    Args:
        input_file (str): Path to the input JSON file
        output_dir (str): Directory where individual submission files will be stored
    """
    # Create submissions directory (including missing parents) if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Read the input file
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            submissions = json.load(f)

        if not isinstance(submissions, list):
            print(f"Error: Input file {input_file} must contain a JSON array of submissions")
            return

        # Process each submission
        written = 0
        for submission in submissions:
            model_name = submission.get('model_name') if isinstance(submission, dict) else None
            if not isinstance(model_name, str) or not model_name:
                # One malformed entry must not stop the remaining ones.
                print("Warning: Skipping submission without model_name field")
                continue

            # Create a safe filename from the model name: neither kind of
            # path separator may survive, so the file stays inside output_dir.
            safe_filename = f"{model_name.replace('/', '_').replace(chr(92), '_')}.json"
            output_path = os.path.join(output_dir, safe_filename)

            # Write individual submission file
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(submission, f, indent=4)
            written += 1

            print(f"Created submission file: {output_path}")

        # Report what was actually written, not the raw input length.
        print(f"\nProcessed {written} submissions successfully!")

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found")
    except json.JSONDecodeError:
        print(f"Error: Input file '{input_file}' is not valid JSON")
    except Exception as e:
        # Command-line boundary: report instead of showing a traceback.
        print(f"Error processing submissions: {str(e)}")
|
52 |
+
|
53 |
+
def main():
    """Parse the command line and split the given submissions file."""
    cli = argparse.ArgumentParser(
        description='Process a JSON file containing model submissions and split into individual files.',
    )
    # Positional: the combined submissions file.
    cli.add_argument('input_file', help='Path to the input JSON file containing model submissions')
    # Optional: target directory for the per-model files.
    cli.add_argument(
        '--output-dir', '-o',
        default='submissions',
        help='Directory where individual submission files will be stored (default: submissions)',
    )

    options = cli.parse_args()
    process_submissions(options.input_file, options.output_dir)

if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
gradio>=4.0.0
|
2 |
+
pandas>=2.0.0
|
submissions/20250130_140218-4o.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8e6ac4738378227537d66f8f231f943581cca65a5a88badbf7b089d8ddd436aa
|
3 |
+
size 276
|
submissions/20250130_140439-4omini.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:23d9f222f6ed78bfa0c664d31dad08d0da8782d6db7184469e9b5f1a43838bcd
|
3 |
+
size 280
|
submissions/20250130_145202-gpt35.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:241d41c00306dfe554be2e712ef70befb98cb15287f8f362b64039e63d42ebcc
|
3 |
+
size 279
|
submissions/20250130_183030-claude.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:28fd88cfba3985266a31145b0d7f8bbf7523e1486700bb1c0e1b8cdf395ae745
|
3 |
+
size 279
|
submissions/20250130_184905-mistrallarge.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fa32abc214592ed42253f9d09e60049fbded16abf25d27bb58aa883551f52916
|
3 |
+
size 286
|
submissions/20250131_010143-o1mini.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:72f8224b39acdde5456eb76914c7294153b1c3cb5d8f778b05fd646be89d1d4b
|
3 |
+
size 280
|
submissions/20250131_012338-llama405.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:97ad8b202d517c19c62e492a377d58612c8a95870b71ceb8587896a7977415e7
|
3 |
+
size 282
|
submissions/20250131_012449-llama70.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e9202529ccd1b01100e97ab1e49ffe9f80adf890a384bb0c9d54dcb50129044b
|
3 |
+
size 281
|
submissions/20250131_013711-qwen72b.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a5785dea65c6e83d70b8eaae6143aa4d8755c3dfda8fd256d971a67242b444a7
|
3 |
+
size 281
|
submissions/20250131_152226-tau-4o-mini-airline.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:54c876c671383e333599122f0789947b2906fbce489d26feb2fb44df6ca61bb9
|
3 |
+
size 430
|
submissions/20250131_152338-tau-4o-mini-retail.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ccbdb2fa53caa6d08641632a10588c610355a1cd238527b1ab96e40d5314a78c
|
3 |
+
size 429
|
submissions/20250131_152422-tau-4o-retail.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:caffe3b74905c9eb0953b3b7db2ca1caa2c9dbeb67aa9cb56f47c8102df82c73
|
3 |
+
size 419
|
submissions/20250131_152503-tau-4o-airline.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef952c41710fe34a4eef5f23f504e619bc4565a3eaea9ac587b4a2ea2ea3b660
|
3 |
+
size 420
|
submissions/20250131_152610-tau-gpt35-retail.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4d88dffc671daefb67e80fdfed3b7b71a7adc3e7975754fd6a435a5783e6435d
|
3 |
+
size 434
|
submissions/20250131_152708-tau-gpt35-airline.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:89b7e7b87a74bcfe903d7455c718569fab7f0f593fda15b00ceb40999b101e0e
|
3 |
+
size 435
|
submissions/20250131_152807-tau-sonnet-retail.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:946a9b068da07c43972b41281338689f17caf9a454e6aaefab0dbc199bf434d5
|
3 |
+
size 443
|
submissions/20250202_112945-qwen72b-airline.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e4c42e48a557b297a545d9ab5065736de9d3a843ca196625e3c18574f3925c56
|
3 |
+
size 428
|
submissions/20250202_140527-qwen72b-retail.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9bfbdf41eb3707e9d471ea6e8b0df66e0c30a876e64de13bfac05b19107dafa5
|
3 |
+
size 428
|
submissions/20250204_144222-tau-llama-405b-airline.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0d14f9ee6572bf5c99fa4524eeba1eb8e8eb3286a1ac36c63c3618014d6084ac
|
3 |
+
size 442
|
submissions/20250205_024823-tau-mistrallarge-airline.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:52aa75b36bbfc50d5e49ad5d0333a71248d833969db0050f1d1b2f71688e9a1c
|
3 |
+
size 435
|
submissions/20250205_030422-tau-sonnet-airline.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:69df986398a9874b7ae42ea16c74d2537012d5c94c146cc7316be3d39929dd6f
|
3 |
+
size 434
|
submissions/20250205_033820-tau-llama405b-retail.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:55533012c64e2aa9a09eeec6b5d8b1fb898e2d9e772cdac28c39de4c8167d158
|
3 |
+
size 431
|
submissions/20250205_044403-tau-mistrallarge-retail.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:147f1a3cd17768bb01b54f4568348054af1ea72363f04f5edfa50773dcda3f80
|
3 |
+
size 433
|
submissions/20250208_024344-tau-llama70b-airline.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a1b87a4e1981c7b8cfb364cc46ae4a16f4f85f9631d9c02e85008655c8b103e4
|
3 |
+
size 426
|
submissions/20250208_030407-tau-llama70b-retail.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fe0c10dbfe0d56a5cd65267a448058aabd8a0e82fdedaf3748f7ffa3d7b7d12e
|
3 |
+
size 424
|
submissions/20250214_142736-tau-o1-mini-retail.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:56519aef51f3d681df9e8f29c792af3f315cb67116c012bc6742a2977793c465
|
3 |
+
size 410
|
submissions/20250214_180731-tau-o1-mini-airline.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:165618025471980ee895ab8a12c04f4ca94eb0df19772c8057de2d415760f684
|
3 |
+
size 410
|
submissions/20250214_193236-o1.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:628356d0e3f25fcb909e50ff54c1a54b0b25ff7d2c8190ae1c87713edd90d238
|
3 |
+
size 276
|
submissions/20250215_115156-tau-o1-airline.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9db8211e9fa6bdb0b97c7341216dcce133cd815e31d0e209a5a5dd452939ffab
|
3 |
+
size 426
|
submissions/20250215_121147-tau-o1-retail.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8c3e438a5019e96efb58d7c17f9b82b6de54da087cb4960445326f8f858a69ab
|
3 |
+
size 423
|