juancauma committed on
Commit
0ae90b3
·
1 Parent(s): 34ca607

initial commit

Browse files
Files changed (36) hide show
  1. .gitattributes +1 -0
  2. app.py +322 -0
  3. data/mwoz_leaderboard_results.json +3 -0
  4. data/tau_leaderboard_results.json +3 -0
  5. process_submissions.py +76 -0
  6. requirements.txt +2 -0
  7. submissions/20250130_140218-4o.json +3 -0
  8. submissions/20250130_140439-4omini.json +3 -0
  9. submissions/20250130_145202-gpt35.json +3 -0
  10. submissions/20250130_183030-claude.json +3 -0
  11. submissions/20250130_184905-mistrallarge.json +3 -0
  12. submissions/20250131_010143-o1mini.json +3 -0
  13. submissions/20250131_012338-llama405.json +3 -0
  14. submissions/20250131_012449-llama70.json +3 -0
  15. submissions/20250131_013711-qwen72b.json +3 -0
  16. submissions/20250131_152226-tau-4o-mini-airline.json +3 -0
  17. submissions/20250131_152338-tau-4o-mini-retail.json +3 -0
  18. submissions/20250131_152422-tau-4o-retail.json +3 -0
  19. submissions/20250131_152503-tau-4o-airline.json +3 -0
  20. submissions/20250131_152610-tau-gpt35-retail.json +3 -0
  21. submissions/20250131_152708-tau-gpt35-airline.json +3 -0
  22. submissions/20250131_152807-tau-sonnet-retail.json +3 -0
  23. submissions/20250202_112945-qwen72b-airline.json +3 -0
  24. submissions/20250202_140527-qwen72b-retail.json +3 -0
  25. submissions/20250204_144222-tau-llama-405b-airline.json +3 -0
  26. submissions/20250205_024823-tau-mistrallarge-airline.json +3 -0
  27. submissions/20250205_030422-tau-sonnet-airline.json +3 -0
  28. submissions/20250205_033820-tau-llama405b-retail.json +3 -0
  29. submissions/20250205_044403-tau-mistrallarge-retail.json +3 -0
  30. submissions/20250208_024344-tau-llama70b-airline.json +3 -0
  31. submissions/20250208_030407-tau-llama70b-retail.json +3 -0
  32. submissions/20250214_142736-tau-o1-mini-retail.json +3 -0
  33. submissions/20250214_180731-tau-o1-mini-airline.json +3 -0
  34. submissions/20250214_193236-o1.json +3 -0
  35. submissions/20250215_115156-tau-o1-airline.json +3 -0
  36. submissions/20250215_121147-tau-o1-retail.json +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import json
4
+ import os
5
+
6
def strip_timestamp(name):
    """Return *name* without its leading ``<timestamp>-`` prefix.

    Everything up to and including the first hyphen is dropped. A name that
    contains no hyphen is returned unchanged.
    """
    _prefix, hyphen, remainder = name.partition('-')
    if not hyphen:
        return name
    return remainder
10
+
11
# Static grouping of submissions: each GROUPS entry ties one mwoz submission
# name to the tau-airline and tau-retail submission names that are aggregated
# into the same leaderboard row.
_GROUP_KEYS = ("mwoz", "tau_airline", "tau_retail")
GROUPS = [
    dict(zip(_GROUP_KEYS, names))
    for names in (
        ("20250214_193236-o1",
         "20250215_115156-tau-o1-airline",
         "20250215_121147-tau-o1-retail"),
        ("20250131_012338-llama405",
         "20250204_144222-tau-llama-405b-airline",
         "20250205_033820-tau-llama405b-retail"),
        ("20250130_140218-4o",
         "20250131_152503-tau-4o-airline",
         "20250131_152422-tau-4o-retail"),
        ("20250130_183030-claude",
         "20250205_030422-tau-sonnet-airline",
         "20250131_152807-tau-sonnet-retail"),
        ("20250131_012449-llama70",
         "20250208_024344-tau-llama70b-airline",
         "20250208_030407-tau-llama70b-retail"),
        ("20250131_013711-qwen72b",
         "20250202_112945-qwen72b-airline",
         "20250202_140527-qwen72b-retail"),
        ("20250130_184905-mistrallarge",
         "20250205_024823-tau-mistrallarge-airline",
         "20250205_044403-tau-mistrallarge-retail"),
        ("20250131_010143-o1mini",
         "20250214_180731-tau-o1-mini-airline",
         "20250214_142736-tau-o1-mini-retail"),
        ("20250130_140439-4omini",
         "20250131_152226-tau-4o-mini-airline",
         "20250131_152338-tau-4o-mini-retail"),
        ("20250130_145202-gpt35",
         "20250131_152708-tau-gpt35-airline",
         "20250131_152610-tau-gpt35-retail"),
    )
]
64
+
65
def load_mwoz_results():
    """Read the mwoz leaderboard entries stored under ``data/``.

    Returns the parsed JSON content of ``data/mwoz_leaderboard_results.json``,
    or an empty list when that file does not exist.
    """
    results_file = os.path.join("data", "mwoz_leaderboard_results.json")
    if os.path.exists(results_file):
        with open(results_file, "r") as handle:
            return json.load(handle)
    return []
72
+
73
def load_tau_results():
    """Read the tau leaderboard entries stored under ``data/``.

    Returns the parsed JSON content of ``data/tau_leaderboard_results.json``,
    or an empty list when that file does not exist.
    """
    results_file = os.path.join("data", "tau_leaderboard_results.json")
    if os.path.exists(results_file):
        with open(results_file, "r") as handle:
            return json.load(handle)
    return []
80
+
81
def create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state):
    """
    Create the aggregated leaderboard DataFrame.

    For every entry of GROUPS, the three metrics of each selected variant
    whose submission name is found in the loaded results are summed and
    divided by the number of variants found. "Average Score" is the mean of
    the three resulting per-metric averages. All scores are rounded to four
    decimals.

    Args:
        selected_mwoz: Truthy to include the mwoz variant.
        selected_tau_airline: Truthy to include the tau-airline variant.
        selected_tau_retail: Truthy to include the tau-retail variant.
        sort_state: Dict with "sort_by" and "ascending" keys, or a falsy
            value for no sorting. A missing "ascending" means ascending.

    Returns:
        A pandas DataFrame with the columns "Model", "Average Score",
        "Conversation Consistency", "Backend Consistency",
        "Policy Completeness" and "Judge Model", one row per GROUPS entry.
    """
    # Ensure at least one variant is active.
    if not (selected_mwoz or selected_tau_airline or selected_tau_retail):
        selected_mwoz = True

    mwoz_lookup = {entry["model_name"]: entry for entry in load_mwoz_results()}
    tau_lookup = {entry["model_name"]: entry for entry in load_tau_results()}

    metric_keys = ("avg_conv_consistency", "avg_backend_consistency", "avg_policy_completeness")
    # (is the variant selected, key inside a GROUPS entry, results to search)
    variants = (
        (selected_mwoz, "mwoz", mwoz_lookup),
        (selected_tau_airline, "tau_airline", tau_lookup),
        (selected_tau_retail, "tau_retail", tau_lookup),
    )
    columns = [
        "Model",
        "Average Score",
        "Conversation Consistency",
        "Backend Consistency",
        "Policy Completeness",
        "Judge Model",
    ]

    aggregated = []
    for group in GROUPS:
        totals = dict.fromkeys(metric_keys, 0)
        count = 0
        title_parts = []
        judge_model = ""
        for selected, variant, lookup in variants:
            if not selected:
                continue
            key = group[variant]
            record = lookup.get(key)
            if record is None:
                # No result stored for this submission name.
                continue
            for metric in metric_keys:
                totals[metric] += record.get(metric, 0)
            count += 1
            title_parts.append(strip_timestamp(key))
            # The judge model of the last variant found is the one displayed.
            judge_model = record.get("judge_model", "")

        if count > 0:
            avg_conv = totals["avg_conv_consistency"] / count
            avg_backend = totals["avg_backend_consistency"] / count
            avg_policy = totals["avg_policy_completeness"] / count
            overall_avg = (avg_conv + avg_backend + avg_policy) / 3
        else:
            # A group without any matching result still yields an all-zero row.
            avg_conv = avg_backend = avg_policy = overall_avg = 0

        aggregated.append({
            "Model": " / ".join(title_parts),
            "Average Score": round(overall_avg, 4),
            "Conversation Consistency": round(avg_conv, 4),
            "Backend Consistency": round(avg_backend, 4),
            "Policy Completeness": round(avg_policy, 4),
            "Judge Model": judge_model,
        })

    # Explicit columns keep the frame well-formed (and sortable) even when
    # there are no rows at all.
    df = pd.DataFrame(aggregated, columns=columns)

    # Sort if a valid column is provided.
    allowed_sort_cols = ["Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness"]
    sort_by = sort_state.get("sort_by") if sort_state else None
    ascending = sort_state.get("ascending", True) if sort_state else True
    if ascending is None:
        # sort_values() rejects None, so fall back to ascending order.
        ascending = True
    if sort_by in allowed_sort_cols:
        df = df.sort_values(sort_by, ascending=ascending)
    return df
156
+
157
def update_sort_state(current_state, clicked_column):
    """
    Compute the sort state that results from clicking a sort column.

    If the clicked column is already the active sort column, the sort order
    is toggled; otherwise the clicked column becomes active with ascending
    order. The passed-in state is never modified; a new dict is returned.

    Args:
        current_state: Dict with "sort_by" and "ascending" keys, or None.
        clicked_column: Name of the column that was clicked.

    Returns:
        A new state dict holding the updated "sort_by" and "ascending"
        values (any other keys of current_state are carried over).
    """
    if current_state is None:
        return {"sort_by": clicked_column, "ascending": True}

    if current_state.get("sort_by") == clicked_column:
        ascending = not current_state.get("ascending", True)
    else:
        ascending = True
    # Build a copy instead of mutating the caller's dict.
    return {**current_state, "sort_by": clicked_column, "ascending": ascending}
172
+
173
def sort_by_avg(sort_state):
    """Return the sort state after a click on the "Average Score" column."""
    column = "Average Score"
    return update_sort_state(sort_state, column)
175
+
176
def sort_by_conv(sort_state):
    """Return the sort state after a click on the "Conversation Consistency" column."""
    column = "Conversation Consistency"
    return update_sort_state(sort_state, column)
178
+
179
def sort_by_backend(sort_state):
    """Return the sort state after a click on the "Backend Consistency" column."""
    column = "Backend Consistency"
    return update_sort_state(sort_state, column)
181
+
182
def sort_by_policy(sort_state):
    """Return the sort state after a click on the "Policy Completeness" column."""
    column = "Policy Completeness"
    return update_sort_state(sort_state, column)
184
+
185
def get_color_for_value(value, min_val, max_val):
    """
    Compute a color for a given value based on its normalized position.

    The position of value between min_val and max_val is interpolated from
    red (lowest) through yellow (middle) to green (highest). When min_val
    equals max_val the middle color is used. Values outside the range are
    clamped to the nearest end so the result is always a valid color.

    Args:
        value: The number to colorize.
        min_val: Value mapped to red.
        max_val: Value mapped to green.

    Returns:
        A hex color string of the form "#RRGGBB".
    """
    if max_val == min_val:
        norm = 0.5
    else:
        norm = (value - min_val) / (max_val - min_val)
    # Clamp: an out-of-range value would otherwise yield channel values
    # below 0 or above 255 and therefore a malformed hex string.
    norm = min(1.0, max(0.0, norm))

    if norm < 0.5:
        # Lower half: red -> yellow (green channel rises).
        ratio = norm / 0.5
        r = 255
        g = int(255 * ratio)
    else:
        # Upper half: yellow -> green (red channel falls).
        ratio = (norm - 0.5) / 0.5
        r = int(255 * (1 - ratio))
        g = 255
    b = 0
    return f"#{r:02X}{g:02X}{b:02X}"
205
+
206
def generate_html_table(df):
    """
    Generate an HTML table from the DataFrame.

    Every column of df becomes a table column. Cells of the score columns
    ("Average Score", "Conversation Consistency", "Backend Consistency",
    "Policy Completeness") get a text color based on their value relative to
    the column's minimum and maximum. Header and cell text is HTML-escaped so
    that text coming from the result files cannot inject markup.

    Args:
        df: pandas DataFrame to render.

    Returns:
        The table as a single HTML string.
    """
    # Local import: this block is the only user of html.escape.
    from html import escape

    numeric_cols = ["Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness"]
    # Only score columns that are actually present get a color range.
    present = [col for col in numeric_cols if col in df.columns]
    col_min = {col: (df[col].min() if not df.empty else 0) for col in present}
    col_max = {col: (df[col].max() if not df.empty else 0) for col in present}

    parts = ["<table border='1' style='border-collapse: collapse; text-align: center; width: 100%;'>"]

    # Header row
    parts.append("<tr>")
    parts.extend(f"<th style='padding: 8px;'>{escape(str(col))}</th>" for col in df.columns)
    parts.append("</tr>")

    # Data rows
    for _, row in df.iterrows():
        parts.append("<tr>")
        for col in df.columns:
            cell_value = row[col]
            cell_text = escape(str(cell_value))
            if col in col_min:
                color = get_color_for_value(cell_value, col_min[col], col_max[col])
                # The color is applied to the text, not the background.
                parts.append(f"<td style='padding: 8px; color: {color};'>{cell_text}</td>")
            else:
                parts.append(f"<td style='padding: 8px;'>{cell_text}</td>")
        parts.append("</tr>")

    parts.append("</table>")
    return "".join(parts)
239
+
240
def update_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state):
    """
    Render the leaderboard for the given variant selection and sort state.

    Builds the aggregated DataFrame and returns it converted to an HTML table.
    """
    return generate_html_table(
        create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state)
    )
247
+
248
with gr.Blocks(title="Rome Leaderboard") as demo:
    gr.Markdown("# 🏆 Rome Model Evaluation Leaderboard")
    gr.Markdown("""
    This leaderboard displays aggregated model performance across multiple evaluation metrics.

    **Variants:**
    - **mwoz:** Baseline variant.
    - **tau-airline:** Airline specialty variant.
    - **tau-retail:** Retail specialty variant.

    Use the checkboxes below to select which variants to include. At least one variant must be active.
    """)

    # One checkbox per variant, all enabled initially.
    with gr.Row():
        cb_mwoz = gr.Checkbox(label="mwoz", value=True)
        cb_tau_airline = gr.Checkbox(label="tau-airline", value=True)
        cb_tau_retail = gr.Checkbox(label="tau-retail", value=True)

    gr.Markdown("### Sort by (click a button to toggle ascending/descending):")
    with gr.Row():
        btn_avg = gr.Button("Average Score")
        btn_conv = gr.Button("Conversation Consistency")
        btn_backend = gr.Button("Backend Consistency")
        btn_policy = gr.Button("Policy Completeness")

    # Initial sort state: Average Score, descending.
    sort_state = gr.State({"sort_by": "Average Score", "ascending": False})

    leaderboard_display = gr.HTML(label="Aggregated Model Rankings")

    refresh_btn = gr.Button("🔄 Refresh Leaderboard")

    # Every event below re-renders the table with the same wiring.
    _render = dict(
        fn=update_leaderboard,
        inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, sort_state],
        outputs=leaderboard_display,
    )

    # Sort buttons: first update the sort state, then re-render.
    for _button, _handler in (
        (btn_avg, sort_by_avg),
        (btn_conv, sort_by_conv),
        (btn_backend, sort_by_backend),
        (btn_policy, sort_by_policy),
    ):
        _button.click(fn=_handler, inputs=[sort_state], outputs=[sort_state]).then(**_render)

    # Manual refresh.
    refresh_btn.click(**_render)

    # Re-render as soon as any checkbox changes.
    for _checkbox in (cb_mwoz, cb_tau_airline, cb_tau_retail):
        _checkbox.change(**_render)

    # Render once when the app is loaded.
    demo.load(**_render)

if __name__ == "__main__":
    demo.launch()
data/mwoz_leaderboard_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30f233c17f2a9e1068eb5313c1cc1c1e4b622593eb01a40a34b1a95be2824873
3
+ size 3052
data/tau_leaderboard_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dd7553f04d89e492bfec22e2e8f9ab8d7afb269c0b8c027709aedce3ac63aa9
3
+ size 9396
process_submissions.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+ import argparse
5
+
6
def process_submissions(input_file, output_dir='submissions'):
    """
    Process a JSON file containing multiple model submissions and split it into
    individual JSON files in the submissions directory.

    Each submission that is a JSON object with a 'model_name' field is written
    to '<output_dir>/<model_name>.json', with path separators in the model name
    replaced by underscores. Other entries are skipped with a warning. Problems
    reading or processing the input are reported on stdout rather than raised.

    Args:
        input_file (str): Path to the input JSON file
        output_dir (str): Directory where individual submission files will be stored
    """
    # Create the submissions directory, including missing parent directories.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    try:
        # Read the input file
        with open(input_file, 'r', encoding='utf-8') as f:
            submissions = json.load(f)

        if not isinstance(submissions, list):
            print(f"Error: Input file {input_file} must contain a JSON array of submissions")
            return

        written = 0
        for submission in submissions:
            # Entries that are not objects cannot carry a model_name either.
            if not isinstance(submission, dict) or 'model_name' not in submission:
                print("Warning: Skipping submission without model_name field")
                continue

            model_name = str(submission['model_name'])
            # Create a safe filename: neither kind of path separator may survive,
            # otherwise the file could land outside output_dir.
            safe_stem = model_name.replace('/', '_').replace('\\', '_')
            output_path = os.path.join(output_dir, f"{safe_stem}.json")

            # Write individual submission file
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(submission, f, indent=4)
            written += 1

            print(f"Created submission file: {output_path}")

        # Report only the files actually written, not the skipped entries.
        print(f"\nProcessed {written} submissions successfully!")

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found")
    except json.JSONDecodeError:
        print(f"Error: Input file '{input_file}' is not valid JSON")
    except Exception as e:
        print(f"Error processing submissions: {str(e)}")
52
+
53
def main():
    """Command-line entry point: read the arguments and split the submissions file."""
    cli = argparse.ArgumentParser(
        description='Process a JSON file containing model submissions and split into individual files.'
    )
    # Positional: the combined submissions file.
    cli.add_argument(
        'input_file',
        help='Path to the input JSON file containing model submissions',
    )
    # Optional: where the per-model files go.
    cli.add_argument(
        '--output-dir',
        '-o',
        default='submissions',
        help='Directory where individual submission files will be stored (default: submissions)',
    )

    options = cli.parse_args()
    process_submissions(options.input_file, options.output_dir)


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio>=4.0.0
2
+ pandas>=2.0.0
submissions/20250130_140218-4o.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e6ac4738378227537d66f8f231f943581cca65a5a88badbf7b089d8ddd436aa
3
+ size 276
submissions/20250130_140439-4omini.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23d9f222f6ed78bfa0c664d31dad08d0da8782d6db7184469e9b5f1a43838bcd
3
+ size 280
submissions/20250130_145202-gpt35.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:241d41c00306dfe554be2e712ef70befb98cb15287f8f362b64039e63d42ebcc
3
+ size 279
submissions/20250130_183030-claude.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28fd88cfba3985266a31145b0d7f8bbf7523e1486700bb1c0e1b8cdf395ae745
3
+ size 279
submissions/20250130_184905-mistrallarge.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa32abc214592ed42253f9d09e60049fbded16abf25d27bb58aa883551f52916
3
+ size 286
submissions/20250131_010143-o1mini.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72f8224b39acdde5456eb76914c7294153b1c3cb5d8f778b05fd646be89d1d4b
3
+ size 280
submissions/20250131_012338-llama405.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97ad8b202d517c19c62e492a377d58612c8a95870b71ceb8587896a7977415e7
3
+ size 282
submissions/20250131_012449-llama70.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9202529ccd1b01100e97ab1e49ffe9f80adf890a384bb0c9d54dcb50129044b
3
+ size 281
submissions/20250131_013711-qwen72b.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5785dea65c6e83d70b8eaae6143aa4d8755c3dfda8fd256d971a67242b444a7
3
+ size 281
submissions/20250131_152226-tau-4o-mini-airline.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54c876c671383e333599122f0789947b2906fbce489d26feb2fb44df6ca61bb9
3
+ size 430
submissions/20250131_152338-tau-4o-mini-retail.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccbdb2fa53caa6d08641632a10588c610355a1cd238527b1ab96e40d5314a78c
3
+ size 429
submissions/20250131_152422-tau-4o-retail.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:caffe3b74905c9eb0953b3b7db2ca1caa2c9dbeb67aa9cb56f47c8102df82c73
3
+ size 419
submissions/20250131_152503-tau-4o-airline.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef952c41710fe34a4eef5f23f504e619bc4565a3eaea9ac587b4a2ea2ea3b660
3
+ size 420
submissions/20250131_152610-tau-gpt35-retail.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d88dffc671daefb67e80fdfed3b7b71a7adc3e7975754fd6a435a5783e6435d
3
+ size 434
submissions/20250131_152708-tau-gpt35-airline.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89b7e7b87a74bcfe903d7455c718569fab7f0f593fda15b00ceb40999b101e0e
3
+ size 435
submissions/20250131_152807-tau-sonnet-retail.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:946a9b068da07c43972b41281338689f17caf9a454e6aaefab0dbc199bf434d5
3
+ size 443
submissions/20250202_112945-qwen72b-airline.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4c42e48a557b297a545d9ab5065736de9d3a843ca196625e3c18574f3925c56
3
+ size 428
submissions/20250202_140527-qwen72b-retail.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bfbdf41eb3707e9d471ea6e8b0df66e0c30a876e64de13bfac05b19107dafa5
3
+ size 428
submissions/20250204_144222-tau-llama-405b-airline.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d14f9ee6572bf5c99fa4524eeba1eb8e8eb3286a1ac36c63c3618014d6084ac
3
+ size 442
submissions/20250205_024823-tau-mistrallarge-airline.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52aa75b36bbfc50d5e49ad5d0333a71248d833969db0050f1d1b2f71688e9a1c
3
+ size 435
submissions/20250205_030422-tau-sonnet-airline.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69df986398a9874b7ae42ea16c74d2537012d5c94c146cc7316be3d39929dd6f
3
+ size 434
submissions/20250205_033820-tau-llama405b-retail.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55533012c64e2aa9a09eeec6b5d8b1fb898e2d9e772cdac28c39de4c8167d158
3
+ size 431
submissions/20250205_044403-tau-mistrallarge-retail.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:147f1a3cd17768bb01b54f4568348054af1ea72363f04f5edfa50773dcda3f80
3
+ size 433
submissions/20250208_024344-tau-llama70b-airline.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1b87a4e1981c7b8cfb364cc46ae4a16f4f85f9631d9c02e85008655c8b103e4
3
+ size 426
submissions/20250208_030407-tau-llama70b-retail.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe0c10dbfe0d56a5cd65267a448058aabd8a0e82fdedaf3748f7ffa3d7b7d12e
3
+ size 424
submissions/20250214_142736-tau-o1-mini-retail.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56519aef51f3d681df9e8f29c792af3f315cb67116c012bc6742a2977793c465
3
+ size 410
submissions/20250214_180731-tau-o1-mini-airline.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:165618025471980ee895ab8a12c04f4ca94eb0df19772c8057de2d415760f684
3
+ size 410
submissions/20250214_193236-o1.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:628356d0e3f25fcb909e50ff54c1a54b0b25ff7d2c8190ae1c87713edd90d238
3
+ size 276
submissions/20250215_115156-tau-o1-airline.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9db8211e9fa6bdb0b97c7341216dcce133cd815e31d0e209a5a5dd452939ffab
3
+ size 426
submissions/20250215_121147-tau-o1-retail.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c3e438a5019e96efb58d7c17f9b82b6de54da087cb4960445326f8f858a69ab
3
+ size 423