juancauma committed on
Commit
349b4c0
·
1 Parent(s): ec8b459

visual changes

Browse files
Files changed (2) hide show
  1. app.py +212 -142
  2. styles.css +107 -16
app.py CHANGED
@@ -14,60 +14,19 @@ def strip_timestamp(name):
14
 
15
  # Static grouping mapping for the 10 general submissions.
16
  GROUPS = [
17
- {
18
- "mwoz": "20250214_193236-o1",
19
- "tau_airline": "20250215_115156-tau-o1-airline",
20
- "tau_retail": "20250215_121147-tau-o1-retail"
21
- },
22
- {
23
- "mwoz": "20250131_012338-llama405",
24
- "tau_airline": "20250204_144222-tau-llama-405b-airline",
25
- "tau_retail": "20250205_033820-tau-llama405b-retail"
26
- },
27
- {
28
- "mwoz": "20250130_140218-4o",
29
- "tau_airline": "20250131_152503-tau-4o-airline",
30
- "tau_retail": "20250131_152422-tau-4o-retail"
31
- },
32
- {
33
- "mwoz": "20250130_183030-claude",
34
- "tau_airline": "20250205_030422-tau-sonnet-airline",
35
- "tau_retail": "20250131_152807-tau-sonnet-retail"
36
- },
37
- {
38
- "mwoz": "20250131_012449-llama70",
39
- "tau_airline": "20250208_024344-tau-llama70b-airline",
40
- "tau_retail": "20250208_030407-tau-llama70b-retail"
41
- },
42
- {
43
- "mwoz": "20250131_013711-qwen72b",
44
- "tau_airline": "20250202_112945-qwen72b-airline",
45
- "tau_retail": "20250202_140527-qwen72b-retail"
46
- },
47
- {
48
- "mwoz": "20250130_184905-mistrallarge",
49
- "tau_airline": "20250205_024823-tau-mistrallarge-airline",
50
- "tau_retail": "20250205_044403-tau-mistrallarge-retail"
51
- },
52
- {
53
- "mwoz": "20250131_010143-o1mini",
54
- "tau_airline": "20250214_180731-tau-o1-mini-airline",
55
- "tau_retail": "20250214_142736-tau-o1-mini-retail"
56
- },
57
- {
58
- "mwoz": "20250130_140439-4omini",
59
- "tau_airline": "20250131_152226-tau-4o-mini-airline",
60
- "tau_retail": "20250131_152338-tau-4o-mini-retail"
61
- },
62
- {
63
- "mwoz": "20250130_145202-gpt35",
64
- "tau_airline": "20250131_152708-tau-gpt35-airline",
65
- "tau_retail": "20250131_152610-tau-gpt35-retail"
66
- }
67
  ]
68
 
69
  def load_mwoz_results():
70
- """Load mwoz results from data/mwoz_leaderboard_results.json."""
71
  path = os.path.join("data", "mwoz_leaderboard_results.json")
72
  if not os.path.exists(path):
73
  return []
@@ -75,28 +34,21 @@ def load_mwoz_results():
75
  return json.load(f)
76
 
77
  def load_tau_results():
78
- """Load tau results from data/tau_leaderboard_results.json."""
79
  path = os.path.join("data", "tau_leaderboard_results.json")
80
  if not os.path.exists(path):
81
  return []
82
  with open(path, "r") as f:
83
  return json.load(f)
84
 
85
- def create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state):
86
- """
87
- Create the aggregated leaderboard DataFrame.
88
- Aggregates metrics based on the selected variants, computes dynamic rank based solely on "Average Score",
89
- and then sorts the DataFrame according to the current sort state.
90
- """
91
- # Ensure at least one variant is active.
92
  if not (selected_mwoz or selected_tau_airline or selected_tau_retail):
93
  selected_mwoz = True
94
 
95
  mwoz_data = load_mwoz_results()
96
  tau_data = load_tau_results()
97
- mwoz_lookup = { entry["model_name"]: entry for entry in mwoz_data }
98
- tau_lookup = { entry["model_name"]: entry for entry in tau_data }
99
-
100
  aggregated = []
101
  for group in GROUPS:
102
  metrics = {"avg_conv_consistency": 0, "avg_backend_consistency": 0, "avg_policy_completeness": 0}
@@ -140,49 +92,97 @@ def create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau
140
  overall_avg = (avg_conv + avg_backend + avg_policy) / 3
141
  else:
142
  avg_conv = avg_backend = avg_policy = overall_avg = 0
 
 
143
 
 
 
 
 
144
  aggregated.append({
145
- "Model": " / ".join(title_parts),
146
  "Average Score": round(overall_avg, 4),
147
  "Conversation Consistency": round(avg_conv, 4),
148
  "Backend Consistency": round(avg_backend, 4),
149
  "Policy Completeness": round(avg_policy, 4),
150
  "Judge Model": judge_model
151
  })
152
-
153
  df = pd.DataFrame(aggregated)
154
- # Compute dynamic Rank solely based on "Average Score" (higher = better; rank 1 is highest)
155
- df["Rank"] = df["Average Score"].rank(ascending=False, method="min").astype(int)
156
 
157
- # Sort according to sort_state (allowed columns: numeric ones)
 
 
 
 
 
158
  allowed_sort_cols = ["Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness"]
159
- sort_by = sort_state.get("sort_by") if sort_state else None
160
- ascending = sort_state.get("ascending") if sort_state else True
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  if sort_by in allowed_sort_cols:
162
  df = df.sort_values(sort_by, ascending=ascending)
163
-
164
- # Reorder columns to have "Rank" as the first column.
 
 
165
  cols = df.columns.tolist()
166
  if "Rank" in cols:
167
  cols.insert(0, cols.pop(cols.index("Rank")))
168
  df = df[cols]
169
-
170
  return df
171
 
172
  def update_sort_state(current_state, clicked_column):
173
  """
174
- Update the sort state based on the clicked column.
175
- If the same column is clicked, toggle its sort order; otherwise, set the new column with ascending order.
176
  """
177
- if current_state is None:
178
- current_state = {"sort_by": clicked_column, "ascending": True}
179
- else:
180
- if current_state.get("sort_by") == clicked_column:
181
- current_state["ascending"] = not current_state.get("ascending", True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  else:
183
- current_state["sort_by"] = clicked_column
184
- current_state["ascending"] = True
185
- return current_state
 
 
 
 
 
186
 
187
  def sort_by_avg(sort_state):
188
  return update_sort_state(sort_state, "Average Score")
@@ -197,10 +197,6 @@ def sort_by_policy(sort_state):
197
  return update_sort_state(sort_state, "Policy Completeness")
198
 
199
  def get_color_for_value(value, min_val, max_val):
200
- """
201
- Compute a color for a given value based on its normalized position.
202
- Interpolates from red (lowest) to yellow (mid) to green (highest).
203
- """
204
  if max_val == min_val:
205
  norm = 0.5
206
  else:
@@ -218,10 +214,9 @@ def get_color_for_value(value, min_val, max_val):
218
  return f"#{r:02X}{g:02X}{b:02X}"
219
 
220
  def generate_html_table(df):
221
- """
222
- Generate an HTML table from the DataFrame.
223
- For each numeric column (except Rank), apply a text color based on its relative value.
224
- """
225
  numeric_cols = ["Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness"]
226
  col_min = {}
227
  col_max = {}
@@ -229,104 +224,179 @@ def generate_html_table(df):
229
  col_min[col] = df[col].min() if not df.empty else 0
230
  col_max[col] = df[col].max() if not df.empty else 0
231
 
232
- html = "<table border='1' style='border-collapse: collapse; text-align: center; width: 100%;'>"
 
 
233
  # Header row
234
  html += "<tr>"
235
  for col in df.columns:
236
- html += f"<th style='padding: 8px;'>{col}</th>"
237
  html += "</tr>"
238
-
239
- # Data rows
240
  for _, row in df.iterrows():
241
- html += "<tr>"
242
  for col in df.columns:
243
  cell_value = row[col]
244
- if col in numeric_cols: # Color numeric columns (except Rank)
245
  color = get_color_for_value(cell_value, col_min[col], col_max[col])
246
- html += f"<td style='padding: 8px; color: {color};'>{cell_value}</td>"
247
  else:
248
- html += f"<td style='padding: 8px;'>{cell_value}</td>"
249
  html += "</tr>"
250
  html += "</table>"
 
251
  return html
252
 
253
- def update_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state):
254
  """
255
- Update the leaderboard by creating the aggregated DataFrame (with dynamic rank)
256
- and converting it into an HTML table.
257
  """
258
- df = create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state)
259
- html_table = generate_html_table(df)
260
- return html_table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
  with gr.Blocks(css=custom_css, title="TD-EVAL Leaderboard") as demo:
263
  gr.Markdown("# 🏆 TD-EVAL Model Evaluation Leaderboard")
264
- gr.Markdown("""
265
- This leaderboard displays aggregated model performance across multiple evaluation metrics.
266
 
267
- **Variants:**
268
- - **mwoz:** Baseline variant.
269
- - **tau-airline:** Airline specialty variant.
270
- - **tau-retail:** Retail specialty variant.
271
-
272
- Use the checkboxes below to select which variants to include. At least one variant must be active.
273
- """)
 
 
274
 
275
  with gr.Row():
276
  cb_mwoz = gr.Checkbox(label="mwoz", value=True)
277
  cb_tau_airline = gr.Checkbox(label="tau-airline", value=True)
278
  cb_tau_retail = gr.Checkbox(label="tau-retail", value=True)
279
 
280
- gr.Markdown("### Sort by (click a button to toggle ascending/descending):")
 
 
 
 
 
 
281
  with gr.Row():
282
- btn_avg = gr.Button("Average Score")
283
  btn_conv = gr.Button("Conversation Consistency")
284
  btn_backend = gr.Button("Backend Consistency")
285
  btn_policy = gr.Button("Policy Completeness")
286
-
287
- # Initialize sort state: default sort by Average Score descending.
288
- sort_state = gr.State({"sort_by": "Average Score", "ascending": False})
289
-
290
  leaderboard_display = gr.HTML(label="Aggregated Model Rankings")
291
 
292
- refresh_btn = gr.Button("🔄 Refresh Leaderboard")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
- # Sort button events.
295
- btn_avg.click(fn=sort_by_avg, inputs=[sort_state], outputs=[sort_state]).then(
 
 
 
 
296
  fn=update_leaderboard,
297
- inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, sort_state],
298
  outputs=leaderboard_display
299
  )
300
- btn_conv.click(fn=sort_by_conv, inputs=[sort_state], outputs=[sort_state]).then(
301
- fn=update_leaderboard,
302
- inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, sort_state],
303
- outputs=leaderboard_display
304
- )
305
- btn_backend.click(fn=sort_by_backend, inputs=[sort_state], outputs=[sort_state]).then(
306
  fn=update_leaderboard,
307
- inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, sort_state],
308
  outputs=leaderboard_display
309
  )
310
- btn_policy.click(fn=sort_by_policy, inputs=[sort_state], outputs=[sort_state]).then(
 
 
 
 
 
311
  fn=update_leaderboard,
312
- inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, sort_state],
313
  outputs=leaderboard_display
314
  )
315
 
316
- # Refresh button event.
317
- refresh_btn.click(
 
 
 
318
  fn=update_leaderboard,
319
- inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, sort_state],
320
  outputs=leaderboard_display
321
  )
322
 
323
- # Update leaderboard immediately when any checkbox changes.
324
- cb_mwoz.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, sort_state], outputs=leaderboard_display)
325
- cb_tau_airline.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, sort_state], outputs=leaderboard_display)
326
- cb_tau_retail.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, sort_state], outputs=leaderboard_display)
327
-
328
- # Load initial leaderboard on app start.
329
- demo.load(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, sort_state], outputs=leaderboard_display)
330
 
331
  if __name__ == "__main__":
332
  demo.launch()
 
14
 
15
  # Static grouping mapping for the 10 general submissions.
16
  GROUPS = [
17
+ {"mwoz": "20250214_193236-o1", "tau_airline": "20250215_115156-tau-o1-airline", "tau_retail": "20250215_121147-tau-o1-retail"},
18
+ {"mwoz": "20250131_012338-llama405", "tau_airline": "20250204_144222-tau-llama-405b-airline", "tau_retail": "20250205_033820-tau-llama405b-retail"},
19
+ {"mwoz": "20250130_140218-4o", "tau_airline": "20250131_152503-tau-4o-airline", "tau_retail": "20250131_152422-tau-4o-retail"},
20
+ {"mwoz": "20250130_183030-claude", "tau_airline": "20250205_030422-tau-sonnet-airline", "tau_retail": "20250131_152807-tau-sonnet-retail"},
21
+ {"mwoz": "20250131_012449-llama70", "tau_airline": "20250208_024344-tau-llama70b-airline", "tau_retail": "20250208_030407-tau-llama70b-retail"},
22
+ {"mwoz": "20250131_013711-qwen72b", "tau_airline": "20250202_112945-qwen72b-airline", "tau_retail": "20250202_140527-qwen72b-retail"},
23
+ {"mwoz": "20250130_184905-mistrallarge", "tau_airline": "20250205_024823-tau-mistrallarge-airline", "tau_retail": "20250205_044403-tau-mistrallarge-retail"},
24
+ {"mwoz": "20250131_010143-o1mini", "tau_airline": "20250214_180731-tau-o1-mini-airline", "tau_retail": "20250214_142736-tau-o1-mini-retail"},
25
+ {"mwoz": "20250130_140439-4omini", "tau_airline": "20250131_152226-tau-4o-mini-airline", "tau_retail": "20250131_152338-tau-4o-mini-retail"},
26
+ {"mwoz": "20250130_145202-gpt35", "tau_airline": "20250131_152708-tau-gpt35-airline", "tau_retail": "20250131_152610-tau-gpt35-retail"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  ]
28
 
29
  def load_mwoz_results():
 
30
  path = os.path.join("data", "mwoz_leaderboard_results.json")
31
  if not os.path.exists(path):
32
  return []
 
34
  return json.load(f)
35
 
36
  def load_tau_results():
 
37
  path = os.path.join("data", "tau_leaderboard_results.json")
38
  if not os.path.exists(path):
39
  return []
40
  with open(path, "r") as f:
41
  return json.load(f)
42
 
43
+ def create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query=""):
 
 
 
 
 
 
44
  if not (selected_mwoz or selected_tau_airline or selected_tau_retail):
45
  selected_mwoz = True
46
 
47
  mwoz_data = load_mwoz_results()
48
  tau_data = load_tau_results()
49
+ mwoz_lookup = {entry["model_name"]: entry for entry in mwoz_data}
50
+ tau_lookup = {entry["model_name"]: entry for entry in tau_data}
51
+
52
  aggregated = []
53
  for group in GROUPS:
54
  metrics = {"avg_conv_consistency": 0, "avg_backend_consistency": 0, "avg_policy_completeness": 0}
 
92
  overall_avg = (avg_conv + avg_backend + avg_policy) / 3
93
  else:
94
  avg_conv = avg_backend = avg_policy = overall_avg = 0
95
+
96
+ model_name = " / ".join(title_parts)
97
 
98
+ # Apply search filter
99
+ if search_query and search_query.lower() not in model_name.lower():
100
+ continue
101
+
102
  aggregated.append({
103
+ "Model": model_name,
104
  "Average Score": round(overall_avg, 4),
105
  "Conversation Consistency": round(avg_conv, 4),
106
  "Backend Consistency": round(avg_backend, 4),
107
  "Policy Completeness": round(avg_policy, 4),
108
  "Judge Model": judge_model
109
  })
110
+
111
  df = pd.DataFrame(aggregated)
 
 
112
 
113
+ # If no results found after filtering
114
+ if df.empty:
115
+ return df
116
+
117
+ df["Rank"] = df["Average Score"].rank(ascending=False, method="min").astype(int)
118
+
119
  allowed_sort_cols = ["Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness"]
120
+
121
+ # Handle sort_state safely
122
+ if isinstance(sort_state, str):
123
+ try:
124
+ sort_state = json.loads(sort_state)
125
+ except:
126
+ sort_state = {"sort_by": "Average Score", "ascending": False}
127
+
128
+ # Ensure sort_state is a dict
129
+ if not isinstance(sort_state, dict):
130
+ sort_state = {"sort_by": "Average Score", "ascending": False}
131
+
132
+ sort_by = sort_state.get("sort_by", "Average Score")
133
+ ascending = sort_state.get("ascending", False)
134
+
135
  if sort_by in allowed_sort_cols:
136
  df = df.sort_values(sort_by, ascending=ascending)
137
+ else:
138
+ # Default sort if column not found
139
+ df = df.sort_values("Average Score", ascending=False)
140
+
141
  cols = df.columns.tolist()
142
  if "Rank" in cols:
143
  cols.insert(0, cols.pop(cols.index("Rank")))
144
  df = df[cols]
145
+
146
  return df
147
 
148
  def update_sort_state(current_state, clicked_column):
149
  """
150
+ Update the sorting state based on the clicked column.
151
+ Handles various input formats for current_state.
152
  """
153
+ # Default state if nothing valid is provided
154
+ new_state = {"sort_by": clicked_column, "ascending": False}
155
+
156
+ # Handle the case when current_state is a string (JSON)
157
+ if isinstance(current_state, str):
158
+ try:
159
+ current_state = json.loads(current_state)
160
+ except (json.JSONDecodeError, TypeError):
161
+ # If we can't parse it, return the default state
162
+ return new_state
163
+
164
+ # If current_state is None or not a dict, return default
165
+ if not isinstance(current_state, dict):
166
+ return new_state
167
+
168
+ # Now we're sure current_state is a dict
169
+ # Check if it has the needed keys
170
+ if "sort_by" in current_state:
171
+ if current_state["sort_by"] == clicked_column:
172
+ # Toggle direction for the same column
173
+ return {
174
+ "sort_by": clicked_column,
175
+ "ascending": not current_state.get("ascending", False)
176
+ }
177
  else:
178
+ # New column, default to descending (false)
179
+ return {
180
+ "sort_by": clicked_column,
181
+ "ascending": False
182
+ }
183
+
184
+ # If we got here, current_state doesn't have the right format
185
+ return new_state
186
 
187
  def sort_by_avg(sort_state):
188
  return update_sort_state(sort_state, "Average Score")
 
197
  return update_sort_state(sort_state, "Policy Completeness")
198
 
199
  def get_color_for_value(value, min_val, max_val):
 
 
 
 
200
  if max_val == min_val:
201
  norm = 0.5
202
  else:
 
214
  return f"#{r:02X}{g:02X}{b:02X}"
215
 
216
  def generate_html_table(df):
217
+ if df.empty:
218
+ return "<div class='no-results'>No matching results found.</div>"
219
+
 
220
  numeric_cols = ["Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness"]
221
  col_min = {}
222
  col_max = {}
 
224
  col_min[col] = df[col].min() if not df.empty else 0
225
  col_max[col] = df[col].max() if not df.empty else 0
226
 
227
+ # Build a simple HTML table without borders or JavaScript sorting
228
+ html = "<table style='border: none; border-collapse: collapse;'>"
229
+
230
  # Header row
231
  html += "<tr>"
232
  for col in df.columns:
233
+ html += f"<th style='padding:8px; border: none;'>{col}</th>"
234
  html += "</tr>"
235
+
236
+ # Table rows
237
  for _, row in df.iterrows():
238
+ html += "<tr style='border: none;'>"
239
  for col in df.columns:
240
  cell_value = row[col]
241
+ if col in numeric_cols:
242
  color = get_color_for_value(cell_value, col_min[col], col_max[col])
243
+ html += f"<td style='padding: 8px; border: none; color: {color};'>{cell_value}</td>"
244
  else:
245
+ html += f"<td style='padding: 8px; border: none;'>{cell_value}</td>"
246
  html += "</tr>"
247
  html += "</table>"
248
+
249
  return html
250
 
251
def update_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query=""):
    """
    Render the leaderboard as an HTML fragment.

    Args:
        selected_mwoz, selected_tau_airline, selected_tau_retail: variant
            flags forwarded unchanged to ``create_grouped_leaderboard``.
        sort_state: a dict with ``"sort_by"`` / ``"ascending"`` keys, or a JSON
            string encoding one. Anything else falls back to sorting by
            "Average Score" descending.
        search_query: model-name filter forwarded to
            ``create_grouped_leaderboard``.

    Returns:
        An HTML string: a "Sorted by" banner followed by the table produced by
        ``generate_html_table``. If building the table raises, the error is
        printed and a default-sorted table is returned under an error banner.
    """
    default_sort = {"sort_by": "Average Score", "ascending": False}
    try:
        # Convert sort_state to a dict if it arrived as a JSON string.
        if isinstance(sort_state, str):
            try:
                sort_state = json.loads(sort_state)
            except ValueError:
                # json.JSONDecodeError is a ValueError; a bare ``except`` here
                # would also have swallowed KeyboardInterrupt/SystemExit.
                sort_state = dict(default_sort)

        # Ensure sort_state is a dict (valid JSON may still decode to a list, etc.).
        if not isinstance(sort_state, dict):
            sort_state = dict(default_sort)

        df = create_grouped_leaderboard(
            selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query
        )
        html_table = generate_html_table(df)

        # Banner text, with the same fallbacks the sorting itself uses.
        sort_col = sort_state.get("sort_by", "Average Score")
        sort_dir = "▼" if not sort_state.get("ascending", False) else "▲"

        html_output = f"""
        <div class="sort-info">
            <p>Sorted by: {sort_col} {sort_dir}</p>
        </div>
        {html_table}
        """

        return html_output

    except Exception as e:
        # Last-resort fallback: log the error and render with the default sort.
        print(f"Error in update_leaderboard: {str(e)}")
        # Keep the active search filter; dropping it would silently show
        # unfiltered results while the search box still holds a query.
        df = create_grouped_leaderboard(
            selected_mwoz, selected_tau_airline, selected_tau_retail,
            dict(default_sort), search_query
        )
        html_table = generate_html_table(df)

        return f"""
        <div class="sort-info" style="color: #ff6b6b;">
            <p>Error in sorting. Using default sort: Average Score (descending)</p>
        </div>
        {html_table}
        """
298
 
299
  with gr.Blocks(css=custom_css, title="TD-EVAL Leaderboard") as demo:
300
  gr.Markdown("# 🏆 TD-EVAL Model Evaluation Leaderboard")
301
+ gr.HTML('<div class="subtitle">This leaderboard displays aggregated model performance across multiple evaluation metrics.</div>')
 
302
 
303
+ gr.Markdown('''
304
+ <div class="variants_container">
305
+ <div class="variants_title">Variants:</div>
306
+ - mwoz: Baseline variant.<br>
307
+ - tau-airline: Airline specialty variant.<br>
308
+ - tau-retail: Retail specialty variant.<br><br>
309
+ Use the checkboxes below to select which variants to include. At least one variant must be active.
310
+ </div>
311
+ ''')
312
 
313
  with gr.Row():
314
  cb_mwoz = gr.Checkbox(label="mwoz", value=True)
315
  cb_tau_airline = gr.Checkbox(label="tau-airline", value=True)
316
  cb_tau_retail = gr.Checkbox(label="tau-retail", value=True)
317
 
318
+ # Add search bar
319
+ search_input = gr.Textbox(label="Search models", placeholder="Type to filter by model name...")
320
+
321
+ hidden_sort_state = gr.State(value={"sort_by": "Average Score", "ascending": False})
322
+
323
+ # Add sorting buttons
324
+ gr.Markdown("### Sort by:")
325
  with gr.Row():
326
+ btn_avg = gr.Button("Average Score")
327
  btn_conv = gr.Button("Conversation Consistency")
328
  btn_backend = gr.Button("Backend Consistency")
329
  btn_policy = gr.Button("Policy Completeness")
330
+
 
 
 
331
  leaderboard_display = gr.HTML(label="Aggregated Model Rankings")
332
 
333
+ # Function to toggle sort state and update button labels
334
def toggle_sort(column, current_state, btn_avg, btn_conv, btn_backend, btn_policy):
    """
    Return the new sort state plus the four sort-button labels.

    A click on the column that is already the sort key flips the direction;
    any other click (or a non-dict ``current_state``) selects a descending
    sort. The button whose name equals ``column`` gets a direction arrow
    appended; the others get their plain names.

    The ``btn_*`` parameters are accepted but never read by this function.

    Returns:
        A 5-tuple ``(new_state, avg_label, conv_label, backend_label,
        policy_label)``.
    """
    already_active = isinstance(current_state, dict) and current_state.get("sort_by") == column
    new_ascending = (not current_state.get("ascending", False)) if already_active else False
    new_state = {"sort_by": column, "ascending": new_ascending}

    arrow = "▲" if new_ascending else "▼"
    button_names = (
        "Average Score",
        "Conversation Consistency",
        "Backend Consistency",
        "Policy Completeness",
    )
    # Only the active column's button carries the direction arrow.
    labels = [f"{name} {arrow}" if name == column else name for name in button_names]

    return (new_state, *labels)
351
 
352
+ # Connect sort buttons with the toggle function
353
+ btn_avg.click(
354
+ fn=toggle_sort,
355
+ inputs=[gr.Textbox(value="Average Score", visible=False), hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy],
356
+ outputs=[hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy]
357
+ ).then(
358
  fn=update_leaderboard,
359
+ inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input],
360
  outputs=leaderboard_display
361
  )
362
+
363
+ btn_conv.click(
364
+ fn=toggle_sort,
365
+ inputs=[gr.Textbox(value="Conversation Consistency", visible=False), hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy],
366
+ outputs=[hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy]
367
+ ).then(
368
  fn=update_leaderboard,
369
+ inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input],
370
  outputs=leaderboard_display
371
  )
372
+
373
+ btn_backend.click(
374
+ fn=toggle_sort,
375
+ inputs=[gr.Textbox(value="Backend Consistency", visible=False), hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy],
376
+ outputs=[hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy]
377
+ ).then(
378
  fn=update_leaderboard,
379
+ inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input],
380
  outputs=leaderboard_display
381
  )
382
 
383
+ btn_policy.click(
384
+ fn=toggle_sort,
385
+ inputs=[gr.Textbox(value="Policy Completeness", visible=False), hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy],
386
+ outputs=[hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy]
387
+ ).then(
388
  fn=update_leaderboard,
389
+ inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input],
390
  outputs=leaderboard_display
391
  )
392
 
393
+ # Connect dataflow for variant checkboxes and search
394
+ cb_mwoz.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
395
+ cb_tau_airline.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
396
+ cb_tau_retail.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
397
+ search_input.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
398
+
399
+ demo.load(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
400
 
401
  if __name__ == "__main__":
402
  demo.launch()
styles.css CHANGED
@@ -1,28 +1,108 @@
1
  /* General body style */
2
  body {
3
  font-family: Arial, sans-serif;
4
- background-color: #2b2b2b;
5
  margin: 20px;
6
  }
7
 
8
  /* Title styling for Markdown headers */
9
- h1, h2, h3 {
10
  color: #cacaca;
 
 
 
 
 
 
 
 
 
 
 
11
  }
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  /* Button styling */
14
  button {
15
- background-color: #007BFF;
16
- color: #fff;
17
  border: none;
18
  padding: 8px 12px;
19
  border-radius: 4px;
20
  cursor: pointer;
21
  font-size: 14px;
 
22
  }
23
 
24
  button:hover {
25
- background-color: #0056b3;
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  }
27
 
28
  /* Styling for checkboxes and labels */
@@ -30,20 +110,31 @@ button:hover {
30
  margin-right: 10px;
31
  }
32
 
33
- /* Table styling */
34
- table {
 
 
 
 
 
35
  width: 100%;
36
- border-collapse: collapse;
37
- margin-top: 20px;
38
  }
39
 
40
- th, td {
41
- padding: 8px;
42
- text-align: center;
43
- border: 1px solid #ddd;
44
  }
45
 
46
- th {
47
- background-color: #f2f2f2;
48
- font-weight: bold;
 
 
 
 
 
 
49
  }
 
1
  /* General body style */
2
  body {
3
  font-family: Arial, sans-serif;
4
+ background-color: #000000;
5
  margin: 20px;
6
  }
7
 
8
  /* Title styling for Markdown headers */
9
+ h1, h2, h3, .subtitle, .variants_container {
10
  color: #cacaca;
11
+ display: flex;
12
+ text-align: center;
13
+ justify-content: center;
14
+
15
+ }
16
+
17
+ h1 {
18
+ font-size: 2.125rem;
19
+ font-weight: 700;
20
+ margin-top: 2rem;
21
+
22
  }
23
 
24
+ .variants_container {
25
+ margin: 50px auto;
26
+ border-radius: 10px;
27
+ display: flex;
28
+ flex-direction: column;
29
+ justify-content: center;
30
+ padding: 15px;
31
+ width: fit-content;
32
+ background-color: #27272A;
33
+ }
34
+
35
+ .variants_title {
36
+ font-size: 20px;
37
+ font-weight: 500;
38
+ }
39
+
40
+
41
+ /* table */
42
+ table {
43
+ width: 100%;
44
+ border-collapse: collapse;
45
+ border-spacing: 0;
46
+ border: none;
47
+ margin-top: 20px;
48
+ overflow: hidden;
49
+ border-radius: 10px;
50
+ }
51
+
52
+ table th {
53
+ background-color: #27272A;
54
+ font-weight: bold;
55
+ font-size: 18px;
56
+ border: none;
57
+ }
58
+
59
+ /* Alternate row colors for table rows excluding the header */
60
+ table tr:not(:first-child):nth-child(odd) {
61
+ background-color: #1a1a1a;
62
+ }
63
+
64
+ table tr:not(:first-child):nth-child(even) {
65
+ background-color: #141414;
66
+ }
67
+
68
+ table tr {
69
+ border: none;
70
+ }
71
+
72
+ th, td {
73
+ padding: 8px;
74
+ text-align: center;
75
+ border: none;
76
+ }
77
+
78
+
79
  /* Button styling */
80
  button {
81
+ background-color: #ffbe93;
82
+ color: black;
83
  border: none;
84
  padding: 8px 12px;
85
  border-radius: 4px;
86
  cursor: pointer;
87
  font-size: 14px;
88
+ transition: all 0.3s ease;
89
  }
90
 
91
  button:hover {
92
+ background-color: #ff9955;
93
+ transform: translateY(-2px);
94
+ box-shadow: 0 4px 8px rgba(0,0,0,0.2);
95
+ }
96
+
97
+ /* Sort status banner ("Sorted by: ...") */
98
+ .sort-info {
99
+ color: #cacaca;
100
+ text-align: center;
101
+ margin: 10px 0;
102
+ padding: 5px;
103
+ background-color: #27272A;
104
+ border-radius: 5px;
105
+ font-size: 16px;
106
  }
107
 
108
  /* Styling for checkboxes and labels */
 
110
  margin-right: 10px;
111
  }
112
 
113
+ /* Search input styling */
114
+ input[type="text"] {
115
+ background-color: #1a1a1a;
116
+ color: #cacaca;
117
+ border: 1px solid #3a3a3a;
118
+ border-radius: 5px;
119
+ padding: 10px;
120
  width: 100%;
121
+ margin-bottom: 15px;
122
+ font-size: 16px;
123
  }
124
 
125
+ input[type="text"]:focus {
126
+ border-color: #ffa162;
127
+ outline: none;
128
+ box-shadow: 0 0 5px rgba(196, 193, 39, 0.5);
129
  }
130
 
131
+ /* No results message */
132
+ .no-results {
133
+ color: #cacaca;
134
+ text-align: center;
135
+ padding: 30px;
136
+ background-color: #1a1a1a;
137
+ border-radius: 10px;
138
+ font-size: 18px;
139
+ margin-top: 20px;
140
  }