juancauma committed on
Commit
f17ba29
·
1 Parent(s): faf9069

changes to font color

Browse files
Files changed (2) hide show
  1. app.py +305 -114
  2. styles.css +246 -58
app.py CHANGED
@@ -14,16 +14,16 @@ def strip_timestamp(name):
14
 
15
  # Static grouping mapping for the 10 general submissions.
16
  GROUPS = [
17
- {"mwoz": "20250214_193236-o1", "tau_airline": "20250215_115156-tau-o1-airline", "tau_retail": "20250215_121147-tau-o1-retail"},
18
- {"mwoz": "20250131_012338-llama405", "tau_airline": "20250204_144222-tau-llama-405b-airline","tau_retail": "20250205_033820-tau-llama405b-retail"},
19
- {"mwoz": "20250130_140218-4o", "tau_airline": "20250131_152503-tau-4o-airline", "tau_retail": "20250131_152422-tau-4o-retail"},
20
- {"mwoz": "20250130_183030-claude", "tau_airline": "20250205_030422-tau-sonnet-airline", "tau_retail": "20250131_152807-tau-sonnet-retail"},
21
- {"mwoz": "20250131_012449-llama70", "tau_airline": "20250208_024344-tau-llama70b-airline","tau_retail": "20250208_030407-tau-llama70b-retail"},
22
- {"mwoz": "20250131_013711-qwen72b", "tau_airline": "20250202_112945-qwen72b-airline", "tau_retail": "20250202_140527-qwen72b-retail"},
23
- {"mwoz": "20250130_184905-mistrallarge","tau_airline": "20250205_024823-tau-mistrallarge-airline","tau_retail": "20250205_044403-tau-mistrallarge-retail"},
24
- {"mwoz": "20250131_010143-o1mini", "tau_airline": "20250214_180731-tau-o1-mini-airline", "tau_retail": "20250214_142736-tau-o1-mini-retail"},
25
- {"mwoz": "20250130_140439-4omini", "tau_airline": "20250131_152226-tau-4o-mini-airline", "tau_retail": "20250131_152338-tau-4o-mini-retail"},
26
- {"mwoz": "20250130_145202-gpt35", "tau_airline": "20250131_152708-tau-gpt35-airline", "tau_retail": "20250131_152610-tau-gpt35-retail"}
27
  ]
28
 
29
  def load_mwoz_results():
@@ -41,177 +41,368 @@ def load_tau_results():
41
  return json.load(f)
42
 
43
  def create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query=""):
44
- # ensure at least one variant selected
45
  if not (selected_mwoz or selected_tau_airline or selected_tau_retail):
46
  selected_mwoz = True
47
 
48
  mwoz_data = load_mwoz_results()
49
- tau_data = load_tau_results()
50
- mwoz_lookup = {e["model_name"]: e for e in mwoz_data}
51
- tau_lookup = {e["model_name"]: e for e in tau_data}
52
 
53
  aggregated = []
54
  for group in GROUPS:
55
- metrics = {"avg_conv_consistency":0, "avg_backend_consistency":0, "avg_policy_completeness":0}
56
  count = 0
57
  title_parts = []
58
  judge_model = ""
59
-
60
- # collect metrics from each selected variant
61
- for key_name, lookup in [("mwoz", mwoz_lookup), ("tau_airline", tau_lookup), ("tau_retail", tau_lookup)]:
62
- if locals()[f"selected_{key_name}"]:
63
- key = group[key_name]
64
- if key in lookup:
65
- rec = lookup[key]
66
- metrics["avg_conv_consistency"] += rec.get("avg_conv_consistency",0)
67
- metrics["avg_backend_consistency"] += rec.get("avg_backend_consistency",0)
68
- metrics["avg_policy_completeness"] += rec.get("avg_policy_completeness",0)
69
- count += 1
70
- title_parts.append(strip_timestamp(key))
71
- judge_model = rec.get("judge_model", judge_model)
72
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  if count > 0:
74
- avg_conv = metrics["avg_conv_consistency"]/count
75
- avg_backend = metrics["avg_backend_consistency"]/count
76
- avg_policy = metrics["avg_policy_completeness"]/count
77
- overall_avg = (avg_conv + avg_backend + avg_policy)/3
78
  else:
79
  avg_conv = avg_backend = avg_policy = overall_avg = 0
80
 
81
  model_name = " / ".join(title_parts)
 
 
82
  if search_query and search_query.lower() not in model_name.lower():
83
  continue
84
-
85
  aggregated.append({
86
  "Model": model_name,
87
- "Average Score": round(overall_avg,4),
88
- "Conversation Consistency": round(avg_conv,4),
89
- "Backend Consistency": round(avg_backend,4),
90
- "Policy Completeness": round(avg_policy,4),
91
  "Judge Model": judge_model
92
  })
93
 
94
  df = pd.DataFrame(aggregated)
 
 
95
  if df.empty:
96
  return df
97
-
98
- # ranking & sorting
99
  df["Rank"] = df["Average Score"].rank(ascending=False, method="min").astype(int)
100
- allowed = ["Average Score","Conversation Consistency","Backend Consistency","Policy Completeness"]
 
 
 
101
  if isinstance(sort_state, str):
102
- try: sort_state = json.loads(sort_state)
103
- except: sort_state = {"sort_by":"Average Score","ascending":False}
 
 
 
 
104
  if not isinstance(sort_state, dict):
105
- sort_state = {"sort_by":"Average Score","ascending":False}
106
-
107
- sort_by, asc = sort_state.get("sort_by","Average Score"), sort_state.get("ascending",False)
108
- if sort_by in allowed:
109
- df = df.sort_values(sort_by, ascending=asc)
 
 
110
  else:
 
111
  df = df.sort_values("Average Score", ascending=False)
112
 
113
- # move Rank column to front
114
  cols = df.columns.tolist()
115
  if "Rank" in cols:
116
  cols.insert(0, cols.pop(cols.index("Rank")))
117
- return df[cols]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  def update_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query=""):
120
- # generate HTML for the leaderboard
 
 
121
  try:
122
- # normalize sort_state
123
  if isinstance(sort_state, str):
124
- try: sort_state = json.loads(sort_state)
125
- except: sort_state = {"sort_by":"Average Score","ascending":False}
 
 
 
 
 
126
  if not isinstance(sort_state, dict):
127
- sort_state = {"sort_by":"Average Score","ascending":False}
128
-
 
129
  df = create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query)
130
-
131
- # color gradients & HTML generation
132
- def get_color_for_value(value, mn, mx):
133
- if mx==mn: norm=0.5
134
- else: norm=(value-mn)/(mx-mn)
135
- if norm<0.5:
136
- r, g, b = 255, int(255*(norm/0.5)), 0
137
- else:
138
- r, g, b = int(255*(1-(norm-0.5)/0.5)), 255, 0
139
- return f"#{r:02X}{g:02X}{b:02X}"
140
-
141
- if df.empty:
142
- html_table = "<div class='no-results'>No matching results found.</div>"
143
- else:
144
- colmins = {c:df[c].min() for c in ["Average Score","Conversation Consistency","Backend Consistency","Policy Completeness"]}
145
- colmaxs = {c:df[c].max() for c in colmins}
146
- # build table...
147
- html = "<table><tr>" + "".join(f"<th>{c}</th>" for c in df.columns) + "</tr>"
148
- for _, row in df.iterrows():
149
- html += "<tr>" + "".join(
150
- f"<td style='color:{get_color_for_value(row[c],colmins[c],colmaxs[c])}'>" + (str(row[c])) + "</td>"
151
- if c in colmins
152
- else f"<td>{row[c]}</td>"
153
- for c in df.columns
154
- ) + "</tr>"
155
- html += "</table>"
156
- html_table = html
157
-
158
- sort_by = sort_state.get("sort_by","Average Score")
159
- dir_char = "▲" if sort_state.get("ascending",False) else "▼"
160
  html_output = f"""
161
- <div class="sort-info"><p>Sorted by: {sort_by} {dir_char}</p></div>
 
 
162
  {html_table}
163
  """
 
164
  return html_output
165
-
166
  except Exception as e:
167
- # fallback
168
- df = create_grouped_leaderboard(True,True,True,{"sort_by":"Average Score","ascending":False},"")
169
- return "<div class='sort-info' style='color:red'><p>Error loading leaderboard</p></div>" + generate_html_table(df)
 
 
 
 
 
 
 
 
 
170
 
171
  with gr.Blocks(css=custom_css, title="TD-EVAL Leaderboard") as demo:
172
  gr.Markdown("# 🏆 TD-EVAL Model Evaluation Leaderboard")
173
  gr.HTML('<div class="subtitle">This leaderboard displays aggregated model performance across multiple evaluation metrics.</div>')
 
174
  gr.HTML('''
175
  <div class="variants_container">
176
  <div class="variants_title">Variants:</div>
177
- <ul>
178
- <li><strong>mwoz</strong>: Baseline variant.</li>
179
- <li><strong>tau-airline</strong>: Airline specialty variant.</li>
180
- <li><strong>tau-retail</strong>: Retail specialty variant.</li>
181
  </ul>
182
  <p>Use the checkboxes below to select which variants to include. At least one variant must be active.</p>
183
  </div>
184
  ''')
185
-
186
- # ✔️ PANEL 1: checkboxes
187
  with gr.Row(elem_classes="checkbox-panel"):
188
  cb_mwoz = gr.Checkbox(label="mwoz", value=True)
189
  cb_tau_airline = gr.Checkbox(label="tau-airline", value=True)
190
  cb_tau_retail = gr.Checkbox(label="tau-retail", value=True)
191
 
192
- # ✔️ PANEL 2: search + sort state
193
  with gr.Row(elem_classes="search-panel"):
194
- search_input = gr.Textbox(label="Search models", placeholder="Type to filter…", elem_classes="search-input")
195
- hidden_sort_state = gr.State({"sort_by":"Average Score","ascending":False})
196
-
 
 
 
 
 
 
197
  gr.Markdown("### Sort by:")
198
  with gr.Row():
199
- btn_avg = gr.Button("Average Score ▼")
200
- btn_conv = gr.Button("Conversation Consistency")
201
  btn_backend = gr.Button("Backend Consistency")
202
- btn_policy = gr.Button("Policy Completeness")
203
 
204
- leaderboard_display = gr.HTML()
205
-
206
- # wire up all callbacks exactly as before...
207
- cb_mwoz.change(update_leaderboard, [cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], leaderboard_display)
208
- cb_tau_airline.change(update_leaderboard, [cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], leaderboard_display)
209
- cb_tau_retail.change(update_leaderboard, [cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], leaderboard_display)
210
- search_input.change(update_leaderboard, [cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], leaderboard_display)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
 
212
- # sort buttons (toggle logic omitted for brevity; assume same as before)
213
- # ...
214
- demo.load(update_leaderboard, [cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], leaderboard_display)
215
 
216
  if __name__ == "__main__":
217
  demo.launch()
 
14
 
15
  # Static grouping mapping for the 10 general submissions.
16
  GROUPS = [
17
+ {"mwoz": "20250214_193236-o1", "tau_airline": "20250215_115156-tau-o1-airline", "tau_retail": "20250215_121147-tau-o1-retail"},
18
+ {"mwoz": "20250131_012338-llama405", "tau_airline": "20250204_144222-tau-llama-405b-airline", "tau_retail": "20250205_033820-tau-llama405b-retail"},
19
+ {"mwoz": "20250130_140218-4o", "tau_airline": "20250131_152503-tau-4o-airline", "tau_retail": "20250131_152422-tau-4o-retail"},
20
+ {"mwoz": "20250130_183030-claude", "tau_airline": "20250205_030422-tau-sonnet-airline", "tau_retail": "20250131_152807-tau-sonnet-retail"},
21
+ {"mwoz": "20250131_012449-llama70", "tau_airline": "20250208_024344-tau-llama70b-airline", "tau_retail": "20250208_030407-tau-llama70b-retail"},
22
+ {"mwoz": "20250131_013711-qwen72b", "tau_airline": "20250202_112945-qwen72b-airline", "tau_retail": "20250202_140527-qwen72b-retail"},
23
+ {"mwoz": "20250130_184905-mistrallarge", "tau_airline": "20250205_024823-tau-mistrallarge-airline", "tau_retail": "20250205_044403-tau-mistrallarge-retail"},
24
+ {"mwoz": "20250131_010143-o1mini", "tau_airline": "20250214_180731-tau-o1-mini-airline", "tau_retail": "20250214_142736-tau-o1-mini-retail"},
25
+ {"mwoz": "20250130_140439-4omini", "tau_airline": "20250131_152226-tau-4o-mini-airline", "tau_retail": "20250131_152338-tau-4o-mini-retail"},
26
+ {"mwoz": "20250130_145202-gpt35", "tau_airline": "20250131_152708-tau-gpt35-airline", "tau_retail": "20250131_152610-tau-gpt35-retail"}
27
  ]
28
 
29
  def load_mwoz_results():
 
41
  return json.load(f)
42
 
43
  def create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query=""):
 
44
  if not (selected_mwoz or selected_tau_airline or selected_tau_retail):
45
  selected_mwoz = True
46
 
47
  mwoz_data = load_mwoz_results()
48
+ tau_data = load_tau_results()
49
+ mwoz_lookup = {entry["model_name"]: entry for entry in mwoz_data}
50
+ tau_lookup = {entry["model_name"]: entry for entry in tau_data}
51
 
52
  aggregated = []
53
  for group in GROUPS:
54
+ metrics = {"avg_conv_consistency": 0, "avg_backend_consistency": 0, "avg_policy_completeness": 0}
55
  count = 0
56
  title_parts = []
57
  judge_model = ""
58
+ if selected_mwoz:
59
+ key = group["mwoz"]
60
+ if key in mwoz_lookup:
61
+ record = mwoz_lookup[key]
62
+ metrics["avg_conv_consistency"] += record.get("avg_conv_consistency", 0)
63
+ metrics["avg_backend_consistency"] += record.get("avg_backend_consistency", 0)
64
+ metrics["avg_policy_completeness"] += record.get("avg_policy_completeness", 0)
65
+ count += 1
66
+ title_parts.append(strip_timestamp(key))
67
+ judge_model = record.get("judge_model", "")
68
+ if selected_tau_airline:
69
+ key = group["tau_airline"]
70
+ if key in tau_lookup:
71
+ record = tau_lookup[key]
72
+ metrics["avg_conv_consistency"] += record.get("avg_conv_consistency", 0)
73
+ metrics["avg_backend_consistency"] += record.get("avg_backend_consistency", 0)
74
+ metrics["avg_policy_completeness"] += record.get("avg_policy_completeness", 0)
75
+ count += 1
76
+ title_parts.append(strip_timestamp(key))
77
+ judge_model = record.get("judge_model", "")
78
+ if selected_tau_retail:
79
+ key = group["tau_retail"]
80
+ if key in tau_lookup:
81
+ record = tau_lookup[key]
82
+ metrics["avg_conv_consistency"] += record.get("avg_conv_consistency", 0)
83
+ metrics["avg_backend_consistency"] += record.get("avg_backend_consistency", 0)
84
+ metrics["avg_policy_completeness"] += record.get("avg_policy_completeness", 0)
85
+ count += 1
86
+ title_parts.append(strip_timestamp(key))
87
+ judge_model = record.get("judge_model", "")
88
  if count > 0:
89
+ avg_conv = metrics["avg_conv_consistency"] / count
90
+ avg_backend = metrics["avg_backend_consistency"] / count
91
+ avg_policy = metrics["avg_policy_completeness"] / count
92
+ overall_avg = (avg_conv + avg_backend + avg_policy) / 3
93
  else:
94
  avg_conv = avg_backend = avg_policy = overall_avg = 0
95
 
96
  model_name = " / ".join(title_parts)
97
+
98
+ # Apply search filter
99
  if search_query and search_query.lower() not in model_name.lower():
100
  continue
101
+
102
  aggregated.append({
103
  "Model": model_name,
104
+ "Average Score": round(overall_avg, 4),
105
+ "Conversation Consistency": round(avg_conv, 4),
106
+ "Backend Consistency": round(avg_backend, 4),
107
+ "Policy Completeness": round(avg_policy, 4),
108
  "Judge Model": judge_model
109
  })
110
 
111
  df = pd.DataFrame(aggregated)
112
+
113
+ # If no results found after filtering
114
  if df.empty:
115
  return df
116
+
 
117
  df["Rank"] = df["Average Score"].rank(ascending=False, method="min").astype(int)
118
+
119
+ allowed_sort_cols = ["Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness"]
120
+
121
+ # Handle sort_state safely
122
  if isinstance(sort_state, str):
123
+ try:
124
+ sort_state = json.loads(sort_state)
125
+ except:
126
+ sort_state = {"sort_by": "Average Score", "ascending": False}
127
+
128
+ # Ensure sort_state is a dict
129
  if not isinstance(sort_state, dict):
130
+ sort_state = {"sort_by": "Average Score", "ascending": False}
131
+
132
+ sort_by = sort_state.get("sort_by", "Average Score")
133
+ ascending = sort_state.get("ascending", False)
134
+
135
+ if sort_by in allowed_sort_cols:
136
+ df = df.sort_values(sort_by, ascending=ascending)
137
  else:
138
+ # Default sort if column not found
139
  df = df.sort_values("Average Score", ascending=False)
140
 
 
141
  cols = df.columns.tolist()
142
  if "Rank" in cols:
143
  cols.insert(0, cols.pop(cols.index("Rank")))
144
+ df = df[cols]
145
+
146
+ return df
147
+
148
+ def update_sort_state(current_state, clicked_column):
149
+ """
150
+ Update the sorting state based on the clicked column.
151
+ Handles various input formats for current_state.
152
+ """
153
+ # Default state if nothing valid is provided
154
+ new_state = {"sort_by": clicked_column, "ascending": False}
155
+
156
+ # Handle the case when current_state is a string (JSON)
157
+ if isinstance(current_state, str):
158
+ try:
159
+ current_state = json.loads(current_state)
160
+ except (json.JSONDecodeError, TypeError):
161
+ # If we can't parse it, return the default state
162
+ return new_state
163
+
164
+ # If current_state is None or not a dict, return default
165
+ if not isinstance(current_state, dict):
166
+ return new_state
167
+
168
+ # Now we're sure current_state is a dict
169
+ # Check if it has the needed keys
170
+ if "sort_by" in current_state:
171
+ if current_state["sort_by"] == clicked_column:
172
+ # Toggle direction for the same column
173
+ return {
174
+ "sort_by": clicked_column,
175
+ "ascending": not current_state.get("ascending", False)
176
+ }
177
+ else:
178
+ # New column, default to descending (false)
179
+ return {
180
+ "sort_by": clicked_column,
181
+ "ascending": False
182
+ }
183
+
184
+ # If we got here, current_state doesn't have the right format
185
+ return new_state
186
+
187
+ def sort_by_avg(sort_state):
188
+ return update_sort_state(sort_state, "Average Score")
189
+
190
+ def sort_by_conv(sort_state):
191
+ return update_sort_state(sort_state, "Conversation Consistency")
192
+
193
+ def sort_by_backend(sort_state):
194
+ return update_sort_state(sort_state, "Backend Consistency")
195
+
196
+ def sort_by_policy(sort_state):
197
+ return update_sort_state(sort_state, "Policy Completeness")
198
+
199
+ def get_color_for_value(value, min_val, max_val):
200
+ if max_val == min_val:
201
+ norm = 0.5
202
+ else:
203
+ norm = (value - min_val) / (max_val - min_val)
204
+ if norm < 0.5:
205
+ ratio = norm / 0.5
206
+ r = 255
207
+ g = int(255 * ratio)
208
+ b = 0
209
+ else:
210
+ ratio = (norm - 0.5) / 0.5
211
+ r = int(255 * (1 - ratio))
212
+ g = 255
213
+ b = 0
214
+ return f"#{r:02X}{g:02X}{b:02X}"
215
+
216
+ def generate_html_table(df):
217
+ if df.empty:
218
+ return "<div class='no-results'>No matching results found.</div>"
219
+
220
+ numeric_cols = ["Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness"]
221
+ col_min = {}
222
+ col_max = {}
223
+ for col in numeric_cols:
224
+ col_min[col] = df[col].min() if not df.empty else 0
225
+ col_max[col] = df[col].max() if not df.empty else 0
226
+
227
+ # Build a simple HTML table without borders or JavaScript sorting
228
+ html = "<table style='border: none; border-collapse: collapse;'>"
229
+
230
+ # Header row
231
+ html += "<tr>"
232
+ for col in df.columns:
233
+ html += f"<th style='padding:8px; border: none;'>{col}</th>"
234
+ html += "</tr>"
235
+
236
+ # Table rows
237
+ for _, row in df.iterrows():
238
+ html += "<tr style='border: none;'>"
239
+ for col in df.columns:
240
+ cell_value = row[col]
241
+ if col in numeric_cols:
242
+ color = get_color_for_value(cell_value, col_min[col], col_max[col])
243
+ html += f"<td style='padding: 8px; border: none; color: {color};'>{cell_value}</td>"
244
+ else:
245
+ html += f"<td style='padding: 8px; border: none;'>{cell_value}</td>"
246
+ html += "</tr>"
247
+ html += "</table>"
248
+
249
+ return html
250
 
251
  def update_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query=""):
252
+ """
253
+ Update the leaderboard based on selection and sort state.
254
+ """
255
  try:
256
+ # Convert sort_state to dict if it's a string
257
  if isinstance(sort_state, str):
258
+ try:
259
+ sort_state = json.loads(sort_state)
260
+ except:
261
+ # If JSON parsing fails, create a default state
262
+ sort_state = {"sort_by": "Average Score", "ascending": False}
263
+
264
+ # Ensure sort_state is a dict
265
  if not isinstance(sort_state, dict):
266
+ sort_state = {"sort_by": "Average Score", "ascending": False}
267
+
268
+ # Generate the data and table
269
  df = create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query)
270
+ html_table = generate_html_table(df)
271
+
272
+ # Get sort info with fallbacks
273
+ sort_col = sort_state.get("sort_by", "Average Score")
274
+ sort_dir = "▼" if not sort_state.get("ascending", False) else "▲"
275
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  html_output = f"""
277
+ <div class="sort-info">
278
+ <p>Sorted by: {sort_col} {sort_dir}</p>
279
+ </div>
280
  {html_table}
281
  """
282
+
283
  return html_output
284
+
285
  except Exception as e:
286
+ # If anything goes wrong, return a basic table with an error message
287
+ print(f"Error in update_leaderboard: {str(e)}")
288
+ df = create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail,
289
+ {"sort_by": "Average Score", "ascending": False})
290
+ html_table = generate_html_table(df)
291
+
292
+ return f"""
293
+ <div class="sort-info" style="color: #ff6b6b;">
294
+ <p>Error in sorting. Using default sort: Average Score (descending)</p>
295
+ </div>
296
+ {html_table}
297
+ """
298
 
299
  with gr.Blocks(css=custom_css, title="TD-EVAL Leaderboard") as demo:
300
  gr.Markdown("# 🏆 TD-EVAL Model Evaluation Leaderboard")
301
  gr.HTML('<div class="subtitle">This leaderboard displays aggregated model performance across multiple evaluation metrics.</div>')
302
+
303
  gr.HTML('''
304
  <div class="variants_container">
305
  <div class="variants_title">Variants:</div>
306
+ <ul style="list-style: none; padding: 0; margin: 8px 0;">
307
+ <li>mwoz: Baseline variant.</li>
308
+ <li>tau-airline: Airline specialty variant.</li>
309
+ <li>tau-retail: Retail specialty variant.</li>
310
  </ul>
311
  <p>Use the checkboxes below to select which variants to include. At least one variant must be active.</p>
312
  </div>
313
  ''')
314
+
 
315
  with gr.Row(elem_classes="checkbox-panel"):
316
  cb_mwoz = gr.Checkbox(label="mwoz", value=True)
317
  cb_tau_airline = gr.Checkbox(label="tau-airline", value=True)
318
  cb_tau_retail = gr.Checkbox(label="tau-retail", value=True)
319
 
 
320
  with gr.Row(elem_classes="search-panel"):
321
+ search_input = gr.Textbox(
322
+ label="Search models",
323
+ placeholder="Type to filter…",
324
+ elem_classes="search-input"
325
+ )
326
+
327
+ hidden_sort_state = gr.State(value={"sort_by": "Average Score", "ascending": False})
328
+
329
+ # Add sorting buttons
330
  gr.Markdown("### Sort by:")
331
  with gr.Row():
332
+ btn_avg = gr.Button("Average Score ▼")
333
+ btn_conv = gr.Button("Conversation Consistency")
334
  btn_backend = gr.Button("Backend Consistency")
335
+ btn_policy = gr.Button("Policy Completeness")
336
 
337
+ leaderboard_display = gr.HTML(label="Aggregated Model Rankings")
338
+
339
+ # Function to toggle sort state and update button labels
340
+ def toggle_sort(column, current_state, btn_avg, btn_conv, btn_backend, btn_policy):
341
+ # Default new state - flip direction if same column, otherwise default to descending
342
+ if isinstance(current_state, dict) and current_state.get("sort_by") == column:
343
+ new_ascending = not current_state.get("ascending", False)
344
+ else:
345
+ new_ascending = False
346
+
347
+ new_state = {"sort_by": column, "ascending": new_ascending}
348
+
349
+ # Update button labels
350
+ direction = "▲" if new_ascending else "▼"
351
+ avg_label = f"Average Score {direction}" if column == "Average Score" else "Average Score"
352
+ conv_label = f"Conversation Consistency {direction}" if column == "Conversation Consistency" else "Conversation Consistency"
353
+ backend_label = f"Backend Consistency {direction}" if column == "Backend Consistency" else "Backend Consistency"
354
+ policy_label = f"Policy Completeness {direction}" if column == "Policy Completeness" else "Policy Completeness"
355
+
356
+ return new_state, avg_label, conv_label, backend_label, policy_label
357
+
358
+ # Connect sort buttons with the toggle function
359
+ btn_avg.click(
360
+ fn=toggle_sort,
361
+ inputs=[gr.Textbox(value="Average Score", visible=False), hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy],
362
+ outputs=[hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy]
363
+ ).then(
364
+ fn=update_leaderboard,
365
+ inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input],
366
+ outputs=leaderboard_display
367
+ )
368
+
369
+ btn_conv.click(
370
+ fn=toggle_sort,
371
+ inputs=[gr.Textbox(value="Conversation Consistency", visible=False), hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy],
372
+ outputs=[hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy]
373
+ ).then(
374
+ fn=update_leaderboard,
375
+ inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input],
376
+ outputs=leaderboard_display
377
+ )
378
+
379
+ btn_backend.click(
380
+ fn=toggle_sort,
381
+ inputs=[gr.Textbox(value="Backend Consistency", visible=False), hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy],
382
+ outputs=[hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy]
383
+ ).then(
384
+ fn=update_leaderboard,
385
+ inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input],
386
+ outputs=leaderboard_display
387
+ )
388
+
389
+ btn_policy.click(
390
+ fn=toggle_sort,
391
+ inputs=[gr.Textbox(value="Policy Completeness", visible=False), hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy],
392
+ outputs=[hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy]
393
+ ).then(
394
+ fn=update_leaderboard,
395
+ inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input],
396
+ outputs=leaderboard_display
397
+ )
398
+
399
+ # Connect dataflow for variant checkboxes and search
400
+ cb_mwoz.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
401
+ cb_tau_airline.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
402
+ cb_tau_retail.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
403
+ search_input.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
404
 
405
+ demo.load(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
 
 
406
 
407
  if __name__ == "__main__":
408
  demo.launch()
styles.css CHANGED
@@ -3,34 +3,36 @@
3
  ------------------------------------------------------------------ */
4
  body {
5
  font-family: Arial, sans-serif;
6
- background-color: #000000;
7
  margin: 20px;
8
- color: #FFFFFF;
9
- }
10
-
11
- /* ------------------------------------------------------------------
12
- Headings & Subtitle
13
- ------------------------------------------------------------------ */
14
- h1, h2, h3, .subtitle, .variants_container {
15
- color: #CCCCCC;
16
  display: flex;
17
  text-align: center;
18
  justify-content: center;
19
- }
20
- h1 {
 
21
  font-size: 2.3rem;
22
  font-weight: 700;
23
  margin-top: 2rem;
24
- }
25
- .subtitle {
 
26
  margin-bottom: 50px;
27
  color: #CCCCCC !important;
28
- }
29
-
30
- /* ------------------------------------------------------------------
31
- Variants Container (static explanatory box)
32
- ------------------------------------------------------------------ */
33
- .variants_container {
34
  margin: 50px auto;
35
  border-radius: 10px;
36
  display: flex;
@@ -38,77 +40,263 @@
38
  justify-content: center;
39
  padding: 15px;
40
  width: fit-content;
41
- background-color: transparent;
42
- color: #CCCCCC !important;
43
- }
44
- .variants_title {
45
- font-size: 1.25rem;
 
46
  font-weight: 500;
47
  color: #CCCCCC !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  }
49
 
50
- /* ------------------------------------------------------------------
51
- Table styling
52
- ------------------------------------------------------------------ */
53
- /* your existing table rules... */
 
54
 
55
- /* ------------------------------------------------------------------
56
- Sort-info Banner
57
- ------------------------------------------------------------------ */
58
- .sort-info {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  text-align: center;
60
  margin: 10px 0;
61
  padding: 5px;
62
  background-color: #27272A;
63
  border-radius: 5px;
64
- font-size: 1rem;
65
- }
66
- .sort-info, .sort-info * {
67
- color: #CCCCCC !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  }
69
 
70
- /* ------------------------------------------------------------------
71
- Panel overrides for checkbox-group & search-bar
72
- ------------------------------------------------------------------ */
73
- /* Base panel styling */
74
  .checkbox-panel,
 
 
 
 
 
 
 
75
  .search-panel {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  background-color: green !important;
77
  padding: 12px !important;
78
  border-radius: 6px !important;
79
- margin-bottom: 1rem !important;
 
 
 
80
 
81
- /* override Gradio’s block‐background–fill variable */
82
- --block-background-fill: transparent !important;
 
83
  }
84
- /* force all text to white */
85
- .checkbox-panel *,
86
- .search-panel * {
 
 
 
 
 
87
  color: #FFFFFF !important;
88
  }
89
 
90
- /* kill Gradio’s inner card backgrounds & borders */
91
- .checkbox-panel .block,
92
- .search-panel .block {
 
 
 
 
 
 
 
 
 
93
  background-color: transparent !important;
94
  border: none !important;
95
  box-shadow: none !important;
96
  }
97
 
98
- /* strip out Gradio’s .gr-form-group on the Textbox row */
99
- .search-panel .gr-form-group {
100
  background-color: transparent !important;
101
  border: none !important;
102
  box-shadow: none !important;
103
  }
104
 
105
- /* placeholder styling */
106
- .search-panel .search-input input::placeholder {
107
- color: rgba(255,255,255,0.7) !important;
 
108
  }
109
 
110
- /* ------------------------------------------------------------------
111
- No-results Message
112
- ------------------------------------------------------------------ */
113
- /* your existing .no-results rules... */
 
 
 
 
114
 
 
3
  ------------------------------------------------------------------ */
4
  body {
5
  font-family: Arial, sans-serif;
6
+ background-color: #000000;
7
  margin: 20px;
8
+ color: #FFFFFF;
9
+ }
10
+
11
+ /* ------------------------------------------------------------------
12
+ Headings & Subtitle
13
+ ------------------------------------------------------------------ */
14
+ h1, h2, h3, .subtitle, .variants_container {
15
+ color: #CCCCCC;
16
  display: flex;
17
  text-align: center;
18
  justify-content: center;
19
+ }
20
+
21
+ h1 {
22
  font-size: 2.3rem;
23
  font-weight: 700;
24
  margin-top: 2rem;
25
+ }
26
+
27
+ .subtitle {
28
  margin-bottom: 50px;
29
  color: #CCCCCC !important;
30
+ }
31
+
32
+ /* ------------------------------------------------------------------
33
+ Variants Container (Filters)
34
+ ------------------------------------------------------------------ */
35
+ .variants_container {
36
  margin: 50px auto;
37
  border-radius: 10px;
38
  display: flex;
 
40
  justify-content: center;
41
  padding: 15px;
42
  width: fit-content;
43
+ color: #CCCCCC!important;
44
+ background-color:transparent;
45
+ }
46
+
47
+ .variants_title {
48
+ font-size: 20px;
49
  font-weight: 500;
50
  color: #CCCCCC !important;
51
+ }
52
+
53
+ /* Force all descendants of the variants container to use the light-gray (#CCCCCC) text color */
54
+ .variants_container,
55
+ .variants_container * {
56
+ color: #CCCCCC!important;
57
+ }
58
+
59
+
60
+ /* ------------------------------------------------------------------
61
+ Table styling
62
+ ------------------------------------------------------------------ */
63
+ table {
64
+ width: 100%;
65
+ /* border-collapse: separate; */
66
+ border-radius: 10px;
67
+ overflow: hidden;
68
+ margin-top: 20px;
69
+ }
70
+
71
+ table th {
72
+ background-color: #27272A;
73
+ color: #FFFFFF;
74
+ font-weight: bold;
75
+ font-size: 18px;
76
+ border: 1px solid #CCCCCC;
77
+ }
78
+
79
+ table tr:not(:first-child):nth-child(odd) {
80
+ background-color: #27272aef;
81
+
82
+ }
83
+
84
+ table tr:not(:first-child):nth-child(even) {
85
+ background-color: #27272add;
86
+
87
+ }
88
+
89
+ table tr:not(:first-child):nth-child(odd) td {
90
+ color: #ffffff;
91
+ border: 1px solid #CCCCCC;
92
+
93
  }
94
 
95
+ table tr:not(:first-child):nth-child(even) td {
96
+ color: #ffffff;
97
+ border: 1px solid #CCCCCC;
98
+
99
+ }
100
 
101
+
102
+ th, td {
103
+ padding: 8px;
104
+ text-align: center;
105
+ border: 1px solid white;
106
+ }
107
+
108
+ /* ------------------------------------------------------------------
109
+ Buttons
110
+ ------------------------------------------------------------------ */
111
+ button {
112
+ background-color: #c34700b6;
113
+ color: #ffffff;
114
+ border: none;
115
+ padding: 8px 12px;
116
+ border-radius: 4px;
117
+ cursor: pointer;
118
+ font-size: 14px;
119
+ transition: all 0.3s ease;
120
+ }
121
+
122
+ button:hover {
123
+ background-color: #c34800;
124
+ transform: translateY(-2px);
125
+ box-shadow: 0 4px 8px rgba(0,0,0,0.2);
126
+ }
127
+
128
+ /* ------------------------------------------------------------------
129
+ Sort‐info Banner
130
+ ------------------------------------------------------------------ */
131
+ .sort-info {
132
  text-align: center;
133
  margin: 10px 0;
134
  padding: 5px;
135
  background-color: #27272A;
136
  border-radius: 5px;
137
+ font-size: 16px;
138
+ }
139
+
140
+ .sort-info,
141
+ .sort-info * {
142
+ color: #CCCCCC !important;
143
+ }
144
+
145
+
146
+ /* ------------------------------------------------------------------
147
+ Checkboxes Container
148
+ ------------------------------------------------------------------ */
149
+ .gradio-container .checkbox-container {
150
+ margin-right: 10px;
151
+ background-color: #27272A;
152
+ padding: 8px;
153
+ border-radius: 5px;
154
+ }
155
+
156
+ /* ------------------------------------------------------------------
157
+ Search Input
158
+ ------------------------------------------------------------------ */
159
+ input[type="text"] {
160
+ background-color: #27272A;
161
+ color: #FFFFFF;
162
+ border: 1px solid #CCCCCC;
163
+ border-radius: 5px;
164
+ padding: 10px;
165
+ width: 100%;
166
+ margin-bottom: 15px;
167
+ font-size: 16px;
168
+ }
169
+
170
+ input[type="text"]:focus {
171
+ border-color: #FFFFFF;
172
+ outline: none;
173
+ box-shadow: 0 0 5px rgba(0, 0, 0, 0.5);
174
+ }
175
+
176
+ /* ------------------------------------------------------------------
177
+ No‐results Message
178
+ ------------------------------------------------------------------ */
179
+ .no-results {
180
+ color: #FFFFFF;
181
+ text-align: center;
182
+ padding: 30px;
183
+ background-color: #27272A;
184
+ border-radius: 10px;
185
+ font-size: 18px;
186
+ margin-top: 20px;
187
+ }
188
+
189
+ /* ─────────────────────────────────────────────────────────────────────────
190
+ the checkbox‐group panel
191
+ ─────────────────────────────────────────────────────────────────────── */
192
+ .checkbox-panel {
193
+ background-color: green !important;
194
+ padding: 12px !important;
195
+ border-radius: 6px !important;
196
+ margin-bottom: 1rem !important; /* give it some breathing room */
197
  }
198
 
 
 
 
 
199
  .checkbox-panel,
200
+ .checkbox-panel * {
201
+ color: #FFFFFF !important;
202
+ }
203
+
204
+ /* ─────────────────────────────────────────────────────────────────────────
205
+ the search‐bar panel
206
+ ─────────────────────────────────────────────────────────────────────── */
207
  .search-panel {
208
+ background-color: green !important;
209
+ padding: 12px !important;
210
+ border-radius: 6px !important;
211
+ margin-bottom: 1rem !important;
212
+ }
213
+
214
+ .search-panel,
215
+ .search-panel * {
216
+ color: #FFFFFF !important;
217
+ }
218
+
219
+ /* make the textbox itself blend with the panel */
220
+ .search-panel input[type="text"] {
221
+ background-color: transparent !important;
222
+ border: 1px solid #FFFFFF !important;
223
+ }
224
+
225
+ /* lighten the placeholder text so it’s visible */
226
+ .search-panel input[type="text"]::placeholder {
227
+ color: rgba(255,255,255,0.7) !important;
228
+ }
229
+
230
+
231
+
232
+
233
+ .checkbox-panel { background-color: green !important; }
234
+ .checkbox-panel, .checkbox-panel * { color: #FFF !important; }
235
+
236
+ .search-panel { background-color: green !important; }
237
+ .search-panel, .search-panel * { color: #FFF !important; }
238
+
239
+ /* checkbox panel: green background, padding and rounded corners, white text */
240
+ .checkbox-panel {
241
  background-color: green !important;
242
  padding: 12px !important;
243
  border-radius: 6px !important;
244
+ }
245
+ .checkbox-panel, .checkbox-panel * {
246
+ color: #FFFFFF !important;
247
+ }
248
 
249
+ /* make the checkbox “cards” transparent */
250
+ .checkbox-panel .checkbox-container {
251
+ background-color: transparent !important;
252
  }
253
+
254
+ /* similarly for the search bar row */
255
+ .search-panel {
256
+ background-color: green !important;
257
+ padding: 12px !important;
258
+ border-radius: 6px !important;
259
+ }
260
+ .search-panel, .search-panel * {
261
  color: #FFFFFF !important;
262
  }
263
 
264
+ /* strip out Gradio’s default input wrapper */
265
+ .search-panel .gr-form-group {
266
+ background-color: transparent !important;
267
+ }
268
+
269
+ /* placeholder styling */
270
+ .search-panel .search-input input::placeholder {
271
+ color: rgba(255,255,255,0.7) !important;
272
+ }
273
+
274
+ /* ─── Kill the inner “card” backgrounds ───────────────────────────────────── */
275
+ .checkbox-panel .block {
276
  background-color: transparent !important;
277
  border: none !important;
278
  box-shadow: none !important;
279
  }
280
 
281
+ /* Same treatment for blocks inside the search row */
282
+ .search-panel .block {
283
  background-color: transparent !important;
284
  border: none !important;
285
  box-shadow: none !important;
286
  }
287
 
288
+
289
+
290
+ :root {
291
+ --block-background-fill: var(--neutral-800);
292
  }
293
 
294
+
295
+ /* Override the :root --block-background-fill with transparent inside the checkbox and search panels */
296
+ .checkbox-panel {
297
+ --block-background-fill: transparent !important;
298
+ }
299
+ .search-panel {
300
+ --block-background-fill: transparent !important;
301
+ }
302