TD-EVAL_leaderboard

Sleeping

App Files Community

juancauma committed on Apr 29

Commit

f17ba29

1 Parent(s): faf9069

changes to font color

Browse files

Files changed (2) hide show

app.py +305 -114
styles.css +246 -58

app.py CHANGED Viewed

@@ -14,16 +14,16 @@ def strip_timestamp(name):
 # Static grouping mapping for the 10 general submissions.
 GROUPS = [
-    {"mwoz": "20250214_193236-o1",         "tau_airline": "20250215_115156-tau-o1-airline",     "tau_retail": "20250215_121147-tau-o1-retail"},
-    {"mwoz": "20250131_012338-llama405",   "tau_airline": "20250204_144222-tau-llama-405b-airline","tau_retail": "20250205_033820-tau-llama405b-retail"},
-    {"mwoz": "20250130_140218-4o",         "tau_airline": "20250131_152503-tau-4o-airline",      "tau_retail": "20250131_152422-tau-4o-retail"},
-    {"mwoz": "20250130_183030-claude",     "tau_airline": "20250205_030422-tau-sonnet-airline",  "tau_retail": "20250131_152807-tau-sonnet-retail"},
-    {"mwoz": "20250131_012449-llama70",    "tau_airline": "20250208_024344-tau-llama70b-airline","tau_retail": "20250208_030407-tau-llama70b-retail"},
-    {"mwoz": "20250131_013711-qwen72b",    "tau_airline": "20250202_112945-qwen72b-airline",    "tau_retail": "20250202_140527-qwen72b-retail"},
-    {"mwoz": "20250130_184905-mistrallarge","tau_airline": "20250205_024823-tau-mistrallarge-airline","tau_retail": "20250205_044403-tau-mistrallarge-retail"},
-    {"mwoz": "20250131_010143-o1mini",     "tau_airline": "20250214_180731-tau-o1-mini-airline", "tau_retail": "20250214_142736-tau-o1-mini-retail"},
-    {"mwoz": "20250130_140439-4omini",     "tau_airline": "20250131_152226-tau-4o-mini-airline", "tau_retail": "20250131_152338-tau-4o-mini-retail"},
-    {"mwoz": "20250130_145202-gpt35",      "tau_airline": "20250131_152708-tau-gpt35-airline",   "tau_retail": "20250131_152610-tau-gpt35-retail"}
 ]
 def load_mwoz_results():
@@ -41,177 +41,368 @@ def load_tau_results():
         return json.load(f)
 def create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query=""):
-    # ensure at least one variant selected
     if not (selected_mwoz or selected_tau_airline or selected_tau_retail):
         selected_mwoz = True
     mwoz_data = load_mwoz_results()
-    tau_data  = load_tau_results()
-    mwoz_lookup = {e["model_name"]: e for e in mwoz_data}
-    tau_lookup  = {e["model_name"]: e for e in tau_data}
     aggregated = []
     for group in GROUPS:
-        metrics = {"avg_conv_consistency":0, "avg_backend_consistency":0, "avg_policy_completeness":0}
         count = 0
         title_parts = []
         judge_model = ""
-        # collect metrics from each selected variant
-        for key_name, lookup in [("mwoz", mwoz_lookup), ("tau_airline", tau_lookup), ("tau_retail", tau_lookup)]:
-            if locals()[f"selected_{key_name}"]:
-                key = group[key_name]
-                if key in lookup:
-                    rec = lookup[key]
-                    metrics["avg_conv_consistency"]   += rec.get("avg_conv_consistency",0)
-                    metrics["avg_backend_consistency"] += rec.get("avg_backend_consistency",0)
-                    metrics["avg_policy_completeness"] += rec.get("avg_policy_completeness",0)
-                    count += 1
-                    title_parts.append(strip_timestamp(key))
-                    judge_model = rec.get("judge_model", judge_model)
         if count > 0:
-            avg_conv    = metrics["avg_conv_consistency"]/count
-            avg_backend = metrics["avg_backend_consistency"]/count
-            avg_policy  = metrics["avg_policy_completeness"]/count
-            overall_avg = (avg_conv + avg_backend + avg_policy)/3
         else:
             avg_conv = avg_backend = avg_policy = overall_avg = 0
         model_name = " / ".join(title_parts)
         if search_query and search_query.lower() not in model_name.lower():
             continue
         aggregated.append({
             "Model": model_name,
-            "Average Score": round(overall_avg,4),
-            "Conversation Consistency": round(avg_conv,4),
-            "Backend Consistency": round(avg_backend,4),
-            "Policy Completeness": round(avg_policy,4),
             "Judge Model": judge_model
         })
     df = pd.DataFrame(aggregated)
     if df.empty:
         return df
-    # ranking & sorting
     df["Rank"] = df["Average Score"].rank(ascending=False, method="min").astype(int)
-    allowed = ["Average Score","Conversation Consistency","Backend Consistency","Policy Completeness"]
     if isinstance(sort_state, str):
-        try: sort_state = json.loads(sort_state)
-        except: sort_state = {"sort_by":"Average Score","ascending":False}
     if not isinstance(sort_state, dict):
-        sort_state = {"sort_by":"Average Score","ascending":False}
-    sort_by, asc = sort_state.get("sort_by","Average Score"), sort_state.get("ascending",False)
-    if sort_by in allowed:
-        df = df.sort_values(sort_by, ascending=asc)
     else:
         df = df.sort_values("Average Score", ascending=False)
-    # move Rank column to front
     cols = df.columns.tolist()
     if "Rank" in cols:
         cols.insert(0, cols.pop(cols.index("Rank")))
-    return df[cols]
 def update_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query=""):
-    # generate HTML for the leaderboard
     try:
-        # normalize sort_state
         if isinstance(sort_state, str):
-            try: sort_state = json.loads(sort_state)
-            except: sort_state = {"sort_by":"Average Score","ascending":False}
         if not isinstance(sort_state, dict):
-            sort_state = {"sort_by":"Average Score","ascending":False}
         df = create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query)
-        # color gradients & HTML generation
-        def get_color_for_value(value, mn, mx):
-            if mx==mn: norm=0.5
-            else: norm=(value-mn)/(mx-mn)
-            if norm<0.5:
-                r, g, b = 255, int(255*(norm/0.5)), 0
-            else:
-                r, g, b = int(255*(1-(norm-0.5)/0.5)), 255, 0
-            return f"#{r:02X}{g:02X}{b:02X}"
-        if df.empty:
-            html_table = "<div class='no-results'>No matching results found.</div>"
-        else:
-            colmins = {c:df[c].min() for c in ["Average Score","Conversation Consistency","Backend Consistency","Policy Completeness"]}
-            colmaxs = {c:df[c].max() for c in colmins}
-            # build table...
-            html = "<table><tr>" + "".join(f"<th>{c}</th>" for c in df.columns) + "</tr>"
-            for _, row in df.iterrows():
-                html += "<tr>" + "".join(
-                    f"<td style='color:{get_color_for_value(row[c],colmins[c],colmaxs[c])}'>" + (str(row[c])) + "</td>"
-                    if c in colmins
-                    else f"<td>{row[c]}</td>"
-                    for c in df.columns
-                ) + "</tr>"
-            html += "</table>"
-            html_table = html
-        sort_by = sort_state.get("sort_by","Average Score")
-        dir_char = "▲" if sort_state.get("ascending",False) else "▼"
         html_output = f"""
-        <div class="sort-info"><p>Sorted by: {sort_by} {dir_char}</p></div>
         {html_table}
         """
         return html_output
     except Exception as e:
-        # fallback
-        df = create_grouped_leaderboard(True,True,True,{"sort_by":"Average Score","ascending":False},"")
-        return "<div class='sort-info' style='color:red'><p>Error loading leaderboard</p></div>" + generate_html_table(df)
 with gr.Blocks(css=custom_css, title="TD-EVAL Leaderboard") as demo:
     gr.Markdown("# 🏆 TD-EVAL Model Evaluation Leaderboard")
     gr.HTML('<div class="subtitle">This leaderboard displays aggregated model performance across multiple evaluation metrics.</div>')
     gr.HTML('''
       <div class="variants_container">
         <div class="variants_title">Variants:</div>
-        <ul>
-          <li><strong>mwoz</strong>: Baseline variant.</li>
-          <li><strong>tau-airline</strong>: Airline specialty variant.</li>
-          <li><strong>tau-retail</strong>: Retail specialty variant.</li>
         </ul>
         <p>Use the checkboxes below to select which variants to include. At least one variant must be active.</p>
       </div>
     ''')
-    # ✔️ PANEL 1: checkboxes
     with gr.Row(elem_classes="checkbox-panel"):
         cb_mwoz        = gr.Checkbox(label="mwoz",        value=True)
         cb_tau_airline = gr.Checkbox(label="tau-airline", value=True)
         cb_tau_retail  = gr.Checkbox(label="tau-retail",  value=True)
-    # ✔️ PANEL 2: search + sort state
     with gr.Row(elem_classes="search-panel"):
-        search_input      = gr.Textbox(label="Search models", placeholder="Type to filter…", elem_classes="search-input")
-        hidden_sort_state = gr.State({"sort_by":"Average Score","ascending":False})
     gr.Markdown("### Sort by:")
     with gr.Row():
-        btn_avg     = gr.Button("Average Score ▼")
-        btn_conv    = gr.Button("Conversation Consistency")
         btn_backend = gr.Button("Backend Consistency")
-        btn_policy  = gr.Button("Policy Completeness")
-    leaderboard_display = gr.HTML()
-    # wire up all callbacks exactly as before...
-    cb_mwoz.change(update_leaderboard, [cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], leaderboard_display)
-    cb_tau_airline.change(update_leaderboard, [cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], leaderboard_display)
-    cb_tau_retail.change(update_leaderboard, [cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], leaderboard_display)
-    search_input.change(update_leaderboard, [cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], leaderboard_display)
-    # sort buttons (toggle logic omitted for brevity; assume same as before)
-    # ...
-    demo.load(update_leaderboard, [cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], leaderboard_display)
 if __name__ == "__main__":
     demo.launch()

 # Static grouping mapping for the 10 general submissions.
 GROUPS = [
+    {"mwoz": "20250214_193236-o1", "tau_airline": "20250215_115156-tau-o1-airline", "tau_retail": "20250215_121147-tau-o1-retail"},
+    {"mwoz": "20250131_012338-llama405", "tau_airline": "20250204_144222-tau-llama-405b-airline", "tau_retail": "20250205_033820-tau-llama405b-retail"},
+    {"mwoz": "20250130_140218-4o", "tau_airline": "20250131_152503-tau-4o-airline", "tau_retail": "20250131_152422-tau-4o-retail"},
+    {"mwoz": "20250130_183030-claude", "tau_airline": "20250205_030422-tau-sonnet-airline", "tau_retail": "20250131_152807-tau-sonnet-retail"},
+    {"mwoz": "20250131_012449-llama70", "tau_airline": "20250208_024344-tau-llama70b-airline", "tau_retail": "20250208_030407-tau-llama70b-retail"},
+    {"mwoz": "20250131_013711-qwen72b", "tau_airline": "20250202_112945-qwen72b-airline", "tau_retail": "20250202_140527-qwen72b-retail"},
+    {"mwoz": "20250130_184905-mistrallarge", "tau_airline": "20250205_024823-tau-mistrallarge-airline", "tau_retail": "20250205_044403-tau-mistrallarge-retail"},
+    {"mwoz": "20250131_010143-o1mini", "tau_airline": "20250214_180731-tau-o1-mini-airline", "tau_retail": "20250214_142736-tau-o1-mini-retail"},
+    {"mwoz": "20250130_140439-4omini", "tau_airline": "20250131_152226-tau-4o-mini-airline", "tau_retail": "20250131_152338-tau-4o-mini-retail"},
+    {"mwoz": "20250130_145202-gpt35", "tau_airline": "20250131_152708-tau-gpt35-airline", "tau_retail": "20250131_152610-tau-gpt35-retail"}
 ]
 def load_mwoz_results():
         return json.load(f)
 def create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query=""):
     if not (selected_mwoz or selected_tau_airline or selected_tau_retail):
         selected_mwoz = True
     mwoz_data = load_mwoz_results()
+    tau_data = load_tau_results()
+    mwoz_lookup = {entry["model_name"]: entry for entry in mwoz_data}
+    tau_lookup = {entry["model_name"]: entry for entry in tau_data}
     aggregated = []
     for group in GROUPS:
+        metrics = {"avg_conv_consistency": 0, "avg_backend_consistency": 0, "avg_policy_completeness": 0}
         count = 0
         title_parts = []
         judge_model = ""
+        if selected_mwoz:
+            key = group["mwoz"]
+            if key in mwoz_lookup:
+                record = mwoz_lookup[key]
+                metrics["avg_conv_consistency"] += record.get("avg_conv_consistency", 0)
+                metrics["avg_backend_consistency"] += record.get("avg_backend_consistency", 0)
+                metrics["avg_policy_completeness"] += record.get("avg_policy_completeness", 0)
+                count += 1
+                title_parts.append(strip_timestamp(key))
+                judge_model = record.get("judge_model", "")
+        if selected_tau_airline:
+            key = group["tau_airline"]
+            if key in tau_lookup:
+                record = tau_lookup[key]
+                metrics["avg_conv_consistency"] += record.get("avg_conv_consistency", 0)
+                metrics["avg_backend_consistency"] += record.get("avg_backend_consistency", 0)
+                metrics["avg_policy_completeness"] += record.get("avg_policy_completeness", 0)
+                count += 1
+                title_parts.append(strip_timestamp(key))
+                judge_model = record.get("judge_model", "")
+        if selected_tau_retail:
+            key = group["tau_retail"]
+            if key in tau_lookup:
+                record = tau_lookup[key]
+                metrics["avg_conv_consistency"] += record.get("avg_conv_consistency", 0)
+                metrics["avg_backend_consistency"] += record.get("avg_backend_consistency", 0)
+                metrics["avg_policy_completeness"] += record.get("avg_policy_completeness", 0)
+                count += 1
+                title_parts.append(strip_timestamp(key))
+                judge_model = record.get("judge_model", "")
         if count > 0:
+            avg_conv = metrics["avg_conv_consistency"] / count
+            avg_backend = metrics["avg_backend_consistency"] / count
+            avg_policy = metrics["avg_policy_completeness"] / count
+            overall_avg = (avg_conv + avg_backend + avg_policy) / 3
         else:
             avg_conv = avg_backend = avg_policy = overall_avg = 0
         model_name = " / ".join(title_parts)
+        # Apply search filter
         if search_query and search_query.lower() not in model_name.lower():
             continue
         aggregated.append({
             "Model": model_name,
+            "Average Score": round(overall_avg, 4),
+            "Conversation Consistency": round(avg_conv, 4),
+            "Backend Consistency": round(avg_backend, 4),
+            "Policy Completeness": round(avg_policy, 4),
             "Judge Model": judge_model
         })
     df = pd.DataFrame(aggregated)
+    # If no results found after filtering
     if df.empty:
         return df
     df["Rank"] = df["Average Score"].rank(ascending=False, method="min").astype(int)
+    allowed_sort_cols = ["Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness"]
+    # Handle sort_state safely
     if isinstance(sort_state, str):
+        try:
+            sort_state = json.loads(sort_state)
+        except:
+            sort_state = {"sort_by": "Average Score", "ascending": False}
+    # Ensure sort_state is a dict
     if not isinstance(sort_state, dict):
+        sort_state = {"sort_by": "Average Score", "ascending": False}
+    sort_by = sort_state.get("sort_by", "Average Score")
+    ascending = sort_state.get("ascending", False)
+    if sort_by in allowed_sort_cols:
+        df = df.sort_values(sort_by, ascending=ascending)
     else:
+        # Default sort if column not found
         df = df.sort_values("Average Score", ascending=False)
     cols = df.columns.tolist()
     if "Rank" in cols:
         cols.insert(0, cols.pop(cols.index("Rank")))
+    df = df[cols]
+    return df
+def update_sort_state(current_state, clicked_column):
+    """
+    Update the sorting state based on the clicked column.
+    Handles various input formats for current_state.
+    """
+    # Default state if nothing valid is provided
+    new_state = {"sort_by": clicked_column, "ascending": False}
+    # Handle the case when current_state is a string (JSON)
+    if isinstance(current_state, str):
+        try:
+            current_state = json.loads(current_state)
+        except (json.JSONDecodeError, TypeError):
+            # If we can't parse it, return the default state
+            return new_state
+    # If current_state is None or not a dict, return default
+    if not isinstance(current_state, dict):
+        return new_state
+    # Now we're sure current_state is a dict
+    # Check if it has the needed keys
+    if "sort_by" in current_state:
+        if current_state["sort_by"] == clicked_column:
+            # Toggle direction for the same column
+            return {
+                "sort_by": clicked_column,
+                "ascending": not current_state.get("ascending", False)
+            }
+        else:
+            # New column, default to descending (false)
+            return {
+                "sort_by": clicked_column,
+                "ascending": False
+            }
+    # If we got here, current_state doesn't have the right format
+    return new_state
+def sort_by_avg(sort_state):
+    return update_sort_state(sort_state, "Average Score")
+def sort_by_conv(sort_state):
+    return update_sort_state(sort_state, "Conversation Consistency")
+def sort_by_backend(sort_state):
+    return update_sort_state(sort_state, "Backend Consistency")
+def sort_by_policy(sort_state):
+    return update_sort_state(sort_state, "Policy Completeness")
+def get_color_for_value(value, min_val, max_val):
+    if max_val == min_val:
+        norm = 0.5
+    else:
+        norm = (value - min_val) / (max_val - min_val)
+    if norm < 0.5:
+        ratio = norm / 0.5
+        r = 255
+        g = int(255 * ratio)
+        b = 0
+    else:
+        ratio = (norm - 0.5) / 0.5
+        r = int(255 * (1 - ratio))
+        g = 255
+        b = 0
+    return f"#{r:02X}{g:02X}{b:02X}"
+def generate_html_table(df):
+    if df.empty:
+        return "<div class='no-results'>No matching results found.</div>"
+    numeric_cols = ["Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness"]
+    col_min = {}
+    col_max = {}
+    for col in numeric_cols:
+        col_min[col] = df[col].min() if not df.empty else 0
+        col_max[col] = df[col].max() if not df.empty else 0
+    # Build a simple HTML table without borders or JavaScript sorting
+    html = "<table style='border: none; border-collapse: collapse;'>"
+    # Header row
+    html += "<tr>"
+    for col in df.columns:
+        html += f"<th style='padding:8px; border: none;'>{col}</th>"
+    html += "</tr>"
+    # Table rows
+    for _, row in df.iterrows():
+        html += "<tr style='border: none;'>"
+        for col in df.columns:
+            cell_value = row[col]
+            if col in numeric_cols:
+                color = get_color_for_value(cell_value, col_min[col], col_max[col])
+                html += f"<td style='padding: 8px; border: none; color: {color};'>{cell_value}</td>"
+            else:
+                html += f"<td style='padding: 8px; border: none;'>{cell_value}</td>"
+        html += "</tr>"
+    html += "</table>"
+    return html
 def update_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query=""):
+    """
+    Update the leaderboard based on selection and sort state.
+    """
     try:
+        # Convert sort_state to dict if it's a string
         if isinstance(sort_state, str):
+            try:
+                sort_state = json.loads(sort_state)
+            except:
+                # If JSON parsing fails, create a default state
+                sort_state = {"sort_by": "Average Score", "ascending": False}
+        # Ensure sort_state is a dict
         if not isinstance(sort_state, dict):
+            sort_state = {"sort_by": "Average Score", "ascending": False}
+        # Generate the data and table
         df = create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query)
+        html_table = generate_html_table(df)
+        # Get sort info with fallbacks
+        sort_col = sort_state.get("sort_by", "Average Score")
+        sort_dir = "▼" if not sort_state.get("ascending", False) else "▲"
         html_output = f"""
+        <div class="sort-info">
+            <p>Sorted by: {sort_col} {sort_dir}</p>
+        </div>
         {html_table}
         """
         return html_output
     except Exception as e:
+        # If anything goes wrong, return a basic table with an error message
+        print(f"Error in update_leaderboard: {str(e)}")
+        df = create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail,
+                                       {"sort_by": "Average Score", "ascending": False})
+        html_table = generate_html_table(df)
+        return f"""
+        <div class="sort-info" style="color: #ff6b6b;">
+            <p>Error in sorting. Using default sort: Average Score (descending)</p>
+        </div>
+        {html_table}
+        """
 with gr.Blocks(css=custom_css, title="TD-EVAL Leaderboard") as demo:
     gr.Markdown("# 🏆 TD-EVAL Model Evaluation Leaderboard")
     gr.HTML('<div class="subtitle">This leaderboard displays aggregated model performance across multiple evaluation metrics.</div>')
     gr.HTML('''
       <div class="variants_container">
         <div class="variants_title">Variants:</div>
+       <ul style="list-style: none; padding: 0; margin: 8px 0;">
+          <li>mwoz: Baseline variant.</li>
+          <li>tau-airline: Airline specialty variant.</li>
+          <li>tau-retail: Retail specialty variant.</li>
         </ul>
         <p>Use the checkboxes below to select which variants to include. At least one variant must be active.</p>
       </div>
     ''')
     with gr.Row(elem_classes="checkbox-panel"):
         cb_mwoz        = gr.Checkbox(label="mwoz",        value=True)
         cb_tau_airline = gr.Checkbox(label="tau-airline", value=True)
         cb_tau_retail  = gr.Checkbox(label="tau-retail",  value=True)
     with gr.Row(elem_classes="search-panel"):
+        search_input = gr.Textbox(
+            label="Search models",
+            placeholder="Type to filter…",
+            elem_classes="search-input"
+        )
+    hidden_sort_state = gr.State(value={"sort_by": "Average Score", "ascending": False})
+    # Add sorting buttons
     gr.Markdown("### Sort by:")
     with gr.Row():
+        btn_avg = gr.Button("Average Score ▼")
+        btn_conv = gr.Button("Conversation Consistency")
         btn_backend = gr.Button("Backend Consistency")
+        btn_policy = gr.Button("Policy Completeness")
+    leaderboard_display = gr.HTML(label="Aggregated Model Rankings")
+    # Function to toggle sort state and update button labels
+    def toggle_sort(column, current_state, btn_avg, btn_conv, btn_backend, btn_policy):
+        # Default new state - flip direction if same column, otherwise default to descending
+        if isinstance(current_state, dict) and current_state.get("sort_by") == column:
+            new_ascending = not current_state.get("ascending", False)
+        else:
+            new_ascending = False
+        new_state = {"sort_by": column, "ascending": new_ascending}
+        # Update button labels
+        direction = "▲" if new_ascending else "▼"
+        avg_label = f"Average Score {direction}" if column == "Average Score" else "Average Score"
+        conv_label = f"Conversation Consistency {direction}" if column == "Conversation Consistency" else "Conversation Consistency"
+        backend_label = f"Backend Consistency {direction}" if column == "Backend Consistency" else "Backend Consistency"
+        policy_label = f"Policy Completeness {direction}" if column == "Policy Completeness" else "Policy Completeness"
+        return new_state, avg_label, conv_label, backend_label, policy_label
+    # Connect sort buttons with the toggle function
+    btn_avg.click(
+        fn=toggle_sort,
+        inputs=[gr.Textbox(value="Average Score", visible=False), hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy],
+        outputs=[hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy]
+    ).then(
+        fn=update_leaderboard,
+        inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input],
+        outputs=leaderboard_display
+    )
+    btn_conv.click(
+        fn=toggle_sort,
+        inputs=[gr.Textbox(value="Conversation Consistency", visible=False), hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy],
+        outputs=[hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy]
+    ).then(
+        fn=update_leaderboard,
+        inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input],
+        outputs=leaderboard_display
+    )
+    btn_backend.click(
+        fn=toggle_sort,
+        inputs=[gr.Textbox(value="Backend Consistency", visible=False), hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy],
+        outputs=[hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy]
+    ).then(
+        fn=update_leaderboard,
+        inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input],
+        outputs=leaderboard_display
+    )
+    btn_policy.click(
+        fn=toggle_sort,
+        inputs=[gr.Textbox(value="Policy Completeness", visible=False), hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy],
+        outputs=[hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy]
+    ).then(
+        fn=update_leaderboard,
+        inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input],
+        outputs=leaderboard_display
+    )
+    # Connect dataflow for variant checkboxes and search
+    cb_mwoz.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
+    cb_tau_airline.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
+    cb_tau_retail.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
+    search_input.change(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
+    demo.load(fn=update_leaderboard, inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input], outputs=leaderboard_display)
 if __name__ == "__main__":
     demo.launch()

styles.css CHANGED Viewed

@@ -3,34 +3,36 @@
    ------------------------------------------------------------------ */
    body {
     font-family: Arial, sans-serif;
-    background-color: #000000;
     margin: 20px;
-    color: #FFFFFF;
-  }
-  /* ------------------------------------------------------------------
-     Headings & Subtitle
-     ------------------------------------------------------------------ */
-  h1, h2, h3, .subtitle, .variants_container {
-    color: #CCCCCC;
     display: flex;
     text-align: center;
     justify-content: center;
-  }
-  h1 {
     font-size: 2.3rem;
     font-weight: 700;
     margin-top: 2rem;
-  }
-  .subtitle {
     margin-bottom: 50px;
     color: #CCCCCC !important;
-  }
-  /* ------------------------------------------------------------------
-     Variants Container (static explanatory box)
-     ------------------------------------------------------------------ */
-  .variants_container {
     margin: 50px auto;
     border-radius: 10px;
     display: flex;
@@ -38,77 +40,263 @@
     justify-content: center;
     padding: 15px;
     width: fit-content;
-    background-color: transparent;
-    color: #CCCCCC !important;
-  }
-  .variants_title {
-    font-size: 1.25rem;
     font-weight: 500;
     color: #CCCCCC !important;
   }
-  /* ------------------------------------------------------------------
-     Table styling
-     ------------------------------------------------------------------ */
-  /* your existing table rules... */
-  /* ------------------------------------------------------------------
-     Sort-info Banner
-     ------------------------------------------------------------------ */
-  .sort-info {
     text-align: center;
     margin: 10px 0;
     padding: 5px;
     background-color: #27272A;
     border-radius: 5px;
-    font-size: 1rem;
-  }
-  .sort-info, .sort-info * {
-    color: #CCCCCC !important;
   }
-  /* ------------------------------------------------------------------
-     Panel overrides for checkbox-group & search-bar
-     ------------------------------------------------------------------ */
-  /* Base panel styling */
   .checkbox-panel,
   .search-panel {
     background-color: green !important;
     padding: 12px !important;
     border-radius: 6px !important;
-    margin-bottom: 1rem !important;
-    /* override Gradio’s block‐background–fill variable */
-    --block-background-fill: transparent !important;
   }
-  /* force all text to white */
-  .checkbox-panel *,
-  .search-panel * {
     color: #FFFFFF !important;
   }
-  /* kill Gradio’s inner card backgrounds & borders */
-  .checkbox-panel .block,
-  .search-panel .block {
     background-color: transparent !important;
     border:           none           !important;
     box-shadow:       none           !important;
   }
-  /* strip out Gradio’s .gr-form-group on the Textbox row */
-  .search-panel .gr-form-group {
     background-color: transparent !important;
     border:           none           !important;
     box-shadow:       none           !important;
   }
-  /* placeholder styling */
-  .search-panel .search-input input::placeholder {
-    color: rgba(255,255,255,0.7) !important;
   }
-  /* ------------------------------------------------------------------
-     No-results Message
-     ------------------------------------------------------------------ */
-  /* your existing .no-results rules... */

    ------------------------------------------------------------------ */
    body {
+    /* Page-wide defaults: Arial text in white on a black background,
+       with a 20px margin around the page. */
     font-family: Arial, sans-serif;
+    background-color: #000000;
     margin: 20px;
+    color: #FFFFFF;
+}
+/* ------------------------------------------------------------------
+   Headings & Subtitle
+   ------------------------------------------------------------------ */
+/* Shared look for headings, the subtitle and the variants box: light gray
+   text in a flex container with centered content. */
+h1, h2, h3, .subtitle, .variants_container {
+    color: #CCCCCC;
     display: flex;
     text-align: center;
     justify-content: center;
+}
+/* Main title: larger, bold, with extra space above. */
+h1 {
     font-size: 2.3rem;
     font-weight: 700;
     margin-top: 2rem;
+}
+/* Subtitle: space below; repeats the light gray color with !important so it
+   takes precedence over non-!important color declarations. */
+.subtitle {
     margin-bottom: 50px;
     color: #CCCCCC !important;
+}
+/* ------------------------------------------------------------------
+   Variants Container (Filters)
+   ------------------------------------------------------------------ */
+.variants_container {
     margin: 50px auto;
     border-radius: 10px;
     display: flex;
     justify-content: center;
     padding: 15px;
     width: fit-content;
+    color: #CCCCCC!important;
+    background-color:transparent;
+}
+/* Title text of the variants box: 20px, medium weight, light gray. */
+.variants_title {
+    font-size: 20px;
     font-weight: 500;
     color: #CCCCCC !important;
+}
+/* Force the variants container and all of its descendants to light gray
+   text (#CCCCCC); !important makes this win over non-!important colors. */
+.variants_container,
+.variants_container * {
+    color: #CCCCCC!important;
+}
+/* ------------------------------------------------------------------
+   Table styling
+   ------------------------------------------------------------------ */
+/* Leaderboard table: full width with rounded, clipped corners. */
+table {
+    width: 100%;
+    /* Left disabled: border-collapse: separate;
+       (border-radius has no effect on a table whose borders are collapsed) */
+    border-radius: 10px;
+    overflow: hidden;
+    margin-top: 20px;
+}
+/* Header cells: dark gray fill, bold white 18px text, light gray border. */
+table th {
+    background-color: #27272A;
+    color: #FFFFFF;
+    font-weight: bold;
+    font-size: 18px;
+    border: 1px solid #CCCCCC;
+}
+/* Zebra striping for every row except the first: same base color with a
+   slightly different alpha for odd and even rows. */
+table tr:not(:first-child):nth-child(odd) {
+    background-color: #27272aef;
+}
+table tr:not(:first-child):nth-child(even) {
+    background-color: #27272add;
+}
+/* Cells of odd and even rows are styled identically; the selector list keeps
+   the specificity of the per-parity selectors. */
+table tr:not(:first-child):nth-child(odd) td,
+table tr:not(:first-child):nth-child(even) td {
+    color: #ffffff;
+    border: 1px solid #CCCCCC;
+}
+/* Baseline for all cells; the more specific rules above override the
+   border color where they match. */
+th, td {
+    padding: 8px;
+    text-align: center;
+    border: 1px solid white;
+}
+/* ------------------------------------------------------------------
+   Buttons
+   ------------------------------------------------------------------ */
+/* Default button: translucent orange fill, white label, no border. */
+button {
+    cursor: pointer;
+    border: none;
+    border-radius: 4px;
+    padding: 8px 12px;
+    font-size: 14px;
+    color: #ffffff;
+    background-color: #c34700b6;
+    /* animate the property changes applied on hover */
+    transition: all 0.3s ease;
+}
+/* Hover: opaque fill, 2px upward shift and a soft drop shadow. */
+button:hover {
+    box-shadow: 0 4px 8px rgba(0,0,0,0.2);
+    transform: translateY(-2px);
+    background-color: #c34800;
+}
+/* ------------------------------------------------------------------
+   Sort‐info Banner
+   ------------------------------------------------------------------ */
+/* Banner box: centered text on a dark gray, rounded background. */
+.sort-info {
     text-align: center;
     margin: 10px 0;
     padding: 5px;
     background-color: #27272A;
     border-radius: 5px;
+    font-size: 16px;
+}
+/* Light gray text for the banner and everything inside it; !important makes
+   this win over non-!important color declarations. */
+.sort-info,
+.sort-info * {
+  color: #CCCCCC !important;
+}
+/* ------------------------------------------------------------------
+   Checkboxes Container
+   ------------------------------------------------------------------ */
+/* Each checkbox container inside the Gradio root: dark gray rounded chip
+   with a gap to its right. */
+.gradio-container .checkbox-container {
+    background-color: #27272A;
+    border-radius: 5px;
+    padding: 8px;
+    margin-right: 10px;
+}
+/* ------------------------------------------------------------------
+   Search Input
+   ------------------------------------------------------------------ */
+/* Text inputs: full-width dark field with white text and a light gray
+   rounded border. */
+input[type="text"] {
+    width: 100%;
+    padding: 10px;
+    margin-bottom: 15px;
+    font-size: 16px;
+    color: #FFFFFF;
+    background-color: #27272A;
+    border: 1px solid #CCCCCC;
+    border-radius: 5px;
+}
+/* Focus: the browser outline is removed; the border turns white and a dark
+   glow is drawn instead. */
+input[type="text"]:focus {
+    outline: none;
+    border-color: #FFFFFF;
+    box-shadow: 0 0 5px rgba(0, 0, 0, 0.5);
+}
+/* ------------------------------------------------------------------
+   No‐results Message
+   ------------------------------------------------------------------ */
+/* "No results" box: centered white 18px text on a dark gray rounded card. */
+.no-results {
+    margin-top: 20px;
+    padding: 30px;
+    border-radius: 10px;
+    background-color: #27272A;
+    color: #FFFFFF;
+    font-size: 18px;
+    text-align: center;
+}
+/* ─────────────────────────────────────────────────────────────────────────
+   Checkbox-group panel and search-bar panel
+   Every rule for the two panels lives here exactly once; selectors that
+   carry the same declarations are grouped in one selector list.
+   ─────────────────────────────────────────────────────────────────────── */
+/* Page-level value for the block background variable. */
+:root {
+    --block-background-fill: var(--neutral-800);
+}
+/* Panel shell shared by both panels. */
+.checkbox-panel,
+.search-panel {
+    /* NOTE(review): "green" looks like a placeholder color; confirm it is intended. */
+    background-color: green !important;
+    padding: 12px !important;
+    border-radius: 6px !important;
+    margin-bottom: 1rem !important; /* give it some breathing room */
+    /* inside a panel the block background variable resolves to transparent */
+    --block-background-fill: transparent !important;
+}
+/* All text in either panel is white. */
+.checkbox-panel,
+.checkbox-panel *,
+.search-panel,
+.search-panel * {
+    color: #FFFFFF !important;
+}
+/* Remove the inner "card" backgrounds, borders and shadows. */
+.checkbox-panel .block,
+.search-panel .block {
+    background-color: transparent !important;
+    border: none !important;
+    box-shadow: none !important;
+}
+/* Checkbox containers and the textbox form group show the panel color. */
+.checkbox-panel .checkbox-container,
+.search-panel .gr-form-group {
+    background-color: transparent !important;
+}
+/* The textbox itself blends with the panel and gets a white outline. */
+.search-panel input[type="text"] {
+    background-color: transparent !important;
+    border: 1px solid #FFFFFF !important;
+}
+/* Lighter placeholder text so it stays visible on the panel. */
+.search-panel input[type="text"]::placeholder,
+.search-panel .search-input input::placeholder {
+    color: rgba(255,255,255,0.7) !important;
+}