TD-EVAL_leaderboard

Sleeping

App Files Files Community

TD-EVAL_leaderboard / app.py

juancauma

changes to font color

a732253 about 1 month ago

raw

history blame contribute delete

25 kB

	import gradio as gr
	import pandas as pd
	import json
	import os
	from gradio.themes.utils import colors

	# Load the external stylesheet "styles.css" from the current working directory.
	try:
	    with open("styles.css", "r", encoding="utf-8") as f:
	        custom_css = f.read()
	except UnicodeDecodeError:
	    # The file is not valid UTF-8; latin-1 can decode any byte sequence.
	    with open("styles.css", "r", encoding="latin-1") as f:
	        custom_css = f.read()
	except FileNotFoundError:
	    # A missing stylesheet should not stop the app from starting: continue
	    # with only the CSS that is defined inline in this module.
	    print('Warning: "styles.css" not found; continuing without the external stylesheet.')
	    custom_css = ""

	# Add more specific selectors for Gradio and use !important so these rules
	# win the cascade over the defaults.
	additional_css = """
	.gradio-container .checkbox-panel,
	div.gradio-container [class*="block"] .checkbox-panel {
	background-color: #27272A !important;
	}
	.gradio-container .search-panel,
	div.gradio-container [class*="block"] .search-panel {
	background-color: #27272A !important;
	}
	"""
	custom_css += additional_css

	# Custom theme used by the leaderboard page.
	class CustomTheme(gr.themes.Base):
	    """Gradio base theme with red/gray hues, large text and flat blocks."""

	    def __init__(self):
	        base_options = {
	            "primary_hue": colors.red,
	            "secondary_hue": colors.gray,
	            "neutral_hue": colors.gray,
	            "text_size": gr.themes.sizes.text_lg,
	        }
	        super().__init__(**base_options)
	        # No global background colors are set here; only the block border
	        # and shadow are overridden.
	        self.block_shadow = "none"
	        self.block_border_width = "0px"

	# Append ID-scoped rules for the two panels; these are more specific than
	# the class-based rules added earlier.
	custom_css = custom_css + """
	/* Only override specific panels by ID */
	#checkbox-panel,
	#search-panel {
	background-color: #F0F0F0 !important;
	}
	/* Only affect immediate children of these specific panels */
	#checkbox-panel > *,
	#search-panel > * {
	background-color: transparent !important;
	}
	/* Target checkbox inputs specifically */
	#checkbox-panel input[type="checkbox"],
	#search-panel input[type="text"] {
	background-color: transparent !important;
	}
	"""

	def strip_timestamp(name):
	    """Return ``name`` without its first dash-separated segment.

	    The text up to and including the first ``-`` is dropped. A name that
	    contains no dash is returned unchanged.
	    """
	    _prefix, dash, remainder = name.partition('-')
	    if not dash:
	        return name
	    return remainder

	# Static grouping for the 10 general submissions. Each entry names the run
	# to use for one model under each variant key ("mwoz", "tau_airline",
	# "tau_retail").
	GROUPS = [
	    dict(
	        mwoz="20250214_193236-o1",
	        tau_airline="20250215_115156-tau-o1-airline",
	        tau_retail="20250215_121147-tau-o1-retail",
	    ),
	    dict(
	        mwoz="20250131_012338-llama405",
	        tau_airline="20250204_144222-tau-llama-405b-airline",
	        tau_retail="20250205_033820-tau-llama405b-retail",
	    ),
	    dict(
	        mwoz="20250130_140218-4o",
	        tau_airline="20250131_152503-tau-4o-airline",
	        tau_retail="20250131_152422-tau-4o-retail",
	    ),
	    dict(
	        mwoz="20250130_183030-claude",
	        tau_airline="20250205_030422-tau-sonnet-airline",
	        tau_retail="20250131_152807-tau-sonnet-retail",
	    ),
	    dict(
	        mwoz="20250131_012449-llama70",
	        tau_airline="20250208_024344-tau-llama70b-airline",
	        tau_retail="20250208_030407-tau-llama70b-retail",
	    ),
	    dict(
	        mwoz="20250131_013711-qwen72b",
	        tau_airline="20250202_112945-qwen72b-airline",
	        tau_retail="20250202_140527-qwen72b-retail",
	    ),
	    dict(
	        mwoz="20250130_184905-mistrallarge",
	        tau_airline="20250205_024823-tau-mistrallarge-airline",
	        tau_retail="20250205_044403-tau-mistrallarge-retail",
	    ),
	    dict(
	        mwoz="20250131_010143-o1mini",
	        tau_airline="20250214_180731-tau-o1-mini-airline",
	        tau_retail="20250214_142736-tau-o1-mini-retail",
	    ),
	    dict(
	        mwoz="20250130_140439-4omini",
	        tau_airline="20250131_152226-tau-4o-mini-airline",
	        tau_retail="20250131_152338-tau-4o-mini-retail",
	    ),
	    dict(
	        mwoz="20250130_145202-gpt35",
	        tau_airline="20250131_152708-tau-gpt35-airline",
	        tau_retail="20250131_152610-tau-gpt35-retail",
	    ),
	]

	def _load_json_results(filename):
	    """Return the parsed JSON content of ``data/<filename>``.

	    Returns an empty list when the file does not exist. The file is read as
	    UTF-8 so the result does not depend on the platform's default encoding.
	    """
	    path = os.path.join("data", filename)
	    try:
	        with open(path, "r", encoding="utf-8") as f:
	            return json.load(f)
	    except FileNotFoundError:
	        # Opening directly avoids the gap between an existence check and
	        # the read.
	        return []


	def load_mwoz_results():
	    """Load ``data/mwoz_leaderboard_results.json``; ``[]`` if it is missing."""
	    return _load_json_results("mwoz_leaderboard_results.json")


	def load_tau_results():
	    """Load ``data/tau_leaderboard_results.json``; ``[]`` if it is missing."""
	    return _load_json_results("tau_leaderboard_results.json")

	def create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query=""):
	    """Build the aggregated leaderboard as a pandas DataFrame.

	    For every entry in ``GROUPS`` the three metrics are averaged over the
	    selected variants that have a matching record, and an overall average of
	    the three metric averages is computed.

	    Args:
	        selected_mwoz: Truthy to include the "mwoz" variant.
	        selected_tau_airline: Truthy to include the "tau_airline" variant.
	        selected_tau_retail: Truthy to include the "tau_retail" variant.
	        sort_state: Dict (or JSON string of a dict) with "sort_by" and
	            "ascending". Anything else falls back to sorting by
	            "Average Score", descending.
	        search_query: Optional case-insensitive substring filter applied to
	            the combined model name.

	    Returns:
	        A DataFrame with "Rank" as the first column followed by "Model", the
	        four score columns and "Judge Model"; or an empty DataFrame when no
	        row survives the search filter.
	    """
	    # With nothing selected, fall back to the mwoz variant.
	    if not (selected_mwoz or selected_tau_airline or selected_tau_retail):
	        selected_mwoz = True

	    mwoz_lookup = {entry["model_name"]: entry for entry in load_mwoz_results()}
	    tau_lookup = {entry["model_name"]: entry for entry in load_tau_results()}

	    metric_names = ("avg_conv_consistency", "avg_backend_consistency", "avg_policy_completeness")
	    # (is selected, key inside a GROUPS entry, lookup table holding its records)
	    variants = (
	        (selected_mwoz, "mwoz", mwoz_lookup),
	        (selected_tau_airline, "tau_airline", tau_lookup),
	        (selected_tau_retail, "tau_retail", tau_lookup),
	    )
	    needle = search_query.lower() if search_query else ""

	    aggregated = []
	    for group in GROUPS:
	        totals = dict.fromkeys(metric_names, 0)
	        count = 0
	        title_parts = []
	        judge_model = ""
	        for selected, variant, lookup in variants:
	            if not selected:
	                continue
	            key = group[variant]
	            if key not in lookup:
	                continue
	            record = lookup[key]
	            for metric in metric_names:
	                totals[metric] += record.get(metric, 0)
	            count += 1
	            title_parts.append(strip_timestamp(key))
	            # The judge model of the last contributing record is reported.
	            judge_model = record.get("judge_model", "")

	        if count > 0:
	            avg_conv, avg_backend, avg_policy = (totals[metric] / count for metric in metric_names)
	            overall_avg = (avg_conv + avg_backend + avg_policy) / 3
	        else:
	            # No record matched: the group is still listed, with zero scores.
	            avg_conv = avg_backend = avg_policy = overall_avg = 0

	        model_name = " / ".join(title_parts)

	        # Apply search filter
	        if needle and needle not in model_name.lower():
	            continue

	        aggregated.append({
	            "Model": model_name,
	            "Average Score": round(overall_avg, 4),
	            "Conversation Consistency": round(avg_conv, 4),
	            "Backend Consistency": round(avg_backend, 4),
	            "Policy Completeness": round(avg_policy, 4),
	            "Judge Model": judge_model,
	        })

	    df = pd.DataFrame(aggregated)

	    # If no results found after filtering
	    if df.empty:
	        return df

	    # NOTE(review): the rank is computed over the rows left after the search
	    # filter, so ranks are relative to the visible rows - confirm this is
	    # the intended behaviour.
	    df["Rank"] = df["Average Score"].rank(ascending=False, method="min").astype(int)

	    allowed_sort_cols = ["Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness"]
	    default_sort = {"sort_by": "Average Score", "ascending": False}

	    # Accept the sort state as a JSON string as well as a dict.
	    if isinstance(sort_state, str):
	        try:
	            sort_state = json.loads(sort_state)
	        except ValueError:  # json.JSONDecodeError is a ValueError subclass
	            sort_state = default_sort
	    if not isinstance(sort_state, dict):
	        sort_state = default_sort

	    sort_by = sort_state.get("sort_by", "Average Score")
	    # Coerce to a real bool: pandas validates the type of ``ascending``.
	    ascending = bool(sort_state.get("ascending", False))

	    if sort_by in allowed_sort_cols:
	        df = df.sort_values(sort_by, ascending=ascending)
	    else:
	        # Default sort if column not found
	        df = df.sort_values("Average Score", ascending=False)

	    # Move "Rank" to the front, keeping the other columns in order.
	    cols = df.columns.tolist()
	    if "Rank" in cols:
	        cols.insert(0, cols.pop(cols.index("Rank")))
	        df = df[cols]

	    return df

	def update_sort_state(current_state, clicked_column):
	    """Return the sort state that results from clicking ``clicked_column``.

	    ``current_state`` may be a dict, a JSON string of a dict, or anything
	    else. Clicking the column that is already the "sort_by" column flips the
	    direction; every other case (different column, unparsable or malformed
	    state) yields a descending sort on the clicked column.
	    """
	    state = current_state
	    if isinstance(state, str):
	        try:
	            state = json.loads(state)
	        except (json.JSONDecodeError, TypeError):
	            # Unparsable text is treated the same as having no state.
	            state = None

	    same_column = (
	        isinstance(state, dict)
	        and "sort_by" in state
	        and state["sort_by"] == clicked_column
	    )
	    if same_column:
	        ascending = not state.get("ascending", False)
	    else:
	        ascending = False

	    return {"sort_by": clicked_column, "ascending": ascending}

	def sort_by_avg(sort_state):
	    """Return the sort state after a click on the "Average Score" column."""
	    return update_sort_state(current_state=sort_state, clicked_column="Average Score")

	def sort_by_conv(sort_state):
	    """Return the sort state after a click on "Conversation Consistency"."""
	    return update_sort_state(current_state=sort_state, clicked_column="Conversation Consistency")

	def sort_by_backend(sort_state):
	    """Return the sort state after a click on "Backend Consistency"."""
	    return update_sort_state(current_state=sort_state, clicked_column="Backend Consistency")

	def sort_by_policy(sort_state):
	    """Return the sort state after a click on "Policy Completeness"."""
	    return update_sort_state(current_state=sort_state, clicked_column="Policy Completeness")

	def get_color_for_value(value, min_val, max_val):
	    """Map ``value`` to a ``#RRGGBB`` color on a red-to-green scale.

	    The value is normalized to its position between ``min_val`` and
	    ``max_val``. The lower half fades from red (#B40000) to #B48C00, the
	    upper half from #8C8C00 to green (#008C00). When ``min_val`` equals
	    ``max_val`` the midpoint color is returned.

	    Values outside the range are clamped to the nearest end, so the result
	    is always a well-formed six-digit hex color. A NaN position compares
	    false against both bounds and ends up at the low (red) end.
	    """
	    if max_val == min_val:
	        norm = 0.5
	    else:
	        norm = (value - min_val) / (max_val - min_val)
	        # Clamp: an out-of-range position would otherwise yield a negative
	        # or >255 channel and a malformed hex string.
	        norm = min(1.0, max(0.0, norm))

	    if norm < 0.5:
	        ratio = norm / 0.5
	        # Darker red for lower values
	        r = 180
	        g = int(140 * ratio)
	        b = 0
	    else:
	        ratio = (norm - 0.5) / 0.5
	        # Darker green for higher values
	        r = int(140 * (1 - ratio))
	        g = 140
	        b = 0
	    return f"#{r:02X}{g:02X}{b:02X}"

	def generate_html_table(df):
	    """Render ``df`` as a borderless HTML table string.

	    Cells in the score columns ("Average Score", "Conversation Consistency",
	    "Backend Consistency", "Policy Completeness") are shown in bold and
	    colored by their position between that column's minimum and maximum.
	    Score columns that are absent from ``df`` are simply skipped. Header and
	    cell text is HTML-escaped so that data values cannot inject markup.

	    Returns a "no results" placeholder ``<div>`` when ``df`` is empty.
	    """
	    # Imported locally; the alias avoids any clash with local names.
	    from html import escape

	    if df.empty:
	        return "<div class='no-results'>No matching results found.</div>"

	    score_cols = ("Average Score", "Conversation Consistency", "Backend Consistency", "Policy Completeness")
	    numeric_cols = [col for col in score_cols if col in df.columns]
	    col_min = {col: df[col].min() for col in numeric_cols}
	    col_max = {col: df[col].max() for col in numeric_cols}

	    # Collect the fragments and join once instead of repeated string +=.
	    parts = ["<table style='border: none; border-collapse: collapse;'>"]

	    # Header row
	    parts.append("<tr>")
	    for col in df.columns:
	        parts.append(f"<th style='padding:8px; border: none;'>{escape(str(col))}</th>")
	    parts.append("</tr>")

	    # Table rows
	    for _, row in df.iterrows():
	        parts.append("<tr style='border: none;'>")
	        for col in df.columns:
	            cell_value = row[col]
	            cell_text = escape(str(cell_value))
	            if col in numeric_cols:
	                color = get_color_for_value(cell_value, col_min[col], col_max[col])
	                parts.append(
	                    f"<td style='padding: 8px; border: none; color: {color}; font-weight: bold;'>{cell_text}</td>"
	                )
	            else:
	                parts.append(f"<td style='padding: 8px; border: none;'>{cell_text}</td>")
	        parts.append("</tr>")
	    parts.append("</table>")

	    return "".join(parts)

	def update_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query=""):
	    """Return the leaderboard HTML for the current selection and sort state.

	    Args:
	        selected_mwoz, selected_tau_airline, selected_tau_retail: Variant
	            checkbox values, passed through to
	            ``create_grouped_leaderboard``.
	        sort_state: Dict (or JSON string of a dict) with "sort_by" and
	            "ascending"; anything else is replaced by the default sort
	            (Average Score, descending).
	        search_query: Optional substring filter on the model name.

	    Returns:
	        An HTML string: a "Sorted by" line followed by the table. If
	        building the table raises, the error is printed and a table with the
	        default sort is returned under an error notice instead.
	    """
	    default_sort = {"sort_by": "Average Score", "ascending": False}
	    try:
	        # Convert sort_state to dict if it's a string
	        if isinstance(sort_state, str):
	            try:
	                sort_state = json.loads(sort_state)
	            except ValueError:  # json.JSONDecodeError is a ValueError subclass
	                sort_state = dict(default_sort)

	        # Ensure sort_state is a dict
	        if not isinstance(sort_state, dict):
	            sort_state = dict(default_sort)

	        # Generate the data and table
	        df = create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail, sort_state, search_query)
	        html_table = generate_html_table(df)

	        # Get sort info with fallbacks
	        sort_col = sort_state.get("sort_by", "Average Score")
	        sort_dir = "▲" if sort_state.get("ascending", False) else "▼"

	        return f"""
	<div class="sort-info">
	<p>Sorted by: {sort_col} {sort_dir}</p>
	</div>
	{html_table}
	"""

	    except Exception as e:
	        # If anything goes wrong, return a basic table with an error message
	        print(f"Error in update_leaderboard: {str(e)}")
	        # Keep the user's search filter in the fallback, but only when it is
	        # a plain string so the filter itself cannot be what fails here.
	        fallback_query = search_query if isinstance(search_query, str) else ""
	        df = create_grouped_leaderboard(selected_mwoz, selected_tau_airline, selected_tau_retail,
	                                        dict(default_sort), fallback_query)
	        html_table = generate_html_table(df)

	        return f"""
	<div class="sort-info" style="color: #ff6b6b;">
	<p>Error in sorting. Using default sort: Average Score (descending)</p>
	</div>
	{html_table}
	"""

	# Create our custom theme instance
	custom_theme = CustomTheme()

	# Build the page. The combined CSS string and the custom theme are handed to
	# gr.Blocks; the Blocks object is bound to `demo`, which is used further
	# down for demo.load(...) and demo.launch().
	with gr.Blocks(css=custom_css, title="TD-EVAL Leaderboard", theme=custom_theme) as demo:
	# Page heading followed by a one-line description (styled via the
	# "subtitle" CSS class).
	gr.Markdown("# 🏆 TD-EVAL Model Evaluation Leaderboard")
	gr.HTML('<div class="subtitle">This leaderboard displays aggregated model performance across multiple evaluation metrics.</div>')

	# Add JavaScript to ensure backgrounds are properly set.
	# The markup below defines fixBackgrounds() and schedules it three times
	# (after 500 ms, 1000 ms and 2000 ms). Each call:
	#   - appends a new <style> element forcing black text on block-info spans
	#     and on labels/spans inside the search panel,
	#   - sets the #checkbox-panel and #search-panel backgrounds to #F0F0F0,
	#   - makes the checkboxes and their parents transparent, colors their
	#     labels black, and adds an orange check mark span to checked boxes,
	#   - gives the search input a white background with a black border.
	# NOTE(review): whether a <script> tag placed inside gr.HTML is actually
	# executed depends on the Gradio version - confirm this runs in the
	# deployed app.
	# NOTE(review): the check mark is appended as a child of the <input>
	# element; verify that it is really rendered by the browser.
	gr.HTML("""
	<script>
	// Function to fix background colors
	function fixBackgrounds() {
	// Add a style tag to force all block-info spans to be black
	var styleEl = document.createElement('style');
	styleEl.textContent = `
	span[data-testid="block-info"] { color: #000000 !important; }
	.svelte-1gfkn6j { color: #000000 !important; }
	.search-panel label,
	.search-panel .label-wrap,
	.search-panel span,
	#search-panel span,
	div[id="search-panel"] span { color: #000000 !important; }
	`;
	document.head.appendChild(styleEl);

	// Only fix specific panels by ID
	var checkboxPanel = document.getElementById('checkbox-panel');
	if (checkboxPanel) {
	checkboxPanel.style.backgroundColor = '#F0F0F0';

	// Only make checkboxes and their direct containers transparent
	var checkboxes = checkboxPanel.querySelectorAll('input[type="checkbox"]');
	checkboxes.forEach(function(checkbox) {
	var parent = checkbox.parentElement;
	if (parent) parent.style.backgroundColor = 'transparent';
	checkbox.style.backgroundColor = 'transparent';

	// Find and style the associated label to be black
	var label = checkbox.nextElementSibling;
	if (label && label.tagName === 'LABEL') {
	label.style.color = '#000000';
	}

	// Also find any span elements that might contain the label text
	var spans = parent.querySelectorAll('span');
	spans.forEach(function(span) {
	span.style.color = '#000000';
	});

	// Find label elements in the parent container
	var labels = parent.querySelectorAll('label');
	labels.forEach(function(label) {
	label.style.color = '#000000';
	});

	// Apply custom styling for the checkbox to show orange checkmark
	if (checkbox.checked) {
	checkbox.style.position = 'relative';
	checkbox.style.appearance = 'none';
	checkbox.style.backgroundColor = '#F0F0F0';
	checkbox.style.border = '1px solid #CCCCCC';
	checkbox.style.borderRadius = '3px';

	// Create or update the checkmark element
	var checkmark = checkbox.querySelector('.orange-checkmark');
	if (!checkmark) {
	checkmark = document.createElement('span');
	checkmark.className = 'orange-checkmark';
	checkmark.style.position = 'absolute';
	checkmark.style.left = '50%';
	checkmark.style.top = '50%';
	checkmark.style.transform = 'translate(-50%, -50%)';
	checkmark.style.color = '#c34700';
	checkmark.style.fontSize = '14px';
	checkmark.style.fontWeight = 'bold';
	checkmark.innerText = '✓';
	checkbox.appendChild(checkmark);
	}
	}
	});
	}

	var searchPanel = document.getElementById('search-panel');
	if (searchPanel) {
	searchPanel.style.backgroundColor = '#F0F0F0';

	// Only make search input and its direct container transparent
	var searchInput = searchPanel.querySelector('input[type="text"]');
	if (searchInput) {
	var parent = searchInput.parentElement;
	if (parent) parent.style.backgroundColor = 'transparent';
	searchInput.style.backgroundColor = '#FFFFFF';
	// Ensure the border is visible and matches text color
	searchInput.style.border = '2px solid #000000';
	searchInput.style.color = '#000000';
	}

	// Make sure the label is black
	var searchLabels = searchPanel.querySelectorAll('label, .label-wrap, .label-wrap span');
	searchLabels.forEach(function(label) {
	label.style.color = '#000000';
	});

	// Target the specific span element that contains the label text
	var blockInfoSpans = document.querySelectorAll('span[data-testid="block-info"]');
	blockInfoSpans.forEach(function(span) {
	span.style.color = '#000000';
	});

	// Also target elements with the svelte class
	var svelteElements = document.querySelectorAll('.svelte-1gfkn6j');
	svelteElements.forEach(function(element) {
	if (element.textContent.includes('Search models')) {
	element.style.color = '#000000';
	}
	});
	}
	}

	// Run on page load and every second for 3 seconds to catch any delayed rendering
	setTimeout(fixBackgrounds, 500);
	setTimeout(fixBackgrounds, 1000);
	setTimeout(fixBackgrounds, 2000);
	</script>
	""")

	# Static explanation of the three variants and of the checkboxes below.
	gr.HTML('''
	<div class="variants_container">
	<div class="variants_title">Variants:</div>
	<ul style="list-style: none; padding: 0; margin: 8px 0;">
	<li>mwoz: Baseline variant.</li>
	<li>tau-airline: Airline specialty variant.</li>
	<li>tau-retail: Retail specialty variant.</li>
	</ul>
	<p>Use the checkboxes below to select which variants to include. At least one variant must be active.</p>
	</div>
	''')

	# Variant selection; all three start checked. The row carries both the
	# class and the id that the CSS and the injected script target.
	with gr.Row(elem_classes="checkbox-panel", elem_id="checkbox-panel"):
	cb_mwoz = gr.Checkbox(label="mwoz", value=True)
	cb_tau_airline = gr.Checkbox(label="tau-airline", value=True)
	cb_tau_retail = gr.Checkbox(label="tau-retail", value=True)

	# Free-text filter on the model name.
	with gr.Row(elem_classes="search-panel", elem_id="search-panel"):
	search_input = gr.Textbox(
	label="Search models",
	placeholder="Type to filter…",
	elem_classes="search-input",
	elem_id="search-input"
	)

	# Current sort column and direction, kept in gr.State; the initial value
	# matches the "▼" shown on the Average Score button below.
	hidden_sort_state = gr.State(value={"sort_by": "Average Score", "ascending": False})

	# Add sorting buttons
	gr.Markdown("### Sort by:")
	with gr.Row():
	btn_avg = gr.Button("Average Score ▼")
	btn_conv = gr.Button("Conversation Consistency")
	btn_backend = gr.Button("Backend Consistency")
	btn_policy = gr.Button("Policy Completeness")

	# Output area that receives the HTML returned by update_leaderboard.
	leaderboard_display = gr.HTML(label="Aggregated Model Rankings")

	# Function to toggle sort state and update button labels
	def toggle_sort(column, current_state, btn_avg, btn_conv, btn_backend, btn_policy):
	    """Return the new sort state and the four sort-button labels.

	    Clicking the column that is already active flips the direction; any
	    other column starts descending. The button-value arguments are accepted
	    because the event wiring passes them in, but they are not read.

	    Returns:
	        ``(new_state, avg_label, conv_label, backend_label, policy_label)``,
	        where only the label of ``column`` carries a direction arrow.
	    """
	    already_active = isinstance(current_state, dict) and current_state.get("sort_by") == column
	    if already_active:
	        new_ascending = not current_state.get("ascending", False)
	    else:
	        new_ascending = False

	    arrow = "▲" if new_ascending else "▼"
	    titles = (
	        "Average Score",
	        "Conversation Consistency",
	        "Backend Consistency",
	        "Policy Completeness",
	    )
	    # Only the clicked column's button shows the arrow.
	    labels = [f"{title} {arrow}" if title == column else title for title in titles]

	    return ({"sort_by": column, "ascending": new_ascending}, *labels)

	# Connect sort buttons with the toggle function. Every button gets the same
	# two-step chain: toggle_sort updates the state and the button labels, then
	# update_leaderboard re-renders the table. The column name reaches
	# toggle_sort through a hidden Textbox created per button.
	sort_buttons = (
	    (btn_avg, "Average Score"),
	    (btn_conv, "Conversation Consistency"),
	    (btn_backend, "Backend Consistency"),
	    (btn_policy, "Policy Completeness"),
	)
	for sort_button, column_name in sort_buttons:
	    sort_button.click(
	        fn=toggle_sort,
	        inputs=[gr.Textbox(value=column_name, visible=False), hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy],
	        outputs=[hidden_sort_state, btn_avg, btn_conv, btn_backend, btn_policy]
	    ).then(
	        fn=update_leaderboard,
	        inputs=[cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input],
	        outputs=leaderboard_display
	    )

	# Connect dataflow for variant checkboxes and search: a change to any of
	# these controls re-renders the leaderboard from the same five inputs.
	leaderboard_inputs = [cb_mwoz, cb_tau_airline, cb_tau_retail, hidden_sort_state, search_input]
	for control in (cb_mwoz, cb_tau_airline, cb_tau_retail, search_input):
	    # Each listener gets its own copy of the input list.
	    control.change(fn=update_leaderboard, inputs=list(leaderboard_inputs), outputs=leaderboard_display)

	# Render the table once when the page loads.
	demo.load(fn=update_leaderboard, inputs=list(leaderboard_inputs), outputs=leaderboard_display)

	# Launch the Gradio app only when this file is run as a script, not when it
	# is imported.
	if __name__ == "__main__":
	demo.launch()