Spaces:

lisabdunlap
/

Whatever-this-is

Sleeping

Whatever-this-is / lmmvibes /vis_gradio /overview_tab.py

Lisa Dunlap

restart

4862c84 4 months ago

4.47 kB

	"""Logic helpers for the Overview tab."""
	from typing import List

	from .state import app_state
	from .utils import compute_model_rankings_new, create_model_summary_card_new

	# Public API of this module: only the overview builder is exported.
	__all__ = ["create_overview"]


	def create_overview(
	    selected_models: List[str],
	    top_n: int,
	    score_significant_only: bool = False,
	    quality_significant_only: bool = False,
	    sort_by: str = "quality_asc",
	    min_cluster_size: int = 1,
	) -> str:
	    """Render the Overview tab as a single HTML string.

	    A plain-text prompt is returned instead of HTML when no metrics are
	    loaded, when ``selected_models`` is empty, or when none of the selected
	    models appear in the computed rankings.

	    Args:
	        selected_models: Names of the models to show; ranked models that are
	            not named here are dropped.
	        top_n: Forwarded unchanged to ``create_model_summary_card_new``.
	        score_significant_only: Forwarded unchanged to the card builder.
	        quality_significant_only: Forwarded unchanged to the card builder.
	        sort_by: Forwarded unchanged to the card builder.
	        min_cluster_size: Forwarded unchanged to the card builder.

	    Returns:
	        The explanatory header followed by one card per selected model (the
	        "all" entry first, the rest in ranking order), wrapped in one
	        ``<div>``; or one of the plain-text prompts described above.
	    """
	    metrics = app_state["metrics"]
	    if not metrics:
	        return "Please load data first using the 'Load Data' tab."

	    if not selected_models:
	        return "Please select at least one model to display."

	    # Only the model names are needed below; the per-model stats that come
	    # with each ranking entry are never read here.
	    ranked_names = [
	        name
	        for name, _stats in compute_model_rankings_new(metrics)
	        if name in selected_models
	    ]

	    # Stable sort on a boolean key: "all" (key False) floats to the front,
	    # while every other model keeps its ranking order.
	    ordered_names = sorted(ranked_names, key=lambda name: name != "all")

	    if not ordered_names:
	        return "No data available for selected models."

	    # Static header: intro paragraph plus a collapsible glossary of the
	    # metrics and significance tags shown on each card.
	    header_html = """
	<div style="max-width: 1600px; margin: 0 auto;">
	<p style="color: #666; margin-bottom: 10px;">
	Top distinctive clusters where each model shows unique behavioural patterns.
	Frequency shows what percentage of a model's battles resulted in that behavioural pattern.
	</p>

	<details style="margin-bottom:25px;">
	<summary style="cursor:pointer; color:#4c6ef5; font-weight:600;">ℹ️ What do "proportion delta", "Quality Δ", and significance tags mean?</summary>
	<div style="margin-top:12px; font-size:14px; line-height:1.5; color:#333;">
	<strong>Proportion Delta</strong><br>
	For each cluster we compute how often <em>this model</em> appears in that cluster compared with the average across all models.<br>
	• A positive value (e.g. <code>+0.15</code>) means the model hits the behaviour more often than average.<br>
	• A negative value (e.g. <code>-0.08</code>) means it appears less often.<br>
	It is derived from the <code>proportion_delta</code> field in <code>model_cluster_scores.json</code>.<br><br>
	<strong>Quality Δ</strong><br>
	The difference between the cluster's quality score(s) for this model and the model's <em>overall</em> quality baseline, shown for each individual metric (e.g., helpfulness, accuracy).<br>
	Positive values (green) indicate the model performs better than its average in that behaviour; negative values (red) indicate worse.<br>
	This is derived from the <code>quality_delta</code> metric dictionary in <code>model_cluster_scores.json</code>.<br><br>
	<strong>Significance Tags (FREQ/QUAL)</strong><br>
	The <span style="background: #28a745; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">FREQ</span> and <span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">QUAL</span> tags indicate <em>statistical significance</em> based on confidence intervals:<br>
	• <strong>FREQ</strong> (green): The proportion delta is statistically significant (confidence interval doesn't include zero)<br>
	• <strong>QUAL</strong> (blue): At least one quality metric delta is statistically significant<br>
	These tags help identify which behavioral patterns are reliably different from the model's baseline performance.
	</div>
	</details>
	"""

	    # One summary card per model, rendered in display order.
	    cards = [
	        create_model_summary_card_new(
	            model_name,
	            metrics,
	            top_n,
	            score_significant_only=score_significant_only,
	            quality_significant_only=quality_significant_only,
	            sort_by=sort_by,
	            min_cluster_size=min_cluster_size,
	        )
	        for model_name in ordered_names
	    ]

	    # The trailing "</div>" closes the max-width wrapper opened in the header.
	    return "".join([header_html, *cards, "</div>"])