Spaces:
Sleeping
Sleeping
| """Logic helpers for the **Overview** tab.""" | |
| from typing import List | |
| from .state import app_state | |
| from .utils import compute_model_rankings_new, create_model_summary_card_new | |
| __all__ = ["create_overview"] | |
def create_overview(
    selected_models: List[str],
    top_n: int,
    score_significant_only: bool = False,
    quality_significant_only: bool = False,
    sort_by: str = "quality_asc",
    min_cluster_size: int = 1,
) -> str:
    """Build the HTML shown in the Overview tab for the chosen models.

    The result is an explanatory header followed by one summary card per
    selected model. Cards follow the order produced by
    ``compute_model_rankings_new``, except that an entry named ``"all"``
    is always placed first.

    Args:
        selected_models: Names of the models to include. Ranked models
            whose name is not in this list are skipped.
        top_n: Passed positionally to ``create_model_summary_card_new``
            (presumably the number of clusters per card -- confirm
            against that helper).
        score_significant_only: Forwarded unchanged to the card builder.
        quality_significant_only: Forwarded unchanged to the card builder.
        sort_by: Forwarded unchanged to the card builder.
        min_cluster_size: Forwarded unchanged to the card builder.

    Returns:
        The assembled HTML string, or a short plain-text message when no
        metrics are loaded, no model is selected, or none of the selected
        models appear in the rankings.
    """
    # Guard clauses: nothing can be rendered without data and a selection.
    if not app_state["metrics"]:
        return "Please load data first using the 'Load Data' tab."
    if not selected_models:
        return "Please select at least one model to display."

    # Walk the global ranking once, keeping only selected models and
    # pinning the aggregate "all" entry ahead of the individually ranked ones.
    pinned_names = []
    ranked_names = []
    for name, _stats in compute_model_rankings_new(app_state["metrics"]):
        if name not in selected_models:
            continue
        if name == "all":
            pinned_names.append(name)
        else:
            ranked_names.append(name)

    display_order = pinned_names + ranked_names
    if not display_order:
        return "No data available for selected models."

    # Static header: short description plus a collapsible glossary.
    # The literal is kept flush-left so the emitted markup stays unchanged.
    intro_html = """
<div style="max-width: 1600px; margin: 0 auto;">
<p style="color: #666; margin-bottom: 10px;">
Top distinctive clusters where each model shows unique behavioural patterns.
Frequency shows what percentage of a model's battles resulted in that behavioural pattern.
</p>
<details style="margin-bottom:25px;">
<summary style="cursor:pointer; color:#4c6ef5; font-weight:600;">ℹ️ What do "proportion delta", "Quality Δ", and significance tags mean?</summary>
<div style="margin-top:12px; font-size:14px; line-height:1.5; color:#333;">
<strong>Proportion Delta</strong><br>
For each cluster we compute how often <em>this model</em> appears in that cluster compared with the average across all models.<br>
• A positive value (e.g. <code>+0.15</code>) means the model hits the behaviour more often than average.<br>
• A negative value (e.g. <code>-0.08</code>) means it appears less often.<br>
It is derived from the <code>proportion_delta</code> field in <code>model_cluster_scores.json</code>.<br><br>
<strong>Quality Δ</strong><br>
The difference between the cluster's quality score(s) for this model and the model's <em>overall</em> quality baseline, shown for each individual metric (e.g., helpfulness, accuracy).<br>
Positive values (green) indicate the model performs better than its average in that behaviour; negative values (red) indicate worse.<br>
This is derived from the <code>quality_delta</code> metric dictionary in <code>model_cluster_scores.json</code>.<br><br>
<strong>Significance Tags (FREQ/QUAL)</strong><br>
The <span style="background: #28a745; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">FREQ</span> and <span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">QUAL</span> tags indicate <em>statistical significance</em> based on confidence intervals:<br>
• <strong>FREQ</strong> (green): The proportion delta is statistically significant (confidence interval doesn't include zero)<br>
• <strong>QUAL</strong> (blue): At least one quality metric delta is statistically significant<br>
These tags help identify which behavioral patterns are reliably different from the model's baseline performance.
</div>
</details>
"""

    # One card per model, rendered in display order.
    cards = [
        create_model_summary_card_new(
            model_name,
            app_state["metrics"],
            top_n,
            score_significant_only=score_significant_only,
            quality_significant_only=quality_significant_only,
            sort_by=sort_by,
            min_cluster_size=min_cluster_size,
        )
        for model_name in display_order
    ]

    # The closing tag matches the wrapper <div> opened at the top of the header.
    return intro_html + "".join(cards) + "</div>"