import ast
import json
from typing import Any

import gradio as gr
from datasets import Dataset, load_dataset

from env import MODELS, ORG_NAME, TASK

KNOWN_METRIC_LABELS = {
    "accuracy": "Accuracy",
    "accuracy_stderr": "Accuracy (stderr)",
}


def aggregate_results() -> list:
    """Extract scores for each model and return a list of result dictionaries."""
    all_results = []
    for model_path in MODELS:
        try:
            path = f"{ORG_NAME}/details_{model_path.replace('/', '__')}_private"
            dataset = load_dataset(path, "results", split="latest")
            config = json.loads(dataset["config_general"][0])
            results = json.loads(dataset["results"][0])

            _, model = model_path.split("/")
            duration = round(config["end_time"] - config["start_time"], 2)

            result = {
                "Model": model,
                "Duration (s)": duration,
            }

            for metric, metric_values in results.items():
                if metric == "all":
                    continue

                for raw_metric_name, metric_value in metric_values.items():
                    # Strip any parenthesized qualifier, e.g. "accuracy (custom)" -> "accuracy"
                    base_name = raw_metric_name.split("(")[0].strip()
                    pretty_label = KNOWN_METRIC_LABELS.get(base_name, raw_metric_name)

                    if isinstance(metric_value, float):
                        metric_value = round(metric_value, 3)

                    result[pretty_label] = metric_value

            all_results.append(result)

        except Exception as e:
            print(f"Error processing {model_path} {ORG_NAME}: {e}")

    # Sort final results by accuracy, best first
    all_results.sort(key=lambda r: r.get("Accuracy", 0), reverse=True)
    return all_results


def extract_dataviz() -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]]]:
    """Extract the best, worst, and all samples for visualization."""
    sample_index_map = {}

    for model_path in MODELS:
        try:
            dataset_path = f"{ORG_NAME}/details_{model_path.replace('/', '__')}_private"
            split_name = f"custom_{TASK.replace('/', '_')}_0"
            dataset = load_dataset(dataset_path, split_name, split="latest")

            for idx, row in enumerate(dataset):
                prompt = row["full_prompt"]
                gold = row.get("gold", "")
                gold = gold[0] if isinstance(gold, list) and gold else gold
                score = list(row["metrics"].values())[0]
                predictions = row.get("predictions", [])
                prediction = predictions[0] if predictions else ""

                if idx not in sample_index_map:
                    sample_index_map[idx] = {
                        "ix": idx,
                        "prompt": prompt,
                        "gold": gold,
                        "model_scores": [],
                        "models": [],
                    }

                if model_path not in sample_index_map[idx]["models"]:
                    sample_index_map[idx][f"{model_path}_score"] = row["metrics"]
                    sample_index_map[idx][f"{model_path}_prediction"] = prediction
                    sample_index_map[idx]["model_scores"].append(score)
                    sample_index_map[idx]["models"].append(model_path)

        except Exception as e:
            print(f"Error processing {model_path}: {e}")

    all_samples = sorted(sample_index_map.values(), key=lambda r: r["ix"])
    # Hard samples: every model scored 0; easy samples: every model scored 1
    hard_samples = [sample for sample in all_samples if sum(sample["model_scores"]) == 0]
    easy_samples = [sample for sample in all_samples if sum(sample["model_scores"]) == len(sample["model_scores"])]

    return easy_samples, hard_samples, all_samples


def samples_to_box_display(samples: list[dict[str, Any]], example_index: int = 0) -> str:
    """Render one sample as an HTML box per model.

    Adapted from Nathan's code:
    https://huggingface.co/spaces/SaylorTwift/OpenEvalsModelDetails/
    Supports both light and dark themes.
    """
    if not samples:
        return "No samples in this category!"

    sample = samples[example_index]
    outputs = []
    for model in sample["models"]:
        try:
            outputs.append({
                "Model": model,
                "Prediction": sample[f"{model}_prediction"],
                "Prompt": sample["prompt"],
                "Metrics": sample[f"{model}_score"],
                "Gold": sample["gold"],
            })
        except (KeyError, IndexError):
            continue

    if not outputs:
        return "No results found for the selected combination."

    # CSS for theme compatibility: neutral borders and no hard-coded text
    # colors, so the boxes render well on light and dark backgrounds
    css = """
    <style>
    .sample-box { border: 1px solid rgba(128, 128, 128, 0.4); border-radius: 8px; padding: 12px; margin: 12px 0; }
    .sample-header { font-weight: bold; margin-bottom: 8px; }
    .sample-content { white-space: pre-wrap; word-break: break-word; }
    .muted { opacity: 0.7; font-size: 0.9em; }
    </style>
    """

    # Create HTML output with all models
    html_output = f"{css}<div>\n\n"

    # Show gold answer at the top with distinct styling
    if outputs:
        html_output += "<div class='sample-box'>\n"
        html_output += "<div class='sample-header'>Ground Truth</div>\n"
        html_output += "<div class='sample-content'>\n"
        html_output += f"<pre>{outputs[0]['Gold']}</pre>\n"
        html_output += "</div>\n"
        html_output += "</div>\n"

    for output in outputs:
        html_output += "<div class='sample-box'>\n"
        html_output += f"<div class='sample-header'>{output['Model']}</div>\n"

        # Format metrics as a clean table
        html_output += "<div>\n"
        html_output += "<div class='sample-header'>Metrics</div>\n"
\n" metrics = output["Metrics"] if isinstance(metrics, str): metrics = eval(metrics) html_output += "
\n" html_output += "\n" for key, value in metrics.items(): if isinstance(value, float): value = f"{value:.3f}" html_output += f"\n" html_output += "
{key}{value}
\n" html_output += "
\n" html_output += "
\n\n" # Handle prompt formatting with better styling html_output += "
\n" html_output += "

Prompt

\n" html_output += "
\n" prompt_text = output["Prompt"] if isinstance(prompt_text, list): for i, msg in enumerate(prompt_text): if isinstance(msg, dict) and "content" in msg: role = msg.get("role", "message").title() html_output += "
\n" html_output += f"{role}:\n" html_output += "
\n" html_output += f"
{msg['content']}
\n" html_output += "
\n" html_output += "
\n" else: html_output += "
\n" html_output += "
\n" html_output += f"
{json.dumps(msg, indent=2)}
\n" html_output += "
\n" html_output += "
\n" else: html_output += "
\n" if isinstance(prompt_text, dict) and "content" in prompt_text: html_output += f"
{prompt_text['content']}
\n" else: html_output += f"
{prompt_text}
\n" html_output += "
\n" html_output += "
\n" html_output += "
\n\n" # Style prediction output - now in a collapsible section html_output += "
\n" html_output += "

Prediction

" # Add word count in a muted style word_count = len(output["Prediction"].split()) html_output += f"({word_count} words)" html_output += "
\n" html_output += "
\n" html_output += "
\n" html_output += f"
{output['Prediction']}
\n" html_output += "
\n" html_output += "
\n" html_output += "
\n" html_output += "
\n\n" html_output += "
" return html_output def run_pipeline(samples_ix: int = 0) -> tuple[Any, Any, Any, Any]: """Run evaluation pipeline and return results for display""" results = aggregate_results() easy_samples, hard_samples, all_samples = extract_dataviz() return ( gr.Dataframe(Dataset.from_list(results).to_pandas(), visible=True), gr.HTML( samples_to_box_display(easy_samples, samples_ix), label="Easiest samples (always found)", visible=True, ), gr.HTML( samples_to_box_display(hard_samples, samples_ix), label="Hardest samples (always failed)", visible=True, ), gr.HTML( samples_to_box_display(all_samples, samples_ix), label="All samples", visible=True, ), ) def update_examples(samples_ix: int = 0) -> tuple[str, str, str]: """Return HTML strings for easy, hard, and all samples""" easy_samples, hard_samples, all_samples = extract_dataviz() return ( samples_to_box_display(easy_samples, samples_ix), samples_to_box_display(hard_samples, samples_ix), samples_to_box_display(all_samples, samples_ix), )