import json
from typing import Any

import gradio as gr
from datasets import Dataset, load_dataset

from env import TASK, MODELS, ORG_NAME

KNOWN_METRIC_LABELS = {
    "accuracy": "Accuracy",
    "accuracy_stderr": "Accuracy (stderr)",
}


def aggregate_results() -> list:
    """Extract scores for each model and return list of result dictionaries."""
    all_results = []

    for model_path in MODELS:
        try:
            path = f"{ORG_NAME}/details_{model_path.replace('/', '__')}_private"
            dataset = load_dataset(path, "results", split="latest")
            config = json.loads(dataset["config_general"][0])
            results = json.loads(dataset["results"][0])

            _, model = model_path.split("/")
            duration = round(config["end_time"] - config["start_time"], 2)

            result = {
                "Model": model,
                "Duration (s)": duration,
            }

            for metric, metric_values in results.items():
                if metric == "all":
                    continue

                for raw_metric_name, metric_value in metric_values.items():
                    base_name = raw_metric_name.split("(")[0].strip()
                    pretty_label = KNOWN_METRIC_LABELS.get(base_name, raw_metric_name)

                    if isinstance(metric_value, float):
                        metric_value = round(metric_value, 3)

                    result[pretty_label] = metric_value

            all_results.append(result)

        except Exception as e:
            print(f"Error processing {model_path} {ORG_NAME}: {e}")

    # Sort final result by Accuracy
    all_results.sort(key=lambda r: r.get("Accuracy", 0), reverse=True)

    return all_results


def extract_dataviz() -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]]]:
    """Extract best, worst, and all samples for visualization"""
    sample_index_map = {}

    for model_path in MODELS:
        try:
            dataset_path = f"{ORG_NAME}/details_{model_path.replace('/', '__')}_private"
            split_name = f"custom_{TASK.replace('/', '_')}_0"
            dataset = load_dataset(dataset_path, split_name, split="latest")

            for idx, row in enumerate(dataset):
                prompt = row["full_prompt"]
                gold = row.get("gold", "")
                gold = gold[0] if isinstance(gold, list) and gold else gold
                score = list(row["metrics"].values())[0]
                predictions = row.get("predictions", [])
                prediction = predictions[0] if predictions else ""

                if idx not in sample_index_map:
                    sample_index_map[idx] = {
                        "ix": idx,
                        "prompt": prompt,
                        "gold": gold,
                        "model_scores": [],
                        "models": [],
                    }

                if model_path not in sample_index_map[idx]["models"]:
                    sample_index_map[idx][f"{model_path}_score"] = row["metrics"]
                    sample_index_map[idx][f"{model_path}_prediction"] = prediction
                    sample_index_map[idx]["model_scores"].append(score)
                    sample_index_map[idx]["models"].append(model_path)

        except Exception as e:
            print(f"Error processing {model_path}: {e}")

    all_samples = sorted(sample_index_map.values(), key=lambda r: r["ix"])

    hard_samples = [sample for sample in all_samples if sum(sample["model_scores"]) == 0]
    easy_samples = [
        sample for sample in all_samples if sum(sample["model_scores"]) == len(sample["model_scores"])
    ]

    return easy_samples, hard_samples, all_samples


def samples_to_box_display(samples: list[dict[str, Any]], example_index: int = 0) -> str:
    """
    Adapted from Nathan's code https://huggingface.co/spaces/SaylorTwift/OpenEvalsModelDetails/
    Support both light and dark themes
    """
    if not samples:
        return "No samples in this category!"

    sample = samples[example_index]
    outputs = []

    for model in sample["models"]:
        try:
            outputs.append({
                "Model": model,
                "Prediction": sample[f"{model}_prediction"],
                "Prompt": sample["prompt"],
                "Metrics": sample[f"{model}_score"],
                "Gold": sample["gold"],
            })
        except (KeyError, IndexError):
            continue

    if not outputs:
        return "No results found for the selected combination."
    # CSS for theme compatibility: Gradio theme variables with plain fallbacks so the
    # boxes render correctly in both light and dark mode (exact styling is illustrative)
    css = """
    <style>
        .sample-box { border: 1px solid var(--border-color-primary, #ccc); border-radius: 8px; padding: 12px; margin-bottom: 12px; }
        .sample-box pre { white-space: pre-wrap; word-break: break-word; background: var(--background-fill-secondary, #f6f6f6); padding: 8px; border-radius: 4px; }
        .sample-box table { border-collapse: collapse; width: 100%; }
        .sample-box td { border: 1px solid var(--border-color-primary, #ccc); padding: 4px 8px; }
    </style>
    """

    # Create HTML output with all models
    html_output = f"{css}<div>\n"

    # The gold answer is identical across models, so show it once at the top
    html_output += "<div class='sample-box'>\n"
    html_output += "<h3>Gold Answer</h3>\n"
    html_output += f"<pre>{outputs[0]['Gold']}</pre>\n"
    html_output += "</div>\n"

    for output in outputs:
        html_output += "<div class='sample-box'>\n"
        html_output += f"<h3>{output['Model']}</h3>\n"

        # Metrics table: one row per metric (key | value)
        html_output += "<table>\n"
        for key, value in output["Metrics"].items():
            if isinstance(value, float):
                value = round(value, 3)
            html_output += f"<tr><td>{key}</td><td>{value}</td></tr>\n"
        html_output += "</table>\n"

        # Prompt: either a list of chat messages, a single message dict, or plain text
        html_output += "<h4>Prompt</h4>\n"
        prompt_text = output["Prompt"]
        if isinstance(prompt_text, list):
            for msg in prompt_text:
                if isinstance(msg, dict) and "content" in msg:
                    html_output += f"<p><strong>{msg.get('role', 'message')}</strong></p>\n"
                    html_output += f"<pre>{msg['content']}</pre>\n"
                else:
                    # Fall back to the raw message when it is not a standard chat dict
                    html_output += f"<pre>{json.dumps(msg, indent=2)}</pre>\n"
        elif isinstance(prompt_text, dict) and "content" in prompt_text:
            html_output += f"<pre>{prompt_text['content']}</pre>\n"
        else:
            html_output += f"<pre>{prompt_text}</pre>\n"

        # Model prediction
        html_output += "<h4>Prediction</h4>\n"
        html_output += f"<pre>{output['Prediction']}</pre>\n"
        html_output += "</div>\n"

    html_output += "</div>"
    return html_output