import json

import gradio as gr
from datasets import load_dataset, get_dataset_split_names

# Load experiments.json to get model configurations.
# Expected shape (other fields are ignored):
#   {"<org>/<model>": {"benchmarks": {"<name>": {"subset": "<subset>"}, ...}}, ...}
with open('experiments.json', 'r') as f:
    EXPERIMENTS = json.load(f)

# Collect all unique benchmark subsets referenced in experiments.json
BENCHMARKS = []
for model_config in EXPERIMENTS.values():
    for benchmark in model_config['benchmarks'].values():
        subset = benchmark['subset']
        if subset not in BENCHMARKS:
            BENCHMARKS.append(subset)

REPO_OPTIONS = [
    "OpenEvals/details_gpt-4o_private",
    "OpenEvals/details_claude-3-7-sonnet-20250219_private",
    "OpenEvals/details_o3-mini-2025-01-31_private",
    "OpenEvals/details_moonshotai__Moonlight-16B-A3B-Instruct_private",
    "OpenEvals/details_meta-llama__Llama-3.3-70B-Instruct_private",
    "OpenEvals/details_deepseek-ai__DeepSeek-R1-Distill-Llama-70B_private",
    "OpenEvals/details_qihoo360__TinyR1-32B-Preview_private",
    "OpenEvals/details_openai__gpt-4.5-preview-2025-02-27_private",
    "OpenEvals/details_deepseek-ai__DeepSeek-R1-Distill-Qwen-32B_private",
    "OpenEvals/details_openai__deepseek-ai__DeepSeek-R1_private",
    "OpenEvals/details_Qwen__QwQ-32B_private",
    "OpenEvals/details_google__gemma-3-1b-it_private",
    "OpenEvals/details_google__gemma-3-12b-it_private",
    "OpenEvals/details_google__gemma-3-27b-it_private",
    "OpenEvals/details_openai__deepseek-ai__DeepSeek-V3-0324_private",
    "OpenEvals/details_openai__deepseek-ai__DeepSeek-V3_private",
    "OpenEvals/details_meta-llama__Llama-4-Scout-17B-16E-Instruct_private",
    "OpenEvals/details_meta-llama__Llama-4-Maverick-17B-128E-Instruct-FP8_private"
]


def get_model_name_from_repo(repo):
    # Extract the model name from a repository path, e.g.
    # "OpenEvals/details_meta-llama__Llama-4-Maverick-17B-128E-Instruct-FP8_private"
    # -> "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
    parts = repo.split('/')
    model_name = parts[1].replace('details_', '').replace('_private', '')
    # Convert double underscores back to forward slashes
    model_name = model_name.replace('__', '/')
    return model_name


def get_available_benchmarks(repo):
    model_name = get_model_name_from_repo(repo)
    if not model_name or model_name not in EXPERIMENTS:
        return []
    model_config = EXPERIMENTS[model_name]
    return [benchmark['subset'] for benchmark in model_config['benchmarks'].values()]


def get_available_splits(repo, benchmark):
    if not benchmark:
        return []
    return get_dataset_split_names(repo, config_name=benchmark.replace("|", "_").replace(":", "_"))


def load_details_and_results(repo, subset, split):
    def worker(example):
        example["predictions"] = example["predictions"]
        # gold is stored as a single-element list; unwrap it
        example["gold"] = example["gold"][0]
        example["metrics"] = example["metrics"]
        return example

    details = load_dataset(repo, subset.replace("|", "_").replace(":", "_"), split=split)
    results = load_dataset(repo, "results", split=split)
    results = eval(results[0]["results"])

    columns_to_keep = ['full_prompt', 'gold', 'metrics', 'predictions']
    details = details.select_columns(columns_to_keep)
    details = details.map(worker)

    return details, results


def update_splits(repo, benchmark):
    splits = get_available_splits(repo, benchmark)
    return gr.Dropdown(choices=splits, value=splits[0] if splits else None)


def display_model_details(repo_name, benchmark, split, example_index):
    try:
        # Load details for the specific model, benchmark and split
        details, _ = load_details_and_results(repo_name, benchmark, split)
        example = details[int(example_index)]
    except Exception as e:
        return f"Error loading model details: {str(e)}"
    # Create HTML output
    html_output = "<div style='max-width: 900px; margin: 0 auto;'>\n\n"

    # Ground truth section
    html_output += "<div style='background: #e6f3e6; padding: 20px; border-radius: 10px; margin-bottom: 20px;'>\n"
    html_output += "<h3 style='margin-top: 0;'>Ground Truth</h3>\n"
    html_output += "<div style='max-height: 300px; overflow-y: auto; background: #ffffff; padding: 10px; border-radius: 5px;'>\n"
    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'>{example['gold']}</pre>\n"
    html_output += "</div>\n"
    html_output += "</div>\n"

    # Model output section
    html_output += "<div style='background: #f5f5f5; padding: 20px; border-radius: 10px; margin-bottom: 20px;'>\n"
    html_output += f"<h2 style='margin-top: 0;'>{repo_name}</h2>\n"
    html_output += f"<p style='color: #666;'>Split: {split}</p>\n"

    # Prompt section
    html_output += "<div style='margin-bottom: 20px;'>\n"
    html_output += "<h3>Prompt</h3>\n"
    html_output += "<div style='max-height: 300px; overflow-y: auto; background: #ffffff; padding: 10px; border-radius: 5px;'>\n"
    prompt = example['full_prompt']
    if isinstance(prompt, list):
        for msg in prompt:
            if isinstance(msg, dict) and 'role' in msg and 'content' in msg:
                role = msg['role'].title()
                content = msg['content'].replace('<', '&lt;').replace('>', '&gt;')
                html_output += "<div style='margin-bottom: 10px;'>\n"
                html_output += f"<strong>{role}:</strong>\n"
                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'>{content}</pre>\n"
                html_output += "</div>\n"
            else:
                content = str(msg).replace('<', '&lt;').replace('>', '&gt;')
                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'>{content}</pre>\n"
    else:
        prompt_text = str(prompt).replace('<', '&lt;').replace('>', '&gt;')
        html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'>{prompt_text}</pre>\n"
    html_output += "</div>\n"
    html_output += "</div>\n\n"

    # Metrics section
    html_output += "<div style='margin-bottom: 20px;'>\n"
    html_output += "<h3>Metrics</h3>\n"
    metrics = example['metrics']
    if isinstance(metrics, str):
        metrics = eval(metrics)
    html_output += "<div style='overflow-x: auto;'>\n"
    html_output += "<table style='width: 100%; border-collapse: collapse;'>\n"
    for key, value in metrics.items():
        if isinstance(value, float):
            value = f"{value:.3f}"
        html_output += f"<tr><td style='padding: 5px; border-bottom: 1px solid #ddd;'>{key}</td><td style='padding: 5px; border-bottom: 1px solid #ddd;'>{value}</td></tr>\n"
    html_output += "</table>\n"
    html_output += "</div>\n"
    html_output += "</div>\n\n"

    # Prediction section
    prediction = example['predictions'][0] if example['predictions'] else ''
    html_output += "<div style='margin-bottom: 20px;'>\n"
    html_output += "<h3 style='display: inline;'>Prediction</h3> "
    word_count = len(prediction.split())
    html_output += f"<span style='color: #666; font-size: 0.8em;'>({word_count} words)</span>\n"
    html_output += "<div style='max-height: 600px; overflow-y: auto; background: #ffffff; padding: 10px; border-radius: 5px;'>\n"
    prediction = prediction.replace('<', '&lt;').replace('>', '&gt;')
    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'>{prediction}</pre>\n"
    html_output += "</div>\n"
    html_output += "</div>\n"
    html_output += "</div>\n"
    html_output += "</div>\n"
" return html_output # Create the Gradio interface with gr.Blocks(theme=gr.themes.Soft()) as demo: gr.Markdown("# Model Generation Details") gr.Markdown("View detailed outputs for a specific model") with gr.Row(): repo_select = gr.Radio( choices=["Choose from list", "Custom"], label="Repository Selection Method", value="Choose from list", info="Select how you want to specify the repository" ) with gr.Row(): repo_dropdown = gr.Dropdown( choices=REPO_OPTIONS, label="Repository Name", value=REPO_OPTIONS[0] if REPO_OPTIONS else None, visible=True, info="Select from predefined repositories" ) repo_custom = gr.Textbox( label="Custom Repository Name", placeholder="e.g. OpenEvals/details_custom_model_private", visible=False, info="Enter custom repository name" ) with gr.Row(): benchmark = gr.Dropdown( label="Benchmark", choices=[], info="Select the benchmark subset" ) split = gr.Dropdown( label="Split", choices=[], info="Select evaluation." ) with gr.Row(): example_index = gr.Number( label="Example Index", value=0, step=1, info="Navigate through different examples" ) submit_btn = gr.Button("Show Results", variant="primary") # Add this function to handle visibility toggling def toggle_repo_input(choice): return { repo_dropdown: gr.update(visible=(choice == "Choose from list")), repo_custom: gr.update(visible=(choice == "Custom")) } # Add this function to get the active repository name def get_active_repo(selection_method, dropdown_value, custom_value): return custom_value if selection_method == "Custom" else dropdown_value # Update the event handlers repo_select.change( fn=toggle_repo_input, inputs=[repo_select], outputs=[repo_dropdown, repo_custom] ) # Update the repository change handler to update available benchmarks def update_benchmarks(selection_method, dropdown_value, custom_value): repo = get_active_repo(selection_method, dropdown_value, custom_value) available_benchmarks = get_available_benchmarks(repo) print(available_benchmarks) return gr.Dropdown(choices=available_benchmarks, value=available_benchmarks[0] if available_benchmarks else None) repo_dropdown.change( fn=update_benchmarks, inputs=[repo_select, repo_dropdown, repo_custom], outputs=benchmark ) repo_custom.change( fn=update_benchmarks, inputs=[repo_select, repo_dropdown, repo_custom], outputs=benchmark ) # Update the benchmark change handler benchmark.change( fn=lambda selection_method, dropdown, custom, bench: update_splits( get_active_repo(selection_method, dropdown, custom), bench ), inputs=[repo_select, repo_dropdown, repo_custom, benchmark], outputs=split ) # Display results output = gr.HTML() submit_btn.click( fn=lambda selection_method, dropdown, custom, bench, split_val, idx: display_model_details( get_active_repo(selection_method, dropdown, custom), bench, split_val, idx ), inputs=[repo_select, repo_dropdown, repo_custom, benchmark, split, example_index], outputs=output ) if __name__ == "__main__": demo.launch()