from datasets import load_dataset, get_dataset_split_names
import json
import gradio as gr
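# Gradio app for browsing per-example evaluation details (prompt, gold answer,
# metrics, and model prediction) stored in Hugging Face dataset repositories.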
# Load experiments.json to get model configurations
with open('experiments.json', 'r') as f:
EXPERIMENTS = json.load(f)
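# The lookups below assume experiments.json maps model names to configs shaped
# roughly like this (an illustrative sketch; keys other than 'benchmarks' and
# 'subset' are hypothetical):
# {
#     "org/model-name": {
#         "benchmarks": {
#             "task_key": {"subset": "suite|task|0", ...},
#             ...
#         }
#     }
# }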
# Get all unique benchmark subsets from experiments.json
BENCHMARKS = []
for model_config in EXPERIMENTS.values():
for benchmark in model_config['benchmarks'].values():
subset = benchmark['subset']
if subset not in BENCHMARKS:
BENCHMARKS.append(subset)
# Predefined repository options shown in the dropdown
REPO_OPTIONS = [
"OpenEvals/details_gpt-4o_private",
"OpenEvals/details_claude-3-7-sonnet-20250219_private",
"OpenEvals/details_o3-mini-2025-01-31_private",
"OpenEvals/details_moonshotai__Moonlight-16B-A3B-Instruct_private",
"OpenEvals/details_meta-llama__Llama-3.3-70B-Instruct_private",
"OpenEvals/details_deepseek-ai__DeepSeek-R1-Distill-Llama-70B_private",
"OpenEvals/details_qihoo360__TinyR1-32B-Preview_private",
"OpenEvals/details_openai__gpt-4.5-preview-2025-02-27_private",
"OpenEvals/details_deepseek-ai__DeepSeek-R1-Distill-Qwen-32B_private",
"OpenEvals/details_openai__deepseek-ai__DeepSeek-R1_private",
"OpenEvals/details_Qwen__QwQ-32B_private",
"OpenEvals/details_google__gemma-3-1b-it_private",
"OpenEvals/details_google__gemma-3-12b-it_private",
"OpenEvals/details_google__gemma-3-27b-it_private",
"OpenEvals/details_openai__deepseek-ai__DeepSeek-V3-0324_private",
"OpenEvals/details_openai__deepseek-ai__DeepSeek-V3_private",
"OpenEvals/details_meta-llama__Llama-4-Scout-17B-16E-Instruct_private",
"OpenEvals/details_meta-llama__Llama-4-Maverick-17B-128E-Instruct-FP8_private"
]
def get_model_name_from_repo(repo):
# Extract model name from repository path
# Example: "OpenEvals/details_meta-llama__Llama-4-Maverick-17B-128E-Instruct-FP8_private"
# -> "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
parts = repo.split('/')
model_name = parts[1].replace('details_', '').replace('_private', '')
# Convert double underscores back to forward slashes
model_name = model_name.replace('__', '/')
return model_name
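# Example (inverse of the "__" encoding used in detail repo names):
#   get_model_name_from_repo("OpenEvals/details_google__gemma-3-27b-it_private")
#   -> "google/gemma-3-27b-it"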
def get_available_benchmarks(repo):
model_name = get_model_name_from_repo(repo)
print(model_name)
if not model_name or model_name not in EXPERIMENTS:
return []
model_config = EXPERIMENTS[model_name]
print(model_config)
return [benchmark['subset'] for benchmark in model_config['benchmarks'].values()]
def get_available_splits(repo, benchmark):
if not benchmark:
return []
return get_dataset_split_names(repo, config_name=benchmark.replace("|", "_").replace(":", "_"))
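# Benchmark subsets such as "suite|task|0" are mapped to dataset config names
# by replacing "|" and ":" with "_", matching how the detail configs are stored.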
def load_details_and_results(repo, subset, split):
    def worker(example):
        # "gold" is stored as a list of reference answers; keep only the first one
        example["gold"] = example["gold"][0]
        return example
details = load_dataset(repo, subset.replace("|", "_").replace(":", "_"), split=split)
results = load_dataset(repo, "results", split=split)
results = eval(results[0]["results"])
columns_to_keep = ['full_prompt', 'gold', 'metrics', 'predictions']
details = details.select_columns(columns_to_keep)
details = details.map(worker)
return details, results
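# Minimal usage sketch (the subset and split values here are hypothetical;
# real ones come from get_available_benchmarks / get_available_splits):
#   details, results = load_details_and_results(
#       "OpenEvals/details_google__gemma-3-27b-it_private",
#       "extended|ifeval|0",
#       "2025_03_27T10_00_00",
#   )
#   print(details[0]["gold"], results)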
def update_splits(repo, benchmark):
splits = get_available_splits(repo, benchmark)
return gr.Dropdown(choices=splits, value=splits[0] if splits else None)
def display_model_details(repo_name, benchmark, split, example_index):
try:
# Load details for the specific model, benchmark and split
details, _ = load_details_and_results(repo_name, benchmark, split)
        # gr.Number passes a float; cast to int before indexing into the dataset
        example = details[int(example_index)]
except Exception as e:
return f"Error loading model details: {str(e)}"
# Create HTML output
html_output = "
\n\n"
# Ground Truth section
html_output += "
\n"
html_output += "
Ground Truth
\n"
html_output += "
\n"
html_output += f"
{example['gold']}
\n"
html_output += "
\n"
html_output += "
\n"
# Model output section
html_output += "
\n"
html_output += f"
{repo_name}
\n"
html_output += f"
Split: {split}
\n"
# Prompt section
html_output += "
\n"
html_output += "Prompt
\n"
html_output += "\n"
html_output += "
\n"
prompt = example['full_prompt']
if isinstance(prompt, list):
for msg in prompt:
if isinstance(msg, dict) and 'role' in msg and 'content' in msg:
role = msg['role'].title()
content = msg['content'].replace('<', '<').replace('>', '>')
html_output += f"
\n"
html_output += f"
{role}:\n"
html_output += f"
{content}
\n"
html_output += "
\n"
else:
content = str(msg).replace('<', '<').replace('>', '>')
html_output += f"
{content}
\n"
else:
prompt_text = str(prompt).replace('<', '<').replace('>', '>')
html_output += f"
{prompt_text}
\n"
html_output += "
\n"
html_output += "
\n"
html_output += " \n\n"
# Metrics section
html_output += "
\n"
html_output += "Metrics
\n"
metrics = example['metrics']
if isinstance(metrics, str):
metrics = eval(metrics)
html_output += "\n"
html_output += "
\n"
for key, value in metrics.items():
if isinstance(value, float):
value = f"{value:.3f}"
html_output += f"{key} | {value} |
\n"
html_output += "
\n"
html_output += "
\n"
html_output += " \n\n"
# Prediction section
prediction = example['predictions'][0] if example['predictions'] else ''
html_output += "
\n"
html_output += "Prediction
"
word_count = len(prediction.split())
html_output += f"({word_count} words)"
html_output += "
\n"
html_output += "\n"
html_output += "
\n"
prediction = prediction.replace('<', '<').replace('>', '>')
html_output += f"
{prediction}
\n"
html_output += "
\n"
html_output += "
\n"
html_output += " \n"
html_output += "
\n
"
return html_output
# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown("# Model Generation Details")
gr.Markdown("View detailed outputs for a specific model")
with gr.Row():
repo_select = gr.Radio(
choices=["Choose from list", "Custom"],
label="Repository Selection Method",
value="Choose from list",
info="Select how you want to specify the repository"
)
with gr.Row():
repo_dropdown = gr.Dropdown(
choices=REPO_OPTIONS,
label="Repository Name",
value=REPO_OPTIONS[0] if REPO_OPTIONS else None,
visible=True,
info="Select from predefined repositories"
)
repo_custom = gr.Textbox(
label="Custom Repository Name",
placeholder="e.g. OpenEvals/details_custom_model_private",
visible=False,
info="Enter custom repository name"
)
with gr.Row():
benchmark = gr.Dropdown(
label="Benchmark",
choices=[],
info="Select the benchmark subset"
)
split = gr.Dropdown(
label="Split",
choices=[],
info="Select evaluation."
)
with gr.Row():
example_index = gr.Number(
label="Example Index",
value=0,
step=1,
info="Navigate through different examples"
)
submit_btn = gr.Button("Show Results", variant="primary")
    # Toggle which repository input is visible based on the selection method
def toggle_repo_input(choice):
return {
repo_dropdown: gr.update(visible=(choice == "Choose from list")),
repo_custom: gr.update(visible=(choice == "Custom"))
}
    # Resolve the active repository name from the current selection method
def get_active_repo(selection_method, dropdown_value, custom_value):
return custom_value if selection_method == "Custom" else dropdown_value
# Update the event handlers
repo_select.change(
fn=toggle_repo_input,
inputs=[repo_select],
outputs=[repo_dropdown, repo_custom]
)
    # Refresh the available benchmarks when the selected repository changes
def update_benchmarks(selection_method, dropdown_value, custom_value):
repo = get_active_repo(selection_method, dropdown_value, custom_value)
available_benchmarks = get_available_benchmarks(repo)
print(available_benchmarks)
return gr.Dropdown(choices=available_benchmarks, value=available_benchmarks[0] if available_benchmarks else None)
repo_dropdown.change(
fn=update_benchmarks,
inputs=[repo_select, repo_dropdown, repo_custom],
outputs=benchmark
)
repo_custom.change(
fn=update_benchmarks,
inputs=[repo_select, repo_dropdown, repo_custom],
outputs=benchmark
)
    # Refresh the available splits when the benchmark changes
benchmark.change(
fn=lambda selection_method, dropdown, custom, bench: update_splits(
get_active_repo(selection_method, dropdown, custom),
bench
),
inputs=[repo_select, repo_dropdown, repo_custom, benchmark],
outputs=split
)
# Display results
output = gr.HTML()
submit_btn.click(
fn=lambda selection_method, dropdown, custom, bench, split_val, idx: display_model_details(
get_active_repo(selection_method, dropdown, custom),
bench,
split_val,
idx
),
inputs=[repo_select, repo_dropdown, repo_custom, benchmark, split, example_index],
outputs=output
)
if __name__ == "__main__":
demo.launch()