import json

import gradio as gr
from datasets import load_dataset, get_dataset_split_names

# Load experiments.json to get model configurations
with open('experiments.json', 'r') as f:
    EXPERIMENTS = json.load(f)

# Get all unique benchmark subsets from experiments.json
BENCHMARKS = []
for model_config in EXPERIMENTS.values():
    for benchmark in model_config['benchmarks'].values():
        subset = benchmark['subset']
        if subset not in BENCHMARKS:
            BENCHMARKS.append(subset)
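
# Note: the access pattern above and in get_available_benchmarks() assumes experiments.json
# maps model names to configs roughly like this (illustrative sketch, not the actual file):
# {
#   "meta-llama/Llama-3.3-70B-Instruct": {
#     "benchmarks": {
#       "task_name": {"subset": "suite|task|0"}
#     }
#   }
# }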

# Predefined detail repositories selectable from the dropdown
REPO_OPTIONS = [
    "OpenEvals/details_gpt-4o_private",
    "OpenEvals/details_claude-3-7-sonnet-20250219_private",
    "OpenEvals/details_o3-mini-2025-01-31_private",
    "OpenEvals/details_moonshotai__Moonlight-16B-A3B-Instruct_private",
    "OpenEvals/details_meta-llama__Llama-3.3-70B-Instruct_private",
    "OpenEvals/details_deepseek-ai__DeepSeek-R1-Distill-Llama-70B_private",
    "OpenEvals/details_qihoo360__TinyR1-32B-Preview_private",
    "OpenEvals/details_openai__gpt-4.5-preview-2025-02-27_private",
    "OpenEvals/details_deepseek-ai__DeepSeek-R1-Distill-Qwen-32B_private",
    "OpenEvals/details_openai__deepseek-ai__DeepSeek-R1_private",
    "OpenEvals/details_Qwen__QwQ-32B_private",
    "OpenEvals/details_google__gemma-3-1b-it_private",
    "OpenEvals/details_google__gemma-3-12b-it_private",
    "OpenEvals/details_google__gemma-3-27b-it_private",
    "OpenEvals/details_openai__deepseek-ai__DeepSeek-V3-0324_private",
    "OpenEvals/details_openai__deepseek-ai__DeepSeek-V3_private",
    "OpenEvals/details_meta-llama__Llama-4-Scout-17B-16E-Instruct_private",
    "OpenEvals/details_meta-llama__Llama-4-Maverick-17B-128E-Instruct-FP8_private"
]
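
# Repository names follow the pattern "OpenEvals/details_<org>__<model>_private";
# get_model_name_from_repo() below reverses that encoding to recover "<org>/<model>".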


def get_model_name_from_repo(repo):
    # Extract the model name from a repository path, e.g.
    # "OpenEvals/details_meta-llama__Llama-4-Maverick-17B-128E-Instruct-FP8_private"
    # -> "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
    parts = repo.split('/')
    model_name = parts[1].replace('details_', '').replace('_private', '')
    # Convert double underscores back to forward slashes
    model_name = model_name.replace('__', '/')
    return model_name


def get_available_benchmarks(repo):
    # Return the benchmark subsets configured for this repository's model
    model_name = get_model_name_from_repo(repo)
    if not model_name or model_name not in EXPERIMENTS:
        return []
    model_config = EXPERIMENTS[model_name]
    return [benchmark['subset'] for benchmark in model_config['benchmarks'].values()]


def get_available_splits(repo, benchmark):
    if not benchmark:
        return []
    # Dataset config names use "_" where the benchmark subset contains "|" or ":"
    return get_dataset_split_names(repo, config_name=benchmark.replace("|", "_").replace(":", "_"))
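
# For example, a hypothetical subset name "suite|task:variant|0" would map to the
# dataset config "suite_task_variant_0" when loading splits or details below.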


def load_details_and_results(repo, subset, split):
    def worker(example):
        # Keep only the first gold answer for display
        example["gold"] = example["gold"][0]
        return example

    details = load_dataset(repo, subset.replace("|", "_").replace(":", "_"), split=split)
    results = load_dataset(repo, "results", split=split)
    # The "results" column holds a serialized dict of aggregate scores
    results = eval(results[0]["results"])

    columns_to_keep = ['full_prompt', 'gold', 'metrics', 'predictions']
    details = details.select_columns(columns_to_keep)
    details = details.map(worker)

    return details, results


def update_splits(repo, benchmark):
    splits = get_available_splits(repo, benchmark)
    return gr.Dropdown(choices=splits, value=splits[0] if splits else None)


def display_model_details(repo_name, benchmark, split, example_index):
    try:
        # Load details for the specific model, benchmark and split
        details, _ = load_details_and_results(repo_name, benchmark, split)
        example = details[int(example_index)]
    except Exception as e:
        return f"Error loading model details: {str(e)}"

    # Create HTML output
    html_output = "<div style='max-width: 800px; margin: 0 auto;'>\n\n"

    # Ground Truth section
    html_output += "<div style='background: #e6f3e6; padding: 20px; border-radius: 10px; margin-bottom: 20px;'>\n"
    html_output += "<h3 style='margin-top: 0;'>Ground Truth</h3>\n"
    html_output += "<div style='overflow-x: auto; max-width: 100%;'>\n"
    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{example['gold']}</code></pre>\n"
    html_output += "</div>\n"
    html_output += "</div>\n"

    # Model output section
    html_output += "<div style='background: #f5f5f5; padding: 20px; margin-bottom: 20px; border-radius: 10px;'>\n"
    html_output += f"<h2 style='margin-top: 0;'>{repo_name}</h2>\n"
    html_output += f"<p style='color: #666;'>Split: {split}</p>\n"

    # Prompt section
    html_output += "<details style='margin-bottom: 15px;'>\n"
    html_output += "<summary><h3 style='display: inline; margin: 0;'>Prompt</h3></summary>\n"
    html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
    html_output += "<div style='overflow-x: auto;'>\n"
    prompt = example['full_prompt']
    if isinstance(prompt, list):
        # Chat-style prompts: render each message with its role
        for msg in prompt:
            if isinstance(msg, dict) and 'role' in msg and 'content' in msg:
                role = msg['role'].title()
                content = msg['content'].replace('<', '&lt;').replace('>', '&gt;')
                html_output += "<div style='margin-bottom: 10px;'>\n"
                html_output += f"<strong>{role}:</strong>\n"
                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0; background: #f8f8f8; padding: 10px; border-radius: 5px;'><code>{content}</code></pre>\n"
                html_output += "</div>\n"
            else:
                content = str(msg).replace('<', '&lt;').replace('>', '&gt;')
                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0; background: #f8f8f8; padding: 10px; border-radius: 5px;'><code>{content}</code></pre>\n"
    else:
        # Plain-text prompt
        prompt_text = str(prompt).replace('<', '&lt;').replace('>', '&gt;')
        html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0; background: #f8f8f8; padding: 10px; border-radius: 5px;'><code>{prompt_text}</code></pre>\n"
    html_output += "</div>\n"
    html_output += "</div>\n"
    html_output += "</details>\n\n"

    # Metrics section
    html_output += "<details open style='margin-bottom: 15px;'>\n"
    html_output += "<summary><h3 style='display: inline; margin: 0;'>Metrics</h3></summary>\n"
    metrics = example['metrics']
    if isinstance(metrics, str):
        metrics = eval(metrics)
    html_output += "<div style='overflow-x: auto;'>\n"
    html_output += "<table style='width: 100%; margin: 10px 0; border-collapse: collapse;'>\n"
    for key, value in metrics.items():
        if isinstance(value, float):
            value = f"{value:.3f}"
        html_output += f"<tr><td style='padding: 5px; border-bottom: 1px solid #ddd;'><strong>{key}</strong></td><td style='padding: 5px; border-bottom: 1px solid #ddd;'>{value}</td></tr>\n"
    html_output += "</table>\n"
    html_output += "</div>\n"
    html_output += "</details>\n\n"

    # Prediction section
    prediction = example['predictions'][0] if example['predictions'] else ''
    html_output += "<details open style='margin-bottom: 15px;'>\n"
    html_output += "<summary><h3 style='display: inline; margin: 0;'>Prediction</h3>"
    word_count = len(prediction.split())
    html_output += f"<span style='color: #666; font-size: 0.8em; margin-left: 10px;'>({word_count} words)</span>"
    html_output += "</summary>\n"
    html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
    html_output += "<div style='overflow-x: auto;'>\n"
    prediction = prediction.replace('<', '&lt;').replace('>', '&gt;')
    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{prediction}</code></pre>\n"
    html_output += "</div>\n"
    html_output += "</div>\n"
    html_output += "</details>\n"

    html_output += "</div>\n</div>"
    return html_output


# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Model Generation Details")
    gr.Markdown("View detailed outputs for a specific model")

    with gr.Row():
        repo_select = gr.Radio(
            choices=["Choose from list", "Custom"],
            label="Repository Selection Method",
            value="Choose from list",
            info="Select how you want to specify the repository"
        )

    with gr.Row():
        repo_dropdown = gr.Dropdown(
            choices=REPO_OPTIONS,
            label="Repository Name",
            value=REPO_OPTIONS[0] if REPO_OPTIONS else None,
            visible=True,
            info="Select from predefined repositories"
        )
        repo_custom = gr.Textbox(
            label="Custom Repository Name",
            placeholder="e.g. OpenEvals/details_custom_model_private",
            visible=False,
            info="Enter custom repository name"
        )

    with gr.Row():
        benchmark = gr.Dropdown(
            label="Benchmark",
            choices=[],
            info="Select the benchmark subset"
        )
        split = gr.Dropdown(
            label="Split",
            choices=[],
            info="Select the evaluation split"
        )

    with gr.Row():
        example_index = gr.Number(
            label="Example Index",
            value=0,
            step=1,
            info="Navigate through different examples"
        )
        submit_btn = gr.Button("Show Results", variant="primary")

    # Toggle which repository input is visible
    def toggle_repo_input(choice):
        return {
            repo_dropdown: gr.update(visible=(choice == "Choose from list")),
            repo_custom: gr.update(visible=(choice == "Custom"))
        }

    # Resolve the repository name from whichever input is active
    def get_active_repo(selection_method, dropdown_value, custom_value):
        return custom_value if selection_method == "Custom" else dropdown_value

    # Show the matching repository input when the selection method changes
    repo_select.change(
        fn=toggle_repo_input,
        inputs=[repo_select],
        outputs=[repo_dropdown, repo_custom]
    )

    # When the repository changes, refresh the available benchmarks
    def update_benchmarks(selection_method, dropdown_value, custom_value):
        repo = get_active_repo(selection_method, dropdown_value, custom_value)
        available_benchmarks = get_available_benchmarks(repo)
        return gr.Dropdown(choices=available_benchmarks, value=available_benchmarks[0] if available_benchmarks else None)

    repo_dropdown.change(
        fn=update_benchmarks,
        inputs=[repo_select, repo_dropdown, repo_custom],
        outputs=benchmark
    )
    repo_custom.change(
        fn=update_benchmarks,
        inputs=[repo_select, repo_dropdown, repo_custom],
        outputs=benchmark
    )

    # When the benchmark changes, refresh the available splits
    benchmark.change(
        fn=lambda selection_method, dropdown, custom, bench: update_splits(
            get_active_repo(selection_method, dropdown, custom),
            bench
        ),
        inputs=[repo_select, repo_dropdown, repo_custom, benchmark],
        outputs=split
    )

    # Display results
    output = gr.HTML()
    submit_btn.click(
        fn=lambda selection_method, dropdown, custom, bench, split_val, idx: display_model_details(
            get_active_repo(selection_method, dropdown, custom),
            bench,
            split_val,
            idx
        ),
        inputs=[repo_select, repo_dropdown, repo_custom, benchmark, split, example_index],
        outputs=output
    )


if __name__ == "__main__":
    demo.launch()
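
# To try this locally (assuming the file is saved as app.py, `gradio` and `datasets`
# are installed, and an experiments.json file sits next to the script):
#   python app.py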