import gradio as gr import gradio.themes.base from utils import * from data_utils import * from datasets import load_dataset ds = load_dataset("visionLMsftw/vibe-testing-samples", split="train") evaluation_data = get_evaluation_data(ds) ds_results = load_dataset("visionLMsftw/vibe-testing-results", split="train") models = get_model_names(ds_results) responses = get_responses(ds_results) default_category = evaluation_data[0]["category"] default_example_id = evaluation_data[0]["id"] with gr.Blocks(theme=gr.themes.Soft()) as demo: gr.Markdown("# VLMVibeEval") gr.Markdown("VLM evaluation leaderboard based on vibes.") mode = gr.Radio(["View model-wise responses", "Compare model responses on a specific example"], label="Mode", value="View model-wise responses") with gr.Column(visible=True) as model_mode: selected_model = gr.Dropdown(models, label="Choose model") model_category = gr.Dropdown( choices=list(set(ex["category"] for ex in evaluation_data)), label="Category", value=default_category ) model_output = gr.HTML() current_index = gr.State(value=0) current_html = gr.State(value="") def load_initial(model, category): filtered_data = [ex for ex in evaluation_data if ex["category"] == category] html = display_model_responses_html(evaluation_data, responses, model, start_index=0, batch_size=5, category=category) has_more = 5 < len(filtered_data) return html, 5, html, gr.update(visible=has_more) def load_more(model, index, html_so_far, category): filtered_data = [ex for ex in evaluation_data if ex["category"] == category] new_html = display_model_responses_html(evaluation_data, responses, model, start_index=index, batch_size=5, category=category) updated_html = html_so_far + new_html new_index = index + 5 has_more = new_index < len(filtered_data) return updated_html, new_index, updated_html, gr.update(visible=has_more) more_button = gr.Button("Load more") selected_model.change( load_initial, inputs=[selected_model, model_category], outputs=[model_output, current_index, current_html, more_button] ) model_category.change( load_initial, inputs=[selected_model, model_category], outputs=[model_output, current_index, current_html, more_button] ) demo.load( load_initial, inputs=[selected_model, model_category], outputs=[model_output, current_index, current_html, more_button] ) more_button.click( load_more, inputs=[selected_model, current_index, current_html, model_category], outputs=[model_output, current_index, current_html, more_button] ) with gr.Column(visible=False) as example_mode: category = gr.Dropdown( choices=list(set(ex["category"] for ex in evaluation_data)), label="Category", value=default_category ) example = gr.Dropdown( label="Example", value=default_example_id, choices=get_examples_by_category(evaluation_data, default_category) ) example_display = gr.HTML() category.change(lambda c: gr.update(choices=get_examples_by_category(evaluation_data, c)), category, example) example.change( fn=lambda ex_id: display_example_responses_html(evaluation_data, responses, models, ex_id), inputs=example, outputs=example_display ) demo.load(fn=lambda: display_example_responses_html(evaluation_data, responses, models, default_example_id), inputs=None, outputs=example_display) def switch_mode(selected): return { model_mode: gr.update(visible=selected == "View model-wise responses"), example_mode: gr.update(visible=selected == "Compare model responses on a specific example"), } mode.change(switch_mode, mode, [model_mode, example_mode]) gr.HTML(r"""