"""VLMVibeEval: a Gradio app for browsing VLM responses on curated examples.

Two viewing modes are offered:

* model-wise: pick a model and a category and page through its responses;
* example-wise: pick an example and compare the responses of all models.
"""

import gradio as gr
import gradio.themes.base
from utils import *
from data_utils import *
from datasets import load_dataset

# --- Data loading (runs at import time and hits the Hugging Face Hub). -------
ds = load_dataset("visionLMsftw/vibe-testing-samples", split="train")
evaluation_data = get_evaluation_data(ds)
ds_results = load_dataset("visionLMsftw/vibe-testing-results", split="train")
# NOTE(review): this value is overwritten further down by the keys of
# `model_params`; the call is kept in case the helper is relied on elsewhere.
models = get_model_names(ds_results)
responses = get_responses(ds_results)

# Model id -> parameter count in billions, as displayed and filtered in the UI.
model_params = {
    "Qwen/Qwen2.5-VL-32B-Instruct": 32,
    "google/gemma-3-27b-it": 27,
    "meta-llama/Llama-4-Maverick-17B-128E-Instruct": 17,
    "Qwen/Qwen2.5-VL-7B-Instruct": 7,
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct": 2.2,
}

# Number of examples rendered per "Load more" page.
BATCH_SIZE = 5

# Labels of the two viewing modes; also used to decide which column is shown.
MODE_BY_MODEL = "View model-wise responses"
MODE_BY_EXAMPLE = "Compare model responses on a specific example"

# Intro text. The numbered steps are placed on their own lines so that
# Markdown renders them as an ordered list; the wording itself is unchanged.
INTRO_MARKDOWN = (
    "A lightweight leaderboard for evaluating Vision Language Models (VLMs) "
    "— based on vibes. 🌞 Traditional benchmarks don't give concrete signal "
    "for your use case and models are often saturated over them. Instead, we "
    "let you **vibe test** models across curated, in-the-wild examples:\n"
    "\n"
    "1. Predefined categories with images and prompts.\n"
    "2. Check any model on these examples.\n"
    "3. Explore the generations and judge for yourself, as different models "
    "have different styles and strengths.\n"
    "\n"
    "🗣️ This is not about scores — it's about *how it feels*. You can submit "
    "new models in the community tab and we'll shortly update the app! 🤗"
)


def filter_models_by_param(min_params):
    """Restrict the model dropdown to models with at least *min_params* B params.

    Returns a ``gr.update`` that replaces the dropdown choices and selects the
    first remaining model (or ``None`` when nothing matches).
    """
    filtered_models = [
        name for name, size in model_params.items() if size >= min_params
    ]
    selected = filtered_models[0] if filtered_models else None
    return gr.update(choices=filtered_models, value=selected)


def display_model_details(model_name):
    """Return a short HTML summary (provider, size, link) for *model_name*.

    Models that are not listed in ``model_params`` yield a plain
    "No info available." message.
    """
    if model_name not in model_params:
        return "No info available."

    size = model_params[model_name]
    provider = model_name.split("/")[0] if "/" in model_name else "Unknown"
    link = f"https://huggingface.co/{model_name}"
    # The result is shown in a gr.HTML component, so the link is emitted as a
    # real anchor instead of repeating the bare model id.
    return (
        f"\n\nProvider: {provider} | Size: {size}B | "
        f'Link: <a href="{link}" target="_blank" '
        f'rel="noopener noreferrer">{model_name}</a>\n\n'
    )


def _category_size(category):
    """Count the evaluation examples that belong to *category*."""
    return sum(1 for ex in evaluation_data if ex["category"] == category)


def load_initial(model, category):
    """Render the first page of *model*'s responses for *category*.

    Returns, in order: the HTML to display, the index of the next example to
    load, the accumulated HTML (kept in state), a visibility update for the
    "Load more" button, and the model details HTML.
    """
    # presumably returns the HTML for one batch of examples — the helper is
    # defined outside this file.
    html = display_model_responses_html(
        evaluation_data,
        responses,
        model,
        start_index=0,
        batch_size=BATCH_SIZE,
        category=category,
    )
    has_more = BATCH_SIZE < _category_size(category)
    model_info_html = display_model_details(model)
    return html, BATCH_SIZE, html, gr.update(visible=has_more), model_info_html


def load_more(model, index, html_so_far, category):
    """Append the next page of responses to what is already displayed.

    Returns, in order: the combined HTML, the index of the next example to
    load, the combined HTML again (for the state component), and a visibility
    update for the "Load more" button.
    """
    new_html = display_model_responses_html(
        evaluation_data,
        responses,
        model,
        start_index=index,
        batch_size=BATCH_SIZE,
        category=category,
    )
    updated_html = html_so_far + new_html
    new_index = index + BATCH_SIZE
    has_more = new_index < _category_size(category)
    return updated_html, new_index, updated_html, gr.update(visible=has_more)


# The UI only offers models whose size is known.
models = list(model_params.keys())
default_category = evaluation_data[0]["category"]
default_example_id = evaluation_data[0]["id"]

# De-duplicate while keeping first-seen order, so the dropdown order is the
# same on every start (iterating a set of strings is not stable across runs).
categories = list(dict.fromkeys(ex["category"] for ex in evaluation_data))

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# VLMVibeEval")
    gr.Markdown(INTRO_MARKDOWN)

    mode = gr.Radio(
        [MODE_BY_MODEL, MODE_BY_EXAMPLE],
        label="Mode",
        value=MODE_BY_MODEL,
    )

    # ---- Mode 1: browse one model's responses, page by page. ---------------
    with gr.Column(visible=True) as model_mode:
        param_slider = gr.Slider(
            minimum=2,
            maximum=32,
            step=1,
            label="Minimum model parameters (B)",
        )
        # NOTE(review): no explicit `value`; the initial selection relies on
        # Gradio's default for Dropdown — confirm for the pinned version.
        selected_model = gr.Dropdown(models, label="Choose model")
        model_info_box = gr.HTML()
        param_slider.change(
            filter_models_by_param,
            inputs=param_slider,
            outputs=selected_model,
        )

        model_category = gr.Dropdown(
            choices=categories,
            label="Category",
            value=default_category,
        )
        model_output = gr.HTML()
        current_index = gr.State(value=0)
        current_html = gr.State(value="")

        more_button = gr.Button("Load more")

        # Changing the model or the category, and the initial page load, all
        # reset the view to the first page.
        for register in (selected_model.change, model_category.change, demo.load):
            register(
                load_initial,
                inputs=[selected_model, model_category],
                outputs=[
                    model_output,
                    current_index,
                    current_html,
                    more_button,
                    model_info_box,
                ],
            )

        more_button.click(
            load_more,
            inputs=[selected_model, current_index, current_html, model_category],
            outputs=[model_output, current_index, current_html, more_button],
        )

    # ---- Mode 2: compare all models on one example. ------------------------
    with gr.Column(visible=False) as example_mode:
        category = gr.Dropdown(
            choices=categories,
            label="Category",
            value=default_category,
        )
        example = gr.Dropdown(
            label="Example",
            value=default_example_id,
            choices=get_examples_by_category(evaluation_data, default_category),
        )
        example_display = gr.HTML()

        # NOTE(review): only the choices are refreshed here; the previously
        # selected example stays selected until the user picks a new one.
        category.change(
            lambda c: gr.update(
                choices=get_examples_by_category(evaluation_data, c)
            ),
            category,
            example,
        )
        example.change(
            fn=lambda ex_id: display_example_responses_html(
                evaluation_data, responses, models, ex_id
            ),
            inputs=example,
            outputs=example_display,
        )
        demo.load(
            fn=lambda: display_example_responses_html(
                evaluation_data, responses, models, default_example_id
            ),
            inputs=None,
            outputs=example_display,
        )

    def switch_mode(selected):
        """Show the column that matches the selected mode and hide the other."""
        return {
            model_mode: gr.update(visible=selected == MODE_BY_MODEL),
            example_mode: gr.update(visible=selected == MODE_BY_EXAMPLE),
        }

    mode.change(switch_mode, mode, [model_mode, example_mode])

    gr.HTML(r""" """)

demo.launch()