Spaces:
Running
on
Zero
Running
on
Zero
import gradio as gr | |
import random | |
import pandas as pd | |
import os | |
import threading | |
import time | |
from utils.data_loader import get_random_example | |
from utils.models import generate_summaries, model_names | |
from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html | |
from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html | |
from utils.vote_logger import save_vote_details | |
from utils.shared import generation_interrupt # Import from shared module | |
# Feedback options for different voting outcomes.
# Reasons a voter can give when a model's answer was good.
_PRAISE_REASONS = (
    "More complete",
    "More accurate",
    "More relevant",
    "Better written",
    "Better refusal (if applicable)",
)
# Reasons a voter can give when a model's answer was bad.
_COMPLAINT_REASONS = (
    "Incomplete",
    "Hallucinate",
    "Irrelevant",
    "Incorrect refusal (if applicable)",
)


def _feedback_labels(model_letter, reasons):
    """Prefix every reason with the 'Model <letter>: ' tag shown in the UI."""
    return [f"Model {model_letter}: {reason}" for reason in reasons]


# Maps a vote outcome ('left', 'right', 'tie', 'neither') to its checkbox labels.
feedback_options = {
    "left": _feedback_labels("A", _PRAISE_REASONS),
    "right": _feedback_labels("B", _PRAISE_REASONS),
    "tie": _feedback_labels("A", _PRAISE_REASONS) + _feedback_labels("B", _PRAISE_REASONS),
    "neither": _feedback_labels("A", _COMPLAINT_REASONS) + _feedback_labels("B", _COMPLAINT_REASONS),
}
def load_context(set_interrupt=False):
    """
    Fetch a random example and build the UI updates that display it.

    Parameters:
    - set_interrupt: If True, raise the shared interrupt flag and pause briefly
      before loading, so in-flight inference can notice it and stop.

    Returns a list: the example, updates for the question text, the topic
    description, the context HTML and the context toggle button, followed by
    the "show full context" flag (always False for a fresh example).
    """
    if set_interrupt:
        generation_interrupt.set()
        # Short pause so worker threads get a chance to see the flag.
        time.sleep(0.2)

    # The flag is always lowered before new work starts.
    generation_interrupt.clear()

    example = get_random_example()

    topic = example.get('processed_context_desc', '')
    # Wrap a non-empty topic in its styled container; leave an empty one as is.
    topic_markup = (
        f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {topic}</div>"
        if topic
        else topic
    )

    show_full = False
    question_update = gr.update(value=example['question'])
    topic_update = gr.update(value=topic_markup, visible=bool(topic_markup))
    context_update = gr.update(value=get_context_html(example, show_full=show_full))
    toggle_update = gr.update(value="Show Full Context", elem_classes=["context-toggle-button"])

    return [
        example,
        question_update,
        topic_update,
        context_update,
        toggle_update,
        show_full,
    ]
def load_leaderboard():
    """Read the current leaderboard data and return it rendered as HTML."""
    return generate_leaderboard_html(load_leaderboard_data())
def generate_model_summaries(example):
    """
    Pick two distinct models at random and generate a summary from each.

    Returns a dict with keys "model_a", "model_b", "summary_a", "summary_b"
    and "completed". "completed" is True only when generation finished without
    the interrupt flag being raised and both summaries are non-empty.
    """
    outcome = dict(
        model_a="",
        model_b="",
        summary_a="",
        summary_b="",
        completed=False,
    )

    # Nothing to do if an interrupt was requested before we even started.
    if not generation_interrupt.is_set():
        try:
            name_a, name_b = random.sample(model_names, 2)

            # Record the chosen models first so a partial result still names them.
            outcome["model_a"], outcome["model_b"] = name_a, name_b

            text_a, text_b = generate_summaries(example, name_a, name_b)

            # Discard the output if an interrupt arrived during generation.
            if not generation_interrupt.is_set():
                outcome["summary_a"] = text_a
                outcome["summary_b"] = text_b
                outcome["completed"] = bool(text_a and text_b)
        except Exception as e:
            print(f"Error in generation: {e}")

    return outcome
def process_generation_result(result):
    """
    Turn a generation result dict into the 20 values the UI event chain expects.

    Parameters:
    - result: dict with "model_a", "model_b", "summary_a", "summary_b" and
      "completed", as produced by generate_model_summaries.

    Returns a list, in order: model A name, model B name, summary A text,
    summary B text, the reset winner (None), the reset feedback list ([]),
    the reset show-results flag (False), fresh leaderboard data, then UI
    updates for the two summary displays, the four vote buttons, the feedback
    checkboxes, the feedback section, the submit button, the results area,
    the "new question" button and the main interface container.

    Voting is enabled only when generation completed and both summaries are
    non-empty. Otherwise any missing summary is shown as a failure message.
    """
    failure_message = "Generation was interrupted or failed."

    summary_a = result.get("summary_a", "")
    summary_b = result.get("summary_b", "")
    succeeded = bool(result.get("completed") and summary_a and summary_b)

    def vote_button_update(*extra_classes):
        # Vote buttons are clickable only after a fully successful generation.
        return gr.update(interactive=succeeded, elem_classes=["vote-button", *extra_classes])

    return [
        result.get("model_a", ""),
        result.get("model_b", ""),
        summary_a,
        summary_b,
        None, [], False, load_leaderboard_data(),
        # `or` (not a .get default) is required here: the keys are always
        # present, so an empty summary must fall back to the failure message.
        gr.update(value=summary_a or failure_message),
        gr.update(value=summary_b or failure_message),
        vote_button_update(),
        vote_button_update(),
        vote_button_update(),
        vote_button_update("vote-button-neither"),
        gr.update(choices=[], value=[], interactive=False, visible=False),
        gr.update(visible=False),
        gr.update(interactive=False, visible=True),
        gr.update(visible=False),
        gr.update(interactive=True),
        gr.update(elem_classes=[]),
    ]
def process_example(example):
    """Generate summaries for the example and convert them into UI updates."""
    return process_generation_result(generate_model_summaries(example))
def select_vote_improved(winner_choice):
    """
    Build the UI updates for a vote selection.

    Returns a list: the chosen winner, updates that reveal and populate the
    feedback checkboxes, reveal the feedback section and enable the submit
    button, then one class update per vote button (A, B, tie, neither) with
    "selected" added to the button matching the choice.
    """
    checkbox_labels = feedback_options.get(winner_choice, [])

    # CSS classes per vote outcome; the chosen one gains "selected".
    classes_by_choice = {
        'left': ["vote-button"],
        'right': ["vote-button"],
        'tie': ["vote-button"],
        'neither': ["vote-button", "vote-button-neither"],
    }
    if winner_choice in classes_by_choice:
        classes_by_choice[winner_choice].append("selected")

    button_updates = [
        gr.update(elem_classes=classes_by_choice[key])
        for key in ('left', 'right', 'tie', 'neither')
    ]

    return [
        winner_choice,
        gr.update(choices=checkbox_labels, value=[], interactive=True, visible=True),
        gr.update(visible=True),
        gr.update(interactive=True),
        *button_updates,
    ]
def handle_vote_submission(example, m_a, m_b, winner, feedback, summary_a, summary_b, current_results):
    """
    Record a submitted vote and return the resulting UI updates.

    The vote details are saved first, then the Elo update is applied. The
    return value is whatever submit_vote_with_elo returns. When no winner has
    been selected, a warning is printed and an empty dict is returned.
    """
    if winner is not None:
        # Persist the full vote record before touching the ratings.
        save_vote_details(example, m_a, m_b, winner, feedback, summary_a, summary_b)
        return submit_vote_with_elo(m_a, m_b, winner, feedback, current_results)

    print("Warning: Submit called without a winner selected.")
    # NOTE(review): confirm Gradio treats an empty dict as "no updates" for
    # this event's outputs rather than as a missing-outputs error.
    return {}
def show_loading_state():
    """
    Return updates that put the summary area into a loading state.

    The first two updates replace both summary displays with a loading
    message; the remaining four disable the vote buttons (A, B, tie, neither).
    """
    loading_text = "Loading new question and summaries..."
    summary_updates = [gr.update(value=loading_text, interactive=False) for _ in range(2)]
    vote_button_updates = [gr.update(interactive=False) for _ in range(4)]
    return summary_updates + vote_button_updates
def handle_new_example_click():
    """Interrupt any running inference and return a freshly loaded example."""
    # load_context handles the interruption itself; only the example (its
    # first return value) is needed here.
    new_example, *_ = load_context(set_interrupt=True)
    return new_example
def update_ui_for_new_context(example):
    """
    Build the UI updates that display the given example.

    Returns a list: updates for the question text, the topic description,
    the context HTML (collapsed view) and the context toggle button, followed
    by False for the "show full context" flag.
    """
    topic = example.get('processed_context_desc', '')
    # Wrap a non-empty topic in its styled container; leave an empty one as is.
    topic_markup = (
        f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {topic}</div>"
        if topic
        else topic
    )

    question_update = gr.update(value=example['question'])
    topic_update = gr.update(value=topic_markup, visible=bool(topic_markup))
    context_update = gr.update(value=get_context_html(example, False))
    toggle_update = gr.update(value="Show Full Context", elem_classes=["context-toggle-button"])

    return [question_update, topic_update, context_update, toggle_update, False]
# Unload-event hook: stops outstanding work when a client goes away.
def cleanup_on_disconnect():
    """Log the disconnect and raise the shared interrupt flag."""
    print("Browser disconnected. Cleaning up resources...")
    # Setting the flag is enough; no sleep is needed because workers pick it
    # up on their next check.
    generation_interrupt.set()
# Create Gradio interface
# NOTE(review): the original indentation was not available, so the nesting of
# layout containers below was reconstructed from the code's meaning. Confirm
# the placement of `random_question_btn`, `submit_button` and the
# "Try Another Question" row against the deployed layout.
with gr.Blocks(theme=gr.themes.Default(
    primary_hue=gr.themes.colors.orange,
    secondary_hue=gr.themes.colors.slate
)) as demo:
    # Load the stylesheet from ./static relative to the working directory.
    css_path = os.path.join(os.getcwd(), 'static', 'styles.css')

    # An explicit encoding keeps decoding independent of the host locale.
    with open(css_path, 'r', encoding='utf-8') as f:
        css_content = f.read()

    # Inject the stylesheet into the page.
    gr.HTML(f"<style>{css_content}</style>")

    # Add JavaScript to handle browser unload events.
    # NOTE(review): no '/cleanup' route is registered in the visible part of
    # this file; confirm one exists, or rely on demo.unload below.
    unload_js = """
<script>
// This runs when the page is about to be closed or refreshed
window.addEventListener('beforeunload', function(e) {
// Send a synchronous request to the server
navigator.sendBeacon('/cleanup?session_id=' + window.gradioClientState.session_hash);
});
</script>
"""
    gr.HTML(unload_js)

    # State Variables
    current_example = gr.State({})
    model_a_name = gr.State("")
    model_b_name = gr.State("")
    summary_a_text = gr.State("")
    summary_b_text = gr.State("")
    selected_winner = gr.State(None)
    feedback_list = gr.State([])
    show_results_state = gr.State(False)
    results_agg = gr.State(load_leaderboard_data())
    show_full_context = gr.State(False)

    # Create Tabs
    with gr.Tabs() as tabs:
        # Main Arena Tab
        with gr.TabItem("Arena", id="arena-tab"):
            gr.Markdown("# RAG SLM Summarizer/Generator Arena")
            # Multi-line Markdown text is kept flush-left so it cannot be
            # read as an indented code block.
            gr.Markdown("""
1️⃣ Review the query and examine the highlighted context (✨ highlights contain key information! )\n
2️⃣ Compare answers generated by two different models side-by-side\n
3️⃣ Vote for the better response or select 'Tie/Neither' if appropriate""")
            gr.HTML("<hr>")

            # Main container
            with gr.Column(elem_id="main-interface-area") as main_interface_area:
                # Query section
                with gr.Row(elem_id="query-title-row"):
                    gr.Markdown("### 💬 Query (What Users Want to Ask About the Doc)", elem_classes="section-heading")

                with gr.Row(elem_id="query-container"):
                    with gr.Row(elem_classes="query-box-row"):
                        query_display = gr.Markdown(value="Loading question...", elem_classes="query-text", elem_id="query-section")
                        random_question_btn = gr.Button("🔄 Try a New Question", elem_classes="query-button")

                # Context description and display
                context_description = gr.Markdown("", elem_classes="context-description")
                gr.HTML("<hr>")

                with gr.Row(elem_id="context-header-row"):
                    gr.Markdown("### 📋 Context (Relevant Information We Got from the Database)", elem_classes="context-title")
                    context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])

                context_display = gr.HTML(value="Loading context...", label="Context Chunks")

                gr.Markdown("---")
                gr.Markdown("### 🔍 Compare Answers from Models", elem_classes="section-heading")

                # Model summaries - Add ID for JavaScript to target and disable autoscroll
                with gr.Row(elem_id="summary-containers"):
                    with gr.Column(scale=1):
                        with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
                            summary_a_display = gr.Textbox(
                                label="Model A",
                                lines=10,
                                interactive=False,
                                show_copy_button=True,
                                autoscroll=False,  # Disable auto-scrolling
                                elem_id="summary-a-display"
                            )
                    with gr.Column(scale=1):
                        with gr.Group(elem_classes=["summary-card", "summary-card-b"]):
                            summary_b_display = gr.Textbox(
                                label="Model B",
                                lines=10,
                                interactive=False,
                                show_copy_button=True,
                                autoscroll=False,  # Disable auto-scrolling
                                elem_id="summary-b-display"
                            )

                gr.HTML("<hr>")

                # Voting section (buttons start disabled until summaries are ready)
                gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading")
                with gr.Row():
                    vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False)
                    vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"], interactive=False)
                    vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False)
                    vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False)

                # Feedback and Submit sections
                with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
                    feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)

                submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button")

                # Results area
                with gr.Column(visible=False) as results_reveal_area:
                    gr.Markdown("---")
                    gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading")

                    # Model reveal section
                    with gr.Row():
                        with gr.Column(scale=1):
                            gr.Markdown("### Model A was:", elem_classes="section-heading")
                            model_a_reveal = gr.Markdown("", elem_classes="model-reveal model-a-reveal")
                        with gr.Column(scale=1):
                            gr.Markdown("### Model B was:", elem_classes="section-heading")
                            model_b_reveal = gr.Markdown("", elem_classes="model-reveal model-b-reveal")

                    gr.HTML("<hr>")

                    # Try another button
                    with gr.Row(elem_classes=["control-buttons"]):
                        try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn")

        # Leaderboard Tab
        with gr.TabItem("Leaderboard", id="leaderboard-tab"):
            gr.Markdown("# RAG SLM Summarizer/Generator Leaderboard", elem_classes="orange-title")
            gr.Markdown("View performance statistics for all models ranked by Elo rating.")

            with gr.Group(elem_id="leaderboard-info"):
                gr.Markdown("""### About Elo Ratings
The Elo rating system provides a more accurate ranking than simple win rates:
- All models start at 1500 points
- Points are exchanged after each comparison based on the expected outcome
- Beating a stronger model earns more points than beating a weaker one
- The ± value shows the statistical confidence interval (95%)
""")

            results_table_display = gr.HTML(label="Model Performance")

    # Event handling

    # Components written by process_example, in the order of the 20-item list
    # it returns. Shared by the initial load and the "new question" buttons.
    generation_outputs = [
        model_a_name, model_b_name, summary_a_text, summary_b_text,
        selected_winner, feedback_list, show_results_state, results_agg,
        summary_a_display, summary_b_display, vote_button_a, vote_button_b,
        vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section,
        submit_button, results_reveal_area, random_question_btn, main_interface_area
    ]

    # Toggle context display
    context_toggle_btn.click(
        fn=toggle_context_display,
        inputs=[current_example, show_full_context],
        outputs=[show_full_context, context_display, context_toggle_btn]
    )

    # Initial loading - context first, then summaries.
    # Uses load_context without interruption since it's the first load.
    demo.load(
        fn=load_context,  # Default is set_interrupt=False
        inputs=[],
        outputs=[current_example, query_display, context_description, context_display,
                 context_toggle_btn, show_full_context]
    ).then(
        fn=process_example,
        inputs=[current_example],
        outputs=generation_outputs
    )

    # Load leaderboard content on app start
    demo.load(
        fn=load_leaderboard,
        inputs=[],
        outputs=[results_table_display]
    )

    # Use a single event chain for each button, structured to update UI first, then run inference
    for btn in [random_question_btn, try_another_btn]:
        btn.click(
            # Step 1: Show loading state immediately
            fn=show_loading_state,
            inputs=[],
            outputs=[summary_a_display, summary_b_display, vote_button_a,
                     vote_button_b, vote_button_tie, vote_button_neither]
        ).then(
            # Step 2: Get new example
            fn=handle_new_example_click,
            inputs=[],
            outputs=[current_example]
        ).then(
            # Step 3: Update context UI immediately
            fn=update_ui_for_new_context,
            inputs=[current_example],
            outputs=[query_display, context_description, context_display,
                     context_toggle_btn, show_full_context]
        ).then(
            # Step 4: Then process example for model outputs
            fn=process_example,
            inputs=[current_example],
            outputs=generation_outputs
        )

    # Vote button handlers
    for btn, choice in zip(
        [vote_button_a, vote_button_b, vote_button_tie, vote_button_neither],
        ['left', 'right', 'tie', 'neither']
    ):
        btn.click(
            # `choice=choice` binds the current value; a plain closure would
            # see only the last loop value.
            fn=lambda choice=choice: select_vote_improved(choice),
            inputs=None,
            outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button,
                     vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
        )

    # Update feedback when checkboxes change
    feedback_checkboxes.change(
        fn=update_feedback,
        inputs=[feedback_checkboxes],
        outputs=[feedback_list]
    )

    # Process vote submission and reveal results
    submit_button.click(
        fn=handle_vote_submission,
        inputs=[current_example, model_a_name, model_b_name, selected_winner, feedback_list, summary_a_text, summary_b_text, results_agg],
        outputs=[show_results_state, results_agg, vote_button_a, vote_button_b,
                 vote_button_tie, vote_button_neither, feedback_checkboxes,
                 feedback_section, submit_button, results_reveal_area,
                 random_question_btn, results_table_display, main_interface_area,
                 context_toggle_btn, model_a_reveal, model_b_reveal]
    )

    # Refresh leaderboard when switching to the leaderboard tab
    tabs.select(
        fn=load_leaderboard,
        inputs=[],
        outputs=[results_table_display],
        api_name="refresh_leaderboard"
    )

    # Register unload event for browser disconnections
    demo.unload(cleanup_on_disconnect)
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)