Haoguang Cai committed on
Commit
8a142a6
·
1 Parent(s): e2dea9b

add UI and data processing

Browse files
app.py CHANGED
@@ -1,102 +1,368 @@
1
  import gradio as gr
2
  import random
3
- import json
4
  import os
5
- from datetime import datetime
6
-
7
- # This would be replaced with your actual SLM integration
8
- def generate_response(query, context, model_name):
9
- """Placeholder function to generate response from an SLM"""
10
- return f"This is a placeholder response from {model_name} based on query: {query} and context: {context}"
11
-
12
- def save_evaluation(query, context, model_a, model_b, response_a, response_b, preference):
13
- """Save evaluation results to a JSON file"""
14
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
15
- evaluation = {
16
- "timestamp": timestamp,
17
- "query": query,
18
- "context": context,
19
- "models": {
20
- "left": model_a,
21
- "right": model_b
22
- },
23
- "responses": {
24
- "left": response_a,
25
- "right": response_b
26
- },
27
- "preference": preference
28
- }
29
-
30
- # Create directory if it doesn't exist
31
- os.makedirs("evaluations", exist_ok=True)
32
-
33
- # Save to a file
34
- with open(f"evaluations/eval_{timestamp.replace(' ', '_').replace(':', '-')}.json", "w") as f:
35
- json.dump(evaluation, f, indent=2)
36
 
37
- return "Evaluation saved successfully!"
 
 
38
 
39
- def process_query(query, context, model_a="SLM-A", model_b="SLM-B"):
40
- """Process query and generate responses from two models"""
41
- # Generate responses
42
- response_a = generate_response(query, context, model_a)
43
- response_b = generate_response(query, context, model_b)
44
 
45
- # Randomly swap to avoid position bias
46
- if random.random() > 0.5:
47
- return response_a, response_b, model_a, model_b
48
- else:
49
- return response_b, response_a, model_b, model_a
50
-
51
- def submit_evaluation(query, context, response_left, response_right, preference, model_left, model_right):
52
- """Submit and save the evaluation"""
53
- if not preference:
54
- return "Please select a preference before submitting."
55
 
56
- save_evaluation(query, context, model_left, model_right, response_left, response_right, preference)
57
- return "Thank you for your evaluation!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- with gr.Blocks(title="SLM-RAG Arena") as app:
60
- gr.Markdown("# SLM-RAG Arena")
61
- gr.Markdown("Compare responses from different models for RAG tasks.")
 
62
 
63
- with gr.Row():
64
- with gr.Column():
65
- query_input = gr.Textbox(label="Query", placeholder="Enter your query here...")
66
- context_input = gr.Textbox(label="Context", placeholder="Enter context information here...", lines=5)
67
- generate_btn = gr.Button("Generate Responses")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
- # Hidden state variables
70
- model_left = gr.State("")
71
- model_right = gr.State("")
72
-
73
- with gr.Row():
74
- with gr.Column():
75
- gr.Markdown("### Response A")
76
- response_left = gr.Textbox(label="", lines=10, interactive=False)
77
- with gr.Column():
78
- gr.Markdown("### Response B")
79
- response_right = gr.Textbox(label="", lines=10, interactive=False)
80
-
81
- with gr.Row():
82
- preference = gr.Radio(
83
- choices=["Prefer Left", "Tie", "Prefer Right", "Neither"],
84
- label="Which response do you prefer?"
85
- )
86
-
87
- submit_btn = gr.Button("Submit Evaluation")
88
- result = gr.Textbox(label="Result")
89
-
90
- generate_btn.click(
91
- process_query,
92
- inputs=[query_input, context_input],
93
- outputs=[response_left, response_right, model_left, model_right]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  )
95
 
96
- submit_btn.click(
97
- submit_evaluation,
98
- inputs=[query_input, context_input, response_left, response_right, preference, model_left, model_right],
99
- outputs=[result]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  )
101
 
102
- app.launch()
 
 
1
  import gradio as gr
2
  import random
3
+ import pandas as pd
4
  import os
5
+ from utils.data_loader import get_random_example
6
+ from utils.models import generate_summaries, model_names
7
+ from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
8
+ from utils.leaderboard import load_leaderboard_data, save_leaderboard_data
9
+
10
# Read CSS from file.
# The stylesheet is located relative to this module instead of the current
# working directory, so the app still starts when launched from another folder.
css_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static', 'styles.css')
# Decode explicitly as UTF-8 rather than relying on the platform default.
with open(css_path, 'r', encoding='utf-8') as f:
    css_content = f.read()
+
15
# Feedback options: checkbox labels offered after a vote, keyed by the vote
# value that select_vote_improved() receives ('left', 'right', 'tie', 'neither').
feedback_options = {}
feedback_options["left"] = [
    "Model A: More complete",
    "Model A: More accurate",
    "Model A: More relevant",
    "Model A: Better written",
    "Model A: Better refusal (if applicable)",
]
feedback_options["right"] = [
    "Model B: More complete",
    "Model B: More accurate",
    "Model B: More relevant",
    "Model B: Better written",
    "Model B: Better refusal (if applicable)",
]
feedback_options["tie"] = [
    "Both complete",
    "Both accurate",
    "Both well written",
    "Both handle refusal well (if applicable)",
]
feedback_options["neither"] = [
    "Both incomplete",
    "Both hallucinate",
    "Both irrelevant",
    "Both incorrectly refuse (if applicable)",
    "A is bad",
    "B is bad",
]
+
23
def load_new_question_improved(agg_results=None, show_full=False):
    """Load a random example and reset the arena for a fresh comparison.

    Args:
        agg_results: Aggregated leaderboard results to carry forward. When
            ``None`` they are fetched with ``load_leaderboard_data()``.
        show_full: Currently has no effect: the value is overwritten with
            ``False`` below, so the context is always rendered with
            ``show_full=False``.
            NOTE(review): confirm whether callers should be able to request
            the full context here.

    Returns:
        A 26-item list: ten state values followed by sixteen ``gr.update``
        payloads. The order must stay in sync with the ``outputs`` lists of
        the events that call this function.
    """
    leaderboard = load_leaderboard_data() if agg_results is None else agg_results

    example = get_random_example()
    # Two distinct models are drawn for every question.
    first_model, second_model = random.sample(model_names, 2)
    first_summary, second_summary = generate_summaries(example, first_model, second_model)

    topic = example.get('processed_context_desc', '')
    has_topic = bool(topic)
    if has_topic:
        topic = f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {topic}</div>"

    # The caller-supplied flag is discarded on purpose (see docstring).
    show_full = False
    context_markup = get_context_html(example, show_full=show_full)

    state_values = [
        example,         # current_example
        first_model,     # model_a_name
        second_model,    # model_b_name
        first_summary,   # summary_a_text
        second_summary,  # summary_b_text
        None,            # selected_winner
        [],              # feedback_list
        False,           # show_results_state
        leaderboard,     # results_agg
        show_full,       # show_full_context
    ]

    content_updates = [
        gr.update(value=example['question']),                                          # query_display
        gr.update(value=topic, visible=has_topic),                                     # context_description
        gr.update(value=context_markup),                                               # context_display
        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),  # context_toggle_btn
        gr.update(value=first_summary),                                                # summary_a_display
        gr.update(value=second_summary),                                               # summary_b_display
    ]

    # Voting is re-enabled and any "selected" highlight is dropped.
    vote_updates = [
        gr.update(interactive=True, elem_classes=["vote-button"]),                         # vote_button_a
        gr.update(interactive=True, elem_classes=["vote-button"]),                         # vote_button_b
        gr.update(interactive=True, elem_classes=["vote-button"]),                         # vote_button_tie
        gr.update(interactive=True, elem_classes=["vote-button", "vote-button-neither"]),  # vote_button_neither
    ]

    # Feedback and results are hidden until the next vote is cast.
    reset_updates = [
        gr.update(choices=[], value=[], interactive=False, visible=False),  # feedback_checkboxes
        gr.update(visible=False),                                           # feedback_section
        gr.update(interactive=False, visible=True),                         # submit_button
        gr.update(visible=False),                                           # results_reveal_area
        gr.update(interactive=True),                                        # random_question_btn
        gr.update(elem_classes=[]),                                         # main_interface_area
    ]

    return state_values + content_updates + vote_updates + reset_updates
67
+
68
def select_vote_improved(winner_choice):
    """Record which vote button was pressed and open the feedback options.

    Args:
        winner_choice: One of ``'left'``, ``'right'``, ``'tie'`` or
            ``'neither'``. Any other value highlights no button and offers
            no feedback choices.

    Returns:
        An 8-item list: the chosen value for ``selected_winner`` followed by
        ``gr.update`` payloads for the feedback checkboxes, the feedback
        section, the submit button and the four vote buttons (A, B, tie,
        neither), in that order.
    """
    choices_for_vote = feedback_options.get(winner_choice, [])

    # CSS classes of each vote button, keyed by the vote value it stands for.
    button_classes = {
        'left': ["vote-button"],
        'right': ["vote-button"],
        'tie': ["vote-button"],
        'neither': ["vote-button", "vote-button-neither"],
    }
    # Only the button matching the vote gets the "selected" class.
    if winner_choice in button_classes:
        button_classes[winner_choice].append("selected")

    feedback_update = gr.update(
        choices=choices_for_vote, value=[], interactive=True, visible=True
    )
    return [
        winner_choice,                                       # selected_winner
        feedback_update,                                     # feedback_checkboxes
        gr.update(visible=True),                             # feedback_section
        gr.update(interactive=True),                         # submit_button
        gr.update(elem_classes=button_classes['left']),      # vote_button_a
        gr.update(elem_classes=button_classes['right']),     # vote_button_b
        gr.update(elem_classes=button_classes['tie']),       # vote_button_tie
        gr.update(elem_classes=button_classes['neither']),   # vote_button_neither
    ]
96
+
97
def submit_vote_fixed(m_a, m_b, winner, feedback, current_results):
    """Record a vote, persist the leaderboard and build the results table.

    Args:
        m_a: Name of the model shown as "Model A".
        m_b: Name of the model shown as "Model B".
        winner: ``'left'``, ``'right'``, ``'tie'`` or ``'neither'``; ``None``
            means no vote was selected.
        feedback: Selected feedback labels. Accepted for the event wiring but
            not used by this function.
        current_results: Aggregated results with ``"wins"``, ``"losses"`` and
            ``"ties"`` per-model tallies and a ``"votes"`` counter. It is not
            modified; an updated deep copy is returned instead.

    Returns:
        ``{}`` when ``winner`` is ``None``. Otherwise a 16-item list: the new
        ``show_results_state`` and ``results_agg`` values followed by
        ``gr.update`` payloads, in the order of the submit button's
        ``outputs`` list.
    """
    # Local import keeps this block self-contained without touching the
    # file's import section.
    import copy

    if winner is None:
        print("Warning: Submit called without a winner selected.")
        return {}

    # Deep copy: a shallow .copy() would share the nested tally dicts with
    # current_results, so the caller's state would be mutated in place.
    updated_results = copy.deepcopy(current_results)

    # Make sure every bucket exists and both models have an entry in each.
    # Each bucket is initialised independently so an existing count in one
    # bucket is never reset because the model is missing from another.
    wins = updated_results.setdefault("wins", {})
    losses = updated_results.setdefault("losses", {})
    ties = updated_results.setdefault("ties", {})
    for tally in (wins, losses, ties):
        for model in (m_a, m_b):
            tally.setdefault(model, 0)

    if winner == 'left':
        wins[m_a] += 1
        losses[m_b] += 1
    elif winner == 'right':
        wins[m_b] += 1
        losses[m_a] += 1
    elif winner == 'tie':
        ties[m_a] += 1
        ties[m_b] += 1
    # 'neither' changes no per-model tally; it only counts as a vote below.

    updated_results["votes"] = updated_results.get("votes", 0) + 1
    save_leaderboard_data(updated_results)

    # Prepare Results Table
    scored_rows = []
    for model in sorted(set(wins) | set(losses) | set(ties)):
        model_wins = wins.get(model, 0)
        model_losses = losses.get(model, 0)
        model_ties = ties.get(model, 0)
        total_comparisons = model_wins + model_losses + model_ties
        # A tie counts as half a win.
        win_rate = (model_wins + 0.5 * model_ties) / total_comparisons if total_comparisons > 0 else 0.0
        scored_rows.append((win_rate, {
            "Model": model,
            "Win Rate (%)": f"{win_rate:.1%}",
            "Wins": model_wins,
            "Losses": model_losses,
            "Ties": model_ties,
            "Comparisons": total_comparisons,
        }))

    # Sort on the exact win rate instead of re-parsing the formatted string.
    # list.sort is stable, so equal rates keep their alphabetical order.
    scored_rows.sort(key=lambda scored: scored[0], reverse=True)
    results_df = pd.DataFrame([row for _, row in scored_rows])

    return [
        True,                                          # show_results_state
        updated_results,                               # results_agg
        gr.update(interactive=False),                  # vote_button_a
        gr.update(interactive=False),                  # vote_button_b
        gr.update(interactive=False),                  # vote_button_tie
        gr.update(interactive=False),                  # vote_button_neither
        gr.update(interactive=False),                  # feedback_checkboxes
        gr.update(visible=True),                       # feedback_section
        gr.update(visible=False),                      # submit_button
        gr.update(visible=True),                       # results_reveal_area
        gr.update(interactive=False),                  # random_question_btn
        gr.update(value=results_df, visible=True),     # results_table_display
        gr.update(elem_classes=["results-revealed"]),  # main_interface_area
        gr.update(interactive=True),                   # context_toggle_btn
        gr.update(value=m_a),                          # model_a_reveal
        gr.update(value=m_b),                          # model_b_reveal
    ]
166
+
167
+ # Create embedded CSS
168
+ css_html = f"<style>{css_content}</style>"
169
+
170
+ # Create Gradio interface
171
+ with gr.Blocks(theme=gr.themes.Default(
172
+ primary_hue=gr.themes.colors.orange,
173
+ secondary_hue=gr.themes.colors.slate
174
+ )) as demo:
175
+ # Embed CSS directly in HTML
176
+ gr.HTML(css_html)
177
+
178
+ # State Variables
179
+ current_example = gr.State({})
180
+ model_a_name = gr.State("")
181
+ model_b_name = gr.State("")
182
+ summary_a_text = gr.State("")
183
+ summary_b_text = gr.State("")
184
+ selected_winner = gr.State(None)
185
+ feedback_list = gr.State([])
186
+ show_results_state = gr.State(False)
187
+ results_agg = gr.State({"wins": {}, "losses": {}, "ties": {}, "votes": 0})
188
+ show_full_context = gr.State(False)
189
+
190
+ # Create Tabs
191
+ with gr.Tabs() as tabs:
192
+ # Main Arena Tab
193
+ with gr.TabItem("Arena", id="arena-tab"):
194
+ # Main title and description
195
+ gr.Markdown("# RAG Summarizer Arena")
196
+ gr.Markdown("Compare summaries generated by different models based on the provided context and query. Select the better summary, or choose 'Tie' or 'Neither'. Your feedback helps evaluate model performance.")
197
+
198
+ # Main container
199
+ with gr.Column(elem_id="main-interface-area") as main_interface_area:
200
+ # Query section
201
+ with gr.Row(elem_id="query-title-row"):
202
+ gr.Markdown("### Query", elem_classes="section-heading")
203
+
204
+ with gr.Row(elem_id="query-container"):
205
+ with gr.Row(elem_classes="query-box-row"):
206
+ query_display = gr.Markdown(value="Loading question...", elem_classes="query-text")
207
+ random_question_btn = gr.Button("🔄 Get Random Question", elem_classes="query-button")
208
+
209
+ # Context description
210
+ context_description = gr.Markdown("", elem_classes="context-description")
211
+
212
+ # Context section
213
+ with gr.Row(elem_id="context-header-row"):
214
+ gr.Markdown("### Context Provided", elem_classes="context-title")
215
+ context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
216
+
217
+ context_display = gr.HTML(value="Loading context...", label="Context Chunks")
218
+
219
+ gr.Markdown("---")
220
+ gr.Markdown("### Compare Summaries", elem_classes="section-heading")
221
+
222
+ # Model summaries
223
+ with gr.Row():
224
+ with gr.Column(scale=1):
225
+ with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
226
+ summary_a_display = gr.Textbox(label="Model A", lines=10, interactive=False, show_copy_button=True)
227
+ with gr.Column(scale=1):
228
+ with gr.Group(elem_classes=["summary-card", "summary-card-b"]):
229
+ summary_b_display = gr.Textbox(label="Model B", lines=10, interactive=False, show_copy_button=True)
230
+
231
+ # Voting section
232
+ gr.Markdown("### Cast Your Vote", elem_classes="section-heading")
233
+ with gr.Row():
234
+ vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"])
235
+ vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"])
236
+ vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"])
237
+ vote_button_neither = gr.Button("❌ Neither is Adequate", elem_classes=["vote-button", "vote-button-neither"])
238
+
239
+ # Feedback section
240
+ with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
241
+ feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)
242
+
243
+ # Submit button
244
+ submit_button = gr.Button("Submit Vote", variant="primary", interactive=False, elem_id="submit-button")
245
+
246
+ # Results area
247
+ with gr.Column(visible=False) as results_reveal_area:
248
+ gr.Markdown("---")
249
+ gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading")
250
+
251
+ # Model reveal section
252
+ with gr.Row():
253
+ with gr.Column(scale=1):
254
+ gr.Markdown("### Model A was actually:", elem_classes="section-heading")
255
+ model_a_reveal = gr.Markdown("", elem_classes="model-reveal model-a-reveal")
256
+ with gr.Column(scale=1):
257
+ gr.Markdown("### Model B was actually:", elem_classes="section-heading")
258
+ model_b_reveal = gr.Markdown("", elem_classes="model-reveal model-b-reveal")
259
+
260
+ gr.HTML("<div style='height: 10px;'></div>")
261
+
262
+ # Try another button
263
+ with gr.Row(elem_classes=["control-buttons"]):
264
+ try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn")
265
+
266
+ # Leaderboard Tab
267
+ with gr.TabItem("Leaderboard", id="leaderboard-tab"):
268
+ gr.Markdown("# Model Performance Leaderboard")
269
+ gr.Markdown("View aggregate performance statistics for all models. The table below shows win rates, wins, losses, and ties for each model based on all evaluations.")
270
+ results_table_display = gr.DataFrame(label="Model Performance", interactive=False, wrap=True)
271
+
272
+ # Event Listeners
273
+ context_toggle_btn.click(
274
+ fn=toggle_context_display,
275
+ inputs=[current_example, show_full_context],
276
+ outputs=[show_full_context, context_display, context_toggle_btn]
277
  )
278
 
279
+ demo.load(
280
+ fn=load_new_question_improved,
281
+ inputs=[],
282
+ outputs=[
283
+ current_example, model_a_name, model_b_name, summary_a_text, summary_b_text,
284
+ selected_winner, feedback_list, show_results_state, results_agg, show_full_context,
285
+ query_display, context_description, context_display, context_toggle_btn,
286
+ summary_a_display, summary_b_display,
287
+ vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
288
+ feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn,
289
+ main_interface_area
290
+ ]
291
+ )
292
+
293
+ random_question_btn.click(
294
+ fn=load_new_question_improved,
295
+ inputs=[],
296
+ outputs=[
297
+ current_example, model_a_name, model_b_name, summary_a_text, summary_b_text,
298
+ selected_winner, feedback_list, show_results_state, results_agg, show_full_context,
299
+ query_display, context_description, context_display, context_toggle_btn,
300
+ summary_a_display, summary_b_display,
301
+ vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
302
+ feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn,
303
+ main_interface_area
304
+ ]
305
+ )
306
+
307
+ vote_button_a.click(
308
+ fn=lambda: select_vote_improved('left'),
309
+ inputs=None,
310
+ outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
311
+ )
312
+ vote_button_b.click(
313
+ fn=lambda: select_vote_improved('right'),
314
+ inputs=None,
315
+ outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
316
+ )
317
+ vote_button_tie.click(
318
+ fn=lambda: select_vote_improved('tie'),
319
+ inputs=None,
320
+ outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
321
+ )
322
+ vote_button_neither.click(
323
+ fn=lambda: select_vote_improved('neither'),
324
+ inputs=None,
325
+ outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
326
+ )
327
+
328
+ feedback_checkboxes.change(
329
+ fn=update_feedback,
330
+ inputs=[feedback_checkboxes],
331
+ outputs=[feedback_list]
332
+ )
333
+
334
+ submit_button.click(
335
+ fn=submit_vote_fixed,
336
+ inputs=[model_a_name, model_b_name, selected_winner, feedback_list, results_agg],
337
+ outputs=[
338
+ show_results_state, results_agg,
339
+ vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
340
+ feedback_checkboxes,
341
+ feedback_section,
342
+ submit_button,
343
+ results_reveal_area,
344
+ random_question_btn,
345
+ results_table_display,
346
+ main_interface_area,
347
+ context_toggle_btn,
348
+ model_a_reveal,
349
+ model_b_reveal
350
+ ]
351
+ )
352
+
353
+ try_another_btn.click(
354
+ fn=load_new_question_improved,
355
+ inputs=[],
356
+ outputs=[
357
+ current_example, model_a_name, model_b_name, summary_a_text, summary_b_text,
358
+ selected_winner, feedback_list, show_results_state, results_agg, show_full_context,
359
+ query_display, context_description, context_display, context_toggle_btn,
360
+ summary_a_display, summary_b_display,
361
+ vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
362
+ feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn,
363
+ main_interface_area
364
+ ]
365
  )
366
 
367
+ if __name__ == "__main__":
368
+ demo.launch(debug=True)
static/.DS_Store ADDED
Binary file (6.15 kB). View file
 
static/styles.css ADDED
@@ -0,0 +1,640 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Base styles */
2
+ body, .gradio-container {
3
+ background-color: #ffffff;
4
+ font-size: 15px;
5
+ overflow-x: hidden !important;
6
+ }
7
+
8
+ /* Main color variables for a simpler, more subdued theme */
9
+ :root {
10
+ --primary: #FF7D1E; /* Main orange accent - used sparingly */
11
+ --primary-light: #FFF8F2; /* Very subtle orange tint */
12
+ --primary-selected: #FFE8D5; /* More visible but still subtle orange for selections */
13
+ --accent: #6B7280; /* Neutral gray for most UI elements */
14
+ --text-dark: #333333; /* Dark text */
15
+ --text-medium: #666666; /* Medium text */
16
+ --border-light: #E6E6E6; /* Light border */
17
+ --background-light: #F9F9F9; /* Light background */
18
+ --highlight: #FFFBEB; /* Subtle highlight color */
19
+ --model-a-color: #92B4F4; /* Model A color (blue) */
20
+ --model-b-color: #F8ADA7; /* Model B color (red) */
21
+ }
22
+
23
+ /* Tab styling */
24
+ .tabs {
25
+ margin-top: 0 !important;
26
+ }
27
+
28
+ /* Style for tab buttons */
29
+ .tab-nav {
30
+ background-color: var(--background-light) !important;
31
+ padding: 5px 10px !important;
32
+ border-radius: 8px 8px 0 0 !important;
33
+ border-bottom: 1px solid var(--border-light) !important;
34
+ }
35
+ .tab-nav button {
36
+ font-size: 1.1em !important;
37
+ font-weight: 600 !important;
38
+ padding: 10px 25px !important;
39
+ margin: 0 5px !important;
40
+ border-radius: 6px 6px 0 0 !important;
41
+ border: none !important;
42
+ background-color: transparent !important;
43
+ color: var(--text-medium) !important;
44
+ transition: all 0.3s ease !important;
45
+ }
46
+ .tab-nav button.selected {
47
+ background-color: white !important;
48
+ color: var(--primary) !important;
49
+ border-bottom: 2px solid var(--primary) !important;
50
+ }
51
+ .tab-nav button:hover:not(.selected) {
52
+ background-color: rgba(255,255,255,0.5) !important;
53
+ color: var(--text-dark) !important;
54
+ }
55
+
56
+ /* Tab content area */
57
+ .tabitem {
58
+ border: none !important;
59
+ padding: 20px 10px !important;
60
+ }
61
+
62
+ /* Style the row containing the Query title */
63
+ #query-title-row {
64
+ margin: 0 !important;
65
+ padding: 0 10px !important;
66
+ display: flex !important;
67
+ align-items: center !important;
68
+ overflow: hidden !important;
69
+ height: 40px !important;
70
+ }
71
+
72
+ #query-title-row h3 {
73
+ margin: 0 !important;
74
+ padding: 0 !important;
75
+ font-size: 1.2em !important;
76
+ font-weight: 600 !important;
77
+ line-height: 1.2 !important;
78
+ flex-grow: 0 !important;
79
+ flex-shrink: 0 !important;
80
+ white-space: nowrap !important;
81
+ overflow: visible !important;
82
+ color: var(--text-dark) !important;
83
+ }
84
+
85
+ /* New query container layout with button next to box */
86
+ #query-container {
87
+ display: flex !important;
88
+ align-items: stretch !important;
89
+ gap: 10px !important;
90
+ margin: 0 10px 8px 10px !important;
91
+ overflow: visible !important;
92
+ }
93
+
94
+ /* Style the query box - optimized for long queries */
95
+ .query-box-row {
96
+ background-color: #F0F7FF !important; /* Light blue background */
97
+ padding: 12px 15px !important;
98
+ border-radius: 6px !important;
99
+ border: 1px solid #D1E3F8 !important; /* Light blue border */
100
+ margin: 0 !important;
101
+ align-items: flex-start !important;
102
+ flex: 1 1 50% !important;
103
+ max-width: 50% !important;
104
+ overflow: visible !important;
105
+ display: flex !important;
106
+ min-height: 50px !important;
107
+ height: auto !important;
108
+ }
109
+
110
+ /* Context description styling - simple version */
111
+ .context-description {
112
+ background-color: transparent !important;
113
+ padding: 0 15px !important;
114
+ margin: 0 0 15px 0 !important;
115
+ font-style: normal !important;
116
+ color: var(--text-medium) !important; /* Lighter text color */
117
+ font-size: 1.05em !important; /* Slightly larger */
118
+ }
119
+
120
+ .context-topic {
121
+ display: inline-flex !important;
122
+ align-items: center !important;
123
+ background-color: transparent !important; /* No background */
124
+ padding: 0 !important;
125
+ border-radius: 0 !important;
126
+ box-shadow: none !important;
127
+ }
128
+
129
+ .topic-label {
130
+ font-weight: 600 !important;
131
+ color: var(--text-medium) !important; /* Lighter text color */
132
+ margin-right: 6px !important;
133
+ }
134
+
135
+ /* Style the Get Random Question button */
136
+ .query-button {
137
+ padding: 0 20px !important;
138
+ border-radius: 6px !important;
139
+ font-weight: 500 !important;
140
+ flex: 0 0 auto !important;
141
+ display: flex !important;
142
+ align-items: center !important;
143
+ justify-content: center !important;
144
+ background-color: var(--background-light) !important;
145
+ color: var(--text-medium) !important;
146
+ border: 1px solid var(--border-light) !important;
147
+ font-size: 0.95em !important;
148
+ min-height: 50px !important;
149
+ white-space: nowrap !important;
150
+ transition: all 0.2s ease !important;
151
+ box-shadow: 0 1px 2px rgba(0,0,0,0.05) !important;
152
+ }
153
+
154
+ .query-button:hover {
155
+ background-color: var(--primary-light) !important;
156
+ color: var(--primary) !important;
157
+ border-color: var(--primary) !important;
158
+ }
159
+
160
+ /* Context header row with title and toggle button */
161
+ #context-header-row {
162
+ display: flex !important;
163
+ justify-content: space-between !important;
164
+ align-items: center !important;
165
+ margin-bottom: 8px !important;
166
+ padding: 0 10px !important;
167
+ }
168
+
169
+ /* Context title styling */
170
+ .context-title {
171
+ margin: 0 !important;
172
+ padding: 0 !important;
173
+ font-size: 1.2em !important;
174
+ font-weight: 600 !important;
175
+ color: var(--text-dark) !important;
176
+ }
177
+
178
+ /* Style for the context toggle button */
179
+ .context-toggle-button {
180
+ background-color: var(--background-light) !important;
181
+ color: var(--text-medium) !important;
182
+ padding: 5px 10px !important;
183
+ border-radius: 4px !important;
184
+ border: 1px solid var(--border-light) !important;
185
+ font-size: 0.85em !important;
186
+ font-weight: 500 !important;
187
+ cursor: pointer !important;
188
+ transition: all 0.2s ease !important;
189
+ margin: 0 !important;
190
+ height: 30px !important;
191
+ line-height: 1 !important;
192
+ width: auto !important;
193
+ min-width: 0 !important;
194
+ max-width: 150px !important;
195
+ }
196
+
197
+ .context-toggle-button:hover {
198
+ background-color: var(--primary-light) !important;
199
+ color: var(--primary) !important;
200
+ border-color: var(--primary) !important;
201
+ }
202
+
203
+ /* Style the Markdown component displaying the query text */
204
+ .query-text {
205
+ padding: 0 !important;
206
+ margin: 0 !important;
207
+ background-color: transparent !important;
208
+ border: none !important;
209
+ overflow: visible !important;
210
+ width: 100% !important;
211
+ }
212
+
213
+ /* Style the actual query text */
214
+ .query-text p {
215
+ font-size: 1.2em !important;
216
+ font-weight: 600 !important;
217
+ color: #2E5AAC !important; /* Blue for query text */
218
+ line-height: 1.4 !important;
219
+ margin: 0 !important;
220
+ padding: 0 !important;
221
+ background-color: transparent !important;
222
+ border: none !important;
223
+ overflow-wrap: break-word !important;
224
+ word-wrap: break-word !important;
225
+ word-break: normal !important;
226
+ hyphens: auto !important;
227
+ white-space: normal !important;
228
+ }
229
+
230
+ /* Container for context items */
231
+ .context-items-container {
232
+ border-radius: 6px;
233
+ overflow: hidden;
234
+ }
235
+
236
+ /* Style for individual context items */
237
+ .context-item {
238
+ border: 1px solid var(--border-light);
239
+ background-color: var(--background-light);
240
+ padding: 12px;
241
+ border-radius: 6px;
242
+ margin-bottom: 8px;
243
+ font-size: 1em;
244
+ line-height: 1.5;
245
+ box-shadow: 0 1px 2px rgba(0,0,0,0.03);
246
+ }
247
+
248
+ /* Style for primary context items */
249
+ .primary-context {
250
+ border-left: 3px solid #FFF0F0 !important; /* Light red border */
251
+ }
252
+
253
+ /* Style for chunk headers */
254
+ .chunk-header {
255
+ font-weight: 600;
256
+ color: #2E5AAC;
257
+ margin-bottom: 8px;
258
+ padding-bottom: 5px;
259
+ border-bottom: 1px solid #D1E3F8;
260
+ }
261
+
262
+ /* Style for highlighted text within context items */
263
+ .highlight {
264
+ background-color: #FFECB3 !important;
265
+ padding: 0.1em 0.3em !important;
266
+ border-radius: 3px !important;
267
+ font-weight: 600 !important;
268
+ color: #664500 !important;
269
+ }
270
+
271
+ /* Markdown table styling */
272
+ .md-table {
273
+ width: 100% !important;
274
+ border-collapse: collapse !important;
275
+ margin: 10px 0 !important;
276
+ font-size: 0.95em !important;
277
+ }
278
+
279
+ .md-table th {
280
+ background-color: #F0F7FF !important;
281
+ color: #2E5AAC !important;
282
+ font-weight: 600 !important;
283
+ text-align: left !important;
284
+ padding: 10px !important;
285
+ border: 1px solid #D1E3F8 !important;
286
+ }
287
+
288
+ .md-table td {
289
+ padding: 8px 10px !important;
290
+ border: 1px solid #E6E6E6 !important;
291
+ vertical-align: top !important;
292
+ }
293
+
294
+ .md-table tr:nth-child(even) {
295
+ background-color: #F9F9F9 !important;
296
+ }
297
+
298
+ .md-table tr:hover {
299
+ background-color: #F0F7FF !important;
300
+ }
301
+
302
+ /* Style for the insufficient context alert */
303
+ .insufficient-alert {
304
+ border: 2px solid #f78989;
305
+ background-color: #fff0f0;
306
+ color: #b92020;
307
+ padding: 12px;
308
+ border-radius: 6px;
309
+ margin-bottom: 12px;
310
+ font-size: 1em;
311
+ }
312
+ .insufficient-alert strong {
313
+ display: block;
314
+ margin-bottom: 5px;
315
+ font-size: 1.05em;
316
+ }
317
+ .insufficient-alert p {
318
+ margin: 0;
319
+ font-size: 1em;
320
+ }
321
+
322
+ /* Style for section headings */
323
+ .section-heading {
324
+ color: var(--text-dark) !important;
325
+ margin: 5px 0 2px 0 !important;
326
+ padding: 0 !important;
327
+ font-weight: 600 !important;
328
+ font-size: 1.2em !important;
329
+ }
330
+
331
+ /* Style the group displaying model summaries */
332
+ .summary-card {
333
+ border: 1px solid var(--border-light);
334
+ padding: 12px !important;
335
+ border-radius: 6px;
336
+ height: 100%;
337
+ box-shadow: 0 1px 3px rgba(0,0,0,0.03);
338
+ background-color: var(--background-light) !important;
339
+ }
340
+
341
+ /* Apply specific background colors to summary cards */
342
+ .summary-card-a {
343
+ border-left: 3px solid #92B4F4 !important; /* Lighter blue accent */
344
+ }
345
+ .summary-card-b {
346
+ border-left: 3px solid #F8ADA7 !important; /* Light red accent */
347
+ }
348
+
349
+ /* Style the Textbox itself inside the summary card */
350
+ .summary-card textarea {
351
+ font-size: 1em !important;
352
+ line-height: 1.4 !important;
353
+ background-color: rgba(255,255,255,0.7) !important;
354
+ }
355
+ /* Style the Textbox label */
356
+ .summary-card .gr-input-label {
357
+ display: block !important;
358
+ padding: 0 0 5px 0 !important;
359
+ margin: 0 !important;
360
+ font-size: 1.05em !important;
361
+ font-weight: 600 !important;
362
+ color: var(--text-dark) !important;
363
+ }
364
+
365
+ /* Style the voting buttons */
366
+ .vote-button {
367
+ flex-grow: 1;
368
+ margin: 0 5px !important;
369
+ font-size: 1.05em !important;
370
+ padding: 12px 15px !important;
371
+ border-radius: 6px !important;
372
+ transition: all 0.2s ease !important;
373
+ background-color: var(--background-light) !important;
374
+ border: 1px solid var(--border-light) !important;
375
+ min-height: 50px !important;
376
+ font-weight: 500 !important;
377
+ color: var(--text-dark) !important;
378
+ margin-bottom: 5px !important;
379
+ }
380
+
381
+ /* Hover effect for A/B/Tie buttons */
382
+ .vote-button:hover:not(.vote-button-neither) {
383
+ background-color: var(--primary-light) !important;
384
+ border-color: var(--primary) !important;
385
+ color: var(--primary) !important;
386
+ }
387
+
388
+ /* Hover effect for Neither button */
389
+ .vote-button-neither:hover {
390
+ background-color: #fff0f0 !important;
391
+ border-color: #f78989 !important;
392
+ color: #b92020 !important;
393
+ }
394
+
395
+ /* Style for selected buttons with persistent selection state */
396
+ .vote-button.selected:not(.vote-button-neither) {
397
+ border-width: 2px !important;
398
+ border-style: solid !important;
399
+ border-color: #FF7D1E !important;
400
+ background-color: #FFF2E6 !important;
401
+ color: #FF7D1E !important;
402
+ font-weight: 600 !important;
403
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1) !important;
404
+ }
405
+
406
+ /* Special neither button styling when selected */
407
+ .vote-button-neither.selected {
408
+ border-width: 2px !important;
409
+ border-style: solid !important;
410
+ border-color: #f78989 !important;
411
+ background-color: #fff0f0 !important;
412
+ color: #b92020 !important;
413
+ font-weight: 600 !important;
414
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1) !important;
415
+ }
416
+
417
+ /* Ensure selection state persists when hovered */
418
+ .vote-button.selected:hover:not(.vote-button-neither) {
419
+ border-color: #FF7D1E !important;
420
+ background-color: #FFF2E6 !important;
421
+ color: #FF7D1E !important;
422
+ }
423
+
424
+ /* Ensure neither selection state persists when hovered */
425
+ .vote-button-neither.selected:hover {
426
+ border-color: #f78989 !important;
427
+ background-color: #fff0f0 !important;
428
+ color: #b92020 !important;
429
+ }
430
+
431
+ /* Style the feedback section */
432
+ .feedback-section {
433
+ padding: 3px 0 !important;
434
+ background-color: transparent !important;
435
+ margin-top: 3px !important;
436
+ margin-bottom: 3px !important;
437
+ font-size: 1em;
438
+ border: none !important;
439
+ box-shadow: none !important;
440
+ }
441
+
442
+ /* Improved feedback checkbox styling */
443
+ .feedback-section .gr-check-radio {
444
+ font-size: 1.05em !important;
445
+ }
446
+
447
+ .feedback-section .gr-check-radio span {
448
+ font-size: 1.05em !important;
449
+ color: var(--text-dark) !important;
450
+ }
451
+
452
+ /* Checkbox larger size and color customization */
453
+ .feedback-section input[type="checkbox"] {
454
+ width: 18px !important;
455
+ height: 18px !important;
456
+ margin-right: 6px !important;
457
+ }
458
+
459
+ /* Make the checkbox checked color stronger */
460
+ .feedback-section input[type="checkbox"]:checked {
461
+ accent-color: #FF8C38 !important;
462
+ border-color: #FF8C38 !important;
463
+ background-color: #FF8C38 !important;
464
+ }
465
+
466
+ /* Style for model reveals */
467
+ .model-reveal {
468
+ font-size: 1.3em !important;
469
+ padding: 8px 0 !important;
470
+ text-align: center !important;
471
+ margin-top: 5px !important;
472
+ font-weight: 600 !important;
473
+ border-radius: 6px !important;
474
+ }
475
+
476
+ /* Style for model A reveal */
477
+ .model-a-reveal {
478
+ background-color: #F0F7FF !important;
479
+ }
480
+
481
+ /* Style for model B reveal */
482
+ .model-b-reveal {
483
+ background-color: #FFF0F0 !important;
484
+ }
485
+
486
+ /* Style the control buttons area */
487
+ .control-buttons button {
488
+ margin: 0 10px !important;
489
+ font-size: 1em !important;
490
+ border-radius: 6px !important;
491
+ padding: 8px 16px !important;
492
+ transition: all 0.2s ease !important;
493
+ }
494
+
495
+ /* Make headings slightly larger */
496
+ h3 {
497
+ font-size: 1.2em !important;
498
+ font-weight: 600 !important;
499
+ margin: 5px 0 2px 0 !important;
500
+ padding: 0 !important;
501
+ color: var(--text-dark) !important;
502
+ }
503
+ /* Adjust main title size */
504
+ h1 {
505
+ font-size: 1.6em !important;
506
+ color: var(--primary) !important;
507
+ margin: 10px 0 5px 0 !important;
508
+ padding: 0 !important;
509
+ }
510
+ /* Adjust main description size */
511
+ #main-interface-area > p:first-of-type {
512
+ font-size: 1em !important;
513
+ margin: 0 0 8px 0 !important;
514
+ padding: 0 !important;
515
+ line-height: 1.4 !important;
516
+ color: var(--text-medium) !important;
517
+ }
518
+
519
+ /* Adjust CheckboxGroup label/choices size */
520
+ .feedback-section .gr-input-label {
521
+ font-size: 1.1em !important;
522
+ font-weight: 600 !important;
523
+ margin-bottom: 0.6em !important;
524
+ color: var(--text-dark) !important;
525
+ }
526
+
527
+ /* Adjust DataFrame font size */
528
+ .gr-dataframe table {
529
+ font-size: 0.95em !important;
530
+ border-collapse: separate !important;
531
+ border-spacing: 0 !important;
532
+ border-radius: 6px !important;
533
+ overflow: hidden !important;
534
+ }
535
+ .gr-dataframe th, .gr-dataframe td {
536
+ padding: 8px 10px !important;
537
+ border: none !important;
538
+ border-bottom: 1px solid var(--border-light) !important;
539
+ }
540
+ .gr-dataframe th {
541
+ background-color: var(--background-light) !important;
542
+ color: var(--text-dark) !important;
543
+ font-weight: 600 !important;
544
+ }
545
+
546
+ /* Reduce space caused by Markdown wrappers */
547
+ .gradio-container .prose {
548
+ line-height: 1.4 !important;
549
+ margin: 0 !important;
550
+ padding: 0 !important;
551
+ }
552
+ hr {
553
+ margin: 5px 0 !important;
554
+ border: none !important;
555
+ height: 1px !important;
556
+ background-color: var(--border-light) !important;
557
+ }
558
+
559
+ /* Fix for any scrollbar issues */
560
+ .gradio-row {
561
+ overflow: visible !important;
562
+ }
563
+
564
+ /* Submit button styling */
565
+ #submit-button {
566
+ background-color: var(--primary) !important;
567
+ color: white !important;
568
+ padding: 12px 30px !important;
569
+ border-radius: 6px !important;
570
+ font-weight: 600 !important;
571
+ font-size: 1.2em !important;
572
+ transition: all 0.2s ease !important;
573
+ box-shadow: 0 1px 2px rgba(0,0,0,0.08) !important;
574
+ border: none !important;
575
+ margin-top: 15px !important;
576
+ }
577
+
578
+ #submit-button:hover {
579
+ background-color: #E56E0F !important;
580
+ box-shadow: 0 2px 4px rgba(0,0,0,0.12) !important;
581
+ }
582
+
583
+ /* Try another button styling */
584
+ #try-another-btn {
585
+ background-color: var(--primary) !important;
586
+ color: white !important;
587
+ padding: 10px 25px !important;
588
+ border-radius: 6px !important;
589
+ font-weight: 600 !important;
590
+ transition: all 0.2s ease !important;
591
+ box-shadow: 0 1px 2px rgba(0,0,0,0.08) !important;
592
+ border: none !important;
593
+ }
594
+
595
+ #try-another-btn:hover {
596
+ background-color: #E56E0F !important;
597
+ box-shadow: 0 2px 4px rgba(0,0,0,0.12) !important;
598
+ }
599
+
600
+ /* Reduce vertical spacing */
601
+ .gradio-column > *, .gradio-row > * {
602
+ margin-top: 0 !important;
603
+ margin-bottom: 0 !important;
604
+ padding-top: 0 !important;
605
+ padding-bottom: 0 !important;
606
+ }
607
+
608
+ .gradio-markdown {
609
+ margin-top: 0 !important;
610
+ margin-bottom: 0 !important;
611
+ padding-top: 0 !important;
612
+ padding-bottom: 0 !important;
613
+ }
614
+
615
+ /* Reduce container padding */
616
+ .gradio-container {
617
+ padding: 0 !important;
618
+ }
619
+
620
+ /* Custom compact spacing for specific sections */
621
+ #main-interface-area > div {
622
+ margin-bottom: 4px !important;
623
+ }
624
+
625
+ /* Media query for responsive behavior on smaller screens */
626
+ @media screen and (max-width: 768px) {
627
+ #query-container {
628
+ flex-direction: column !important;
629
+ }
630
+
631
+ .query-box-row {
632
+ flex: 1 1 100% !important;
633
+ max-width: 100% !important;
634
+ margin-bottom: 10px !important;
635
+ }
636
+
637
+ .query-button {
638
+ width: 100% !important;
639
+ }
640
+ }
utils/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Makes utils a proper Python package
2
+ # This file can be empty
utils/arena_df.csv ADDED
The diff for this file is too large to render. See raw diff
 
utils/context_processor.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+
4
def debug_text(text, label="Text"):
    """Print a short diagnostic report about ``text`` to stdout.

    Parameters:
    - text: The string being inspected
    - label: Name shown in the report header

    The report lists the length, the first 100 characters, and whether each
    of the two highlight-start marker spellings occurs in the text.
    """
    print(f"\n--- DEBUG {label} ---")
    size = len(text)
    print(f"Length: {size}")
    preview = text[:100]
    print(f"First 100 chars: {preview}")
    # Both marker spellings are reported, one line each.
    for marker_name in ("highlight_start", "start_highlight"):
        present = f"[[{marker_name}]]" in text
        print(f"Contains {marker_name}: {present}")
    print("-------------------------\n")
12
+
13
def clean_json_text(text):
    """
    Remove leftover JSON-style backslash escaping from quotes in text.

    Text that went through JSON/CSV serialisation can reach the UI with its
    quotes still escaped, e.g. ``the sky isn\\'t falling``. Every run of
    backslashes directly in front of a single or double quote is dropped so
    the quote is shown as-is. Backslashes anywhere else are left untouched.

    The previous implementation round-tripped the text through
    ``json.dumps`` / ``json.loads``, which always returns the input unchanged.

    Parameters:
    - text: The text to clean

    Returns:
    - The cleaned string; non-string input is returned unchanged
    """
    # Non-string values (None, NaN from pandas, ...) pass through untouched,
    # as they did before.
    if not isinstance(text, str) or '\\' not in text:
        return text

    return re.sub(r"""\\+(['"])""", r"\1", text)
32
+
33
def process_highlights(text):
    """
    Convert highlight markers in text into HTML highlight spans.

    Two marker spellings are supported:
    ``[[highlight_start]]...[[highlight_end]]`` and
    ``[[start_highlight]]...[[end_highlight]]``. Each matched pair becomes
    ``<span class="highlight">...</span>``. The text is first passed through
    ``clean_json_text``.

    Patterns are compiled with ``re.DOTALL`` so a highlight that spans a line
    break is converted too; without it the raw markers stayed in the output.

    Parameters:
    - text: The text that may contain highlight markers

    Returns:
    - The text with every complete marker pair replaced by a span
    """
    text = clean_json_text(text)

    replacement = r'<span class="highlight">\1</span>'
    marker_patterns = (
        r'\[\[highlight_start\]\](.*?)\[\[highlight_end\]\]',
        r'\[\[start_highlight\]\](.*?)\[\[end_highlight\]\]',
    )

    highlighted_text = text
    for pattern in marker_patterns:
        highlighted_text = re.sub(pattern, replacement, highlighted_text, flags=re.DOTALL)

    return highlighted_text
57
+
58
def process_table_with_highlights(markdown_table):
    """
    Render a markdown table that may contain highlight markers as HTML.

    Highlights are applied to each line separately, so a replacement never
    crosses a row boundary. The highlighted lines are then joined back
    together and handed to ``convert_markdown_table_to_html``.
    """
    highlighted_rows = [
        process_highlights(row)
        for row in markdown_table.strip().split('\n')
    ]
    return convert_markdown_table_to_html('\n'.join(highlighted_rows))
73
+
74
def _split_table_row(line):
    """Return the stripped cell texts of one pipe-delimited table row."""
    cells = line.split('|')[1:]
    # A trailing pipe yields one extra fragment after it; drop that fragment.
    if line.strip().endswith('|'):
        cells = cells[:-1]
    return [cell.strip() for cell in cells]


def _render_table_row(line, cell_tag):
    """Render one table row as ``<tr>`` using ``cell_tag`` ('th' or 'td') per cell."""
    cells_html = ''.join(
        f'<{cell_tag}>{process_highlights(cell)}</{cell_tag}>'
        for cell in _split_table_row(line)
    )
    return f'<tr>{cells_html}</tr>'


def convert_markdown_table_to_html(markdown_text):
    """
    Converts a markdown table to an HTML table.

    Only lines whose first non-blank character is ``|`` are treated as table
    rows; any other line in the input is not included in the generated table.
    If the second table line contains ``---`` the first line is rendered as a
    header and the second (the separator) is skipped; otherwise every line is
    rendered as a data row. Highlight markers inside cells are converted.

    Parameters:
    - markdown_text: Text containing a pipe-delimited markdown table

    Returns:
    - An HTML ``<table class="md-table">`` string, or the cleaned input text
      when fewer than two table lines are found
    """
    # Clean JSON escaping
    markdown_text = clean_json_text(markdown_text)

    table_lines = [
        line for line in markdown_text.strip().split('\n')
        if line.strip().startswith('|')
    ]

    if len(table_lines) < 2:  # Need at least header and separator
        return markdown_text  # Return original if not a proper table

    parts = ['<table class="md-table">']

    if '---' in table_lines[1]:
        # Header row followed by a separator row, which is skipped.
        parts.append('<thead>' + _render_table_row(table_lines[0], 'th') + '</thead>')
        body_lines = table_lines[2:]
    else:
        # No header row, treat all rows as data
        body_lines = table_lines

    parts.append('<tbody>')
    parts.extend(_render_table_row(line, 'td') for line in body_lines)
    parts.append('</tbody>')
    parts.append('</table>')
    return ''.join(parts)
132
+
133
def _is_missing_value(value):
    """Return True for None and for float NaN (NaN is the only float unequal to itself)."""
    return value is None or (isinstance(value, float) and value != value)


def _render_context_item(context_item, extra_class=""):
    """Render one context chunk as a ``context-item`` div.

    Accepts either a dict with a 'content' key or a plain string, so callers
    that hand over bare strings no longer raise AttributeError.
    """
    if isinstance(context_item, dict):
        context_text = context_item.get('content', '')
    else:
        context_text = context_item

    if _is_missing_value(context_text):
        context_text = ''
    elif not isinstance(context_text, str):
        context_text = str(context_text)

    # Check for markdown table format (both standard and newline format)
    if '|' in context_text and ('\n|' in context_text or '\n-' in context_text):
        inner_html = process_table_with_highlights(context_text)
    else:
        # Regular text with potential highlights
        inner_html = process_highlights(context_text)

    return f'<div class="context-item{extra_class}">{inner_html}</div>'


def get_context_html(example, show_full=False):
    """
    Formats the context chunks into an HTML string for display using specific CSS classes.
    Includes an alert for insufficient context and applies highlighting.

    Parameters:
    - example: The example data containing contexts
    - show_full: Boolean indicating whether to show full context

    Returns:
    - An HTML string: an optional "Insufficient Context" alert followed by a
      ``context-items-container`` div holding one ``context-item`` per chunk.
      When ``show_full`` is set but no full contexts exist, the highlighted
      contexts are shown instead.
    """
    parts = []

    # Add insufficient context warning if needed. A missing (None/NaN) flag
    # counts as "not insufficient" rather than as truthy.
    insufficient = example.get("insufficient", False)
    if not _is_missing_value(insufficient) and insufficient:
        insufficient_reason = example.get("insufficient_reason", "")
        # Only a non-blank string is a usable reason; NaN would otherwise be
        # rendered literally as "nan".
        if isinstance(insufficient_reason, str) and insufficient_reason.strip():
            reason_html = f"<p>{insufficient_reason}</p>"
        else:
            reason_html = "<p>The context may not contain enough information to fully answer the question, or the question might be ambiguous. Models should ideally indicate this limitation or refuse to answer.</p>"

        parts.append(f"""
        <div class="insufficient-alert">
            <strong>
                <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align: middle; margin-right: 5px;">
                    <path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z"></path>
                    <line x1="12" y1="9" x2="12" y2="13"></line>
                    <line x1="12" y1="17" x2="12.01" y2="17"></line>
                </svg>
                Insufficient Context
            </strong>
            {reason_html}
        </div>
        """)

    # Create container div for all context items
    parts.append('<div class="context-items-container">')

    full_contexts = example.get("full_contexts")
    highlighted_contexts = example.get("contexts")

    if show_full and full_contexts:
        # Full view: one item per chunk, no primary marking
        parts.extend(_render_context_item(item) for item in full_contexts)
    elif highlighted_contexts:
        # Highlight view: primary chunks get an extra CSS class
        for item in highlighted_contexts:
            is_primary = isinstance(item, dict) and item.get('is_primary', False)
            extra_class = " primary-context" if is_primary else ""
            parts.append(_render_context_item(item, extra_class))
    else:
        # If no contexts available, show a message
        parts.append('<div class="context-item">No context available. Try toggling to full context view.</div>')

    # Close the container div
    parts.append('</div>')

    return ''.join(parts)
utils/data_loader.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import pandas as pd
4
+ import random
5
+ import re
6
+ from .context_processor import process_highlights
7
+
8
def load_arena_data():
    """
    Read ``utils/arena_df.csv`` (relative to the working directory) into a DataFrame.

    Returns:
    - The loaded DataFrame, or an empty DataFrame when the file cannot be
      read; the error is printed rather than raised.
    """
    try:
        arena_df = pd.read_csv(os.path.join('utils', 'arena_df.csv'))
        print(f"Loaded arena data with {len(arena_df)} examples")
    except Exception as e:
        print(f"Error loading arena data: {e}")
        # Callers test ``.empty`` to detect that nothing was loaded.
        arena_df = pd.DataFrame()
    return arena_df
25
+
26
def create_dummy_example():
    """
    Creates a placeholder example used when no data could be loaded.

    The context entries are dicts with a 'content' key, the same shape
    ``get_random_example`` produces, so the placeholder can go through the
    same rendering and summary code as a real example. The legacy
    ``full_context`` string key is kept for backward compatibility.
    """
    error_text = "Error loading context data."
    return {
        "question": "Could not load questions from the dataset. Please check the data file.",
        "processed_context_desc": "Error: Data not available",
        "contexts": [
            {'chunk_num': 1, 'content': "No context available", 'is_primary': False}
        ],
        "full_contexts": [
            {'chunk_num': 1, 'content': error_text}
        ],
        "full_context": error_text,
        "Answerable": False,
        "insufficient": True,
        "insufficient_reason": ""
    }
36
+
37
def _cell_or_default(value, default):
    """Return ``default`` when a cell is missing (None or float NaN), else the value."""
    if value is None or (isinstance(value, float) and value != value):
        return default
    return value


def _parse_full_contexts(contexts_raw):
    """Turn the raw ``contexts`` cell into a list of {'chunk_num', 'content'} dicts.

    A string is decoded as JSON; a value that is already a list is used
    directly. Anything that does not end up as a list yields an empty list.
    """
    contexts = json.loads(contexts_raw) if isinstance(contexts_raw, str) else contexts_raw
    if not isinstance(contexts, list):
        return []
    # chunk_num keeps the position in the original list, even when entries
    # without a 'content' key are skipped.
    return [
        {'chunk_num': i + 1, 'content': chunk.get('content', '')}
        for i, chunk in enumerate(contexts)
        if isinstance(chunk, dict) and 'content' in chunk
    ]


def _parse_highlighted_contexts(raw):
    """Turn the raw ``contexts_highlighted`` cell into a list of item dicts.

    Strings are decoded as JSON first, which handles escaped quotes inside
    the content correctly. If that does not yield a non-empty list of dicts,
    the original regex extraction of "type" / "abbreviatedContent" pairs is
    used as a fallback for malformed JSON.
    """
    if not isinstance(raw, str):
        # Already an object, not a string
        return list(raw) if isinstance(raw, (list, tuple)) else []

    try:
        parsed = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        parsed = None
    if isinstance(parsed, list) and parsed and all(isinstance(item, dict) for item in parsed):
        return parsed

    type_pattern = r'"type":\s*"(primary|secondary)"'
    content_pattern = r'"abbreviatedContent":\s*"([^"]*)"|"abbreviatedContent":\s*"([^"]*)'

    types = re.findall(type_pattern, raw)
    # Each match is a tuple (one entry per capture group); keep the non-empty one.
    contents = [
        next((s for s in match if s), "")
        for match in re.findall(content_pattern, raw)
    ]
    return [
        {'type': ctx_type, 'abbreviatedContent': content}
        for ctx_type, content in zip(types, contents)
    ]


def get_random_example():
    """
    Selects a random example from the loaded arena data.
    Returns the example data in a format compatible with the application.

    Returns:
    - A dict with the keys 'question', 'processed_context_desc',
      'Answerable', 'insufficient', 'insufficient_reason', 'full_contexts'
      (list of {'chunk_num', 'content'}) and 'contexts' (list of
      {'chunk_num', 'content', 'is_primary'} with highlights already
      converted to HTML). When no data is loaded, the dummy example from
      ``create_dummy_example`` is returned instead.

    Raises:
    - KeyError: if the data has no 'question' column
    """
    # Load the arena data
    df = load_arena_data()

    if df.empty:
        # If no data is loaded, return a dummy example
        return create_dummy_example()

    # Select a random row
    example = df.sample(1).iloc[0]

    # Missing cells (NaN) fall back to the same defaults as absent columns.
    processed_example = {
        "question": example['question'],
        "processed_context_desc": _cell_or_default(example.get('processed_context_desc', ''), ''),
        "Answerable": _cell_or_default(example.get('Answerable', True), True),
        "insufficient": _cell_or_default(example.get('insufficient', False), False),
        "insufficient_reason": _cell_or_default(example.get('insufficient_reason', ''), '')
    }

    # Process contexts - for full context
    try:
        processed_example["full_contexts"] = _parse_full_contexts(example['contexts'])
    except Exception as e:
        # Deliberately best-effort: a bad row must not break the UI.
        print(f"Error processing contexts: {e}")
        processed_example["full_contexts"] = []

    # Process highlighted contexts for display
    contexts_highlighted = []

    try:
        raw_highlighted = _cell_or_default(example.get('contexts_highlighted'), None)
        if raw_highlighted:
            for i, item in enumerate(_parse_highlighted_contexts(raw_highlighted)):
                if not isinstance(item, dict):
                    continue
                contexts_highlighted.append({
                    'chunk_num': i + 1,
                    # Process highlights using the standard format
                    'content': process_highlights(item.get('abbreviatedContent', '')),
                    'is_primary': item.get('type', 'secondary') == 'primary'
                })
    except Exception as e:
        # Deliberately best-effort; items collected so far are kept.
        print(f"Error processing highlighted contexts: {e}")

    # If we couldn't process the highlighted contexts, fall back to the full contexts
    if not contexts_highlighted and processed_example["full_contexts"]:
        contexts_highlighted = [
            {
                'chunk_num': i + 1,
                'content': ctx.get('content', ''),
                'is_primary': False
            }
            for i, ctx in enumerate(processed_example["full_contexts"])
        ]

    processed_example["contexts"] = contexts_highlighted

    return processed_example
153
+
154
def get_random_example_and_models(model_names):
    """
    Pick a random example plus two distinct models for positions A and B.

    Parameters:
    - model_names: Sequence of model names to draw from

    Returns:
    - A tuple ``(example, model_a_name, model_b_name)``
    """
    example = get_random_example()
    # random.sample draws without replacement, so the two names differ.
    chosen = random.sample(model_names, 2)
    return example, chosen[0], chosen[1]
utils/leaderboard.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import random
4
+ from .models import model_names
5
+
6
def load_leaderboard_data():
    """
    Loads the leaderboard data from the leaderboard CSV file.
    Returns the data in a format compatible with the application.

    Reads ``utils/arena_df_leaderboard.csv`` (columns: model, wins, losses,
    ties). If the file does not exist, every model in ``model_names`` is
    pre-populated with small random counts.

    The vote total is half the sum of all recorded outcomes. The previous
    expression ``wins + losses + ties // 2`` halved only the ties because of
    operator precedence.
    NOTE(review): this assumes each vote records one outcome for each of the
    two compared models - confirm against the vote-recording code.

    Returns:
    - A dict ``{"wins": {...}, "losses": {...}, "ties": {...}, "votes": int}``.
      On error the structure filled so far is returned and the error is
      printed.
    """
    outcome_keys = ("wins", "losses", "ties")

    # Initialize the results structure
    results = {"wins": {}, "losses": {}, "ties": {}, "votes": 0}

    try:
        # Define the path to the CSV file for leaderboard
        csv_path = os.path.join('utils', 'arena_df_leaderboard.csv')

        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)

            # Process the data into our structure
            for _, row in df.iterrows():
                model = row['model']
                for key in outcome_keys:
                    results[key][model] = row[key]
        else:
            # If file doesn't exist, pre-populate with some data
            for model in model_names:
                results["wins"][model] = random.randint(0, 10)
                results["losses"][model] = random.randint(0, 10)
                results["ties"][model] = random.randint(0, 5)

        # Sum every outcome first, then halve once, so no per-model rounding
        # is lost.
        total_outcomes = sum(sum(results[key].values()) for key in outcome_keys)
        results["votes"] = total_outcomes // 2

        return results
    except Exception as e:
        print(f"Error loading leaderboard data: {e}")
        # Return the initialized structure if file can't be loaded
        return results
48
+
49
def save_leaderboard_data(results):
    """
    Write the leaderboard counts to ``utils/arena_df_leaderboard.csv``.

    Parameters:
    - results: The results dictionary containing wins, losses, ties, and votes

    One row is written per model listed under ``results["wins"]``; a model
    with no entry under losses or ties gets 0 there. Errors are printed, not
    raised.
    """
    try:
        csv_path = os.path.join('utils', 'arena_df_leaderboard.csv')

        # One row per model, in the iteration order of the wins table.
        rows = [
            {
                'model': name,
                'wins': results["wins"].get(name, 0),
                'losses': results["losses"].get(name, 0),
                'ties': results["ties"].get(name, 0)
            }
            for name in results["wins"]
        ]

        pd.DataFrame(rows).to_csv(csv_path, index=False)
        print(f"Leaderboard data saved successfully to {csv_path}")
    except Exception as e:
        print(f"Error saving leaderboard data: {e}")
utils/models.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# --- Dummy Model Summaries ---
# Stand-in "models": each takes (context, question, answerable) and returns a
# canned summary string. The context argument is accepted but not used.
def _alpha_summary(context, question, answerable):
    verdict = 'answerable' if answerable else 'unanswerable'
    return f"Alpha Summary: Based on the context for '{question[:20]}...', it appears the question is {verdict}."


def _beta_summary(context, question, answerable):
    verdict = 'allow' if answerable else 'do not allow'
    return f"Beta Summary: Regarding '{question[:20]}...', the provided documents {verdict} for a conclusive answer based on the text."


def _gamma_summary(context, question, answerable):
    verdict = 'can' if answerable else 'cannot'
    return f"Gamma Summary: For the question '{question[:20]}...', I {verdict} provide a specific answer from the given text snippets."


def _delta_summary(context, question, answerable):
    verdict = 'sufficient' if answerable else 'insufficient'
    return f"Delta Summary: The context for '{question[:20]}...' is {verdict} to formulate a direct response. Therefore, I must refuse."


# Maps display name -> summary function
dummy_models = {
    "Model Alpha": _alpha_summary,
    "Model Beta": _beta_summary,
    "Model Gamma": _gamma_summary,
    "Model Delta (Refusal Specialist)": _delta_summary,
}

# List of model names for easy access
model_names = list(dummy_models)
12
+
13
def generate_summaries(example, model_a_name, model_b_name):
    """
    Produce one summary per assigned model for the given example.

    Parameters:
    - example: The example dict (reads 'contexts', 'full_contexts',
      'Answerable' and 'question')
    - model_a_name: Key into ``dummy_models`` for position A
    - model_b_name: Key into ``dummy_models`` for position B

    Returns:
    - A tuple ``(summary_a, summary_b)``
    """
    # Prefer the highlighted contexts; use the full contexts only when the
    # highlighted ones are absent or empty.
    if "contexts" in example and example["contexts"]:
        chunks = example["contexts"]
    elif "full_contexts" in example:
        chunks = example["full_contexts"]
    else:
        chunks = []

    # Plain-text view of the context: chunk contents joined by a separator.
    context_text = "\n---\n".join(
        [chunk["content"] for chunk in chunks
         if isinstance(chunk, dict) and "content" in chunk]
    )

    # Pass 'Answerable' status to models (they might use it)
    answerable = example.get("Answerable", True)
    question = example.get("question", "")

    summary_a, summary_b = [
        dummy_models[name](context_text, question, answerable)
        for name in (model_a_name, model_b_name)
    ]
    return summary_a, summary_b
utils/ui_helpers.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from .context_processor import get_context_html
3
+
4
def toggle_context_display(example, current_state):
    """
    Flip the context view between highlights and full context.

    Parameters:
    - example: The current example data
    - current_state: Boolean indicating if full context is already shown

    Returns:
    - A 3-tuple: the new state, an update carrying the re-rendered context
      HTML, and an update carrying the toggle button's text and CSS classes
    """
    showing_full = not current_state

    context_html = get_context_html(example, show_full=showing_full)

    # The button names the view the *next* click switches to, and carries the
    # "showing-full" class only while the full context is displayed.
    if showing_full:
        button_text = "Show Highlights"
        button_classes = ["context-toggle-button", "showing-full"]
    else:
        button_text = "Show Full Context"
        button_classes = ["context-toggle-button"]

    # Positional return values, not a dictionary
    return (
        showing_full,
        gr.update(value=context_html),
        gr.update(value=button_text, elem_classes=button_classes),
    )
29
+
30
def update_feedback(choice):
    """Return the checkbox selection unchanged, for storing as the feedback state."""
    # The value is passed straight through, not wrapped in a dictionary.
    selected = choice
    return selected