lisabdunlap commited on
Commit
646b99f
·
verified ·
1 Parent(s): 4a2140f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +261 -137
app.py CHANGED
@@ -1,142 +1,266 @@
1
  import json
2
- import re
3
- import argparse
 
 
 
4
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- # Load the JSONL file
7
- def load_jsonl(file_path):
8
- data = []
9
- with open(file_path, 'r') as f:
10
- for line in f:
11
- data.append(json.loads(line))
12
- return data
13
-
14
- def display_pairwise_answer(data):
15
- chat_mds = pairwise_to_gradio_chat_mds(data)
16
- return chat_mds
17
-
18
- newline_pattern1 = re.compile("\n\n(\d+\. )")
19
- newline_pattern2 = re.compile("\n\n(- )")
20
-
21
- def post_process_answer(x):
22
- return x
23
-
24
- def pairwise_to_gradio_chat_mds(data):
25
- end = data["turn"] * 3
26
- ans_a = data["conversation_a"]
27
- ans_b = data["conversation_b"]
28
-
29
- mds = [""] * end
30
- base = 0
31
- for i in range(0, end, 3):
32
- mds[i] = "## User Prompt\n" + data["conversation_a"][base]["content"].strip()
33
- mds[i + 1] = f"## {data['model_a']}\n" + post_process_answer(ans_a[base + 1]["content"].strip())
34
- mds[i + 2] = f"## {data['model_b']}\n" + post_process_answer(ans_b[base + 1]["content"].strip())
35
- base += 2
36
-
37
- if data["winner"] == "tie":
38
- winner = "tie"
39
- elif data["winner"] == "model_a":
40
- winner = "gemini" if "gemini" in data["model_a"] else "opponent"
41
- else:
42
- winner = "gemini" if "gemini" in data["model_b"] else "opponent"
43
-
44
- mds += [f"## Winner: {winner}"]
45
- mds += [""] * (16 - len(mds))
46
- return mds
47
-
48
- # Filtering functions
49
- def filter_by_language(language):
50
- return [item for item in data if item['language'] == language]
51
-
52
- def filter_by_winner(winner_model, filtered_data):
53
- if winner_model == "anyone":
54
- return [item for item in filtered_data]
55
- return [item for item in filtered_data if item['winner'] == winner_model]
56
-
57
- def filter_by_conversation_a_prefix(prefix, filtered_data):
58
- return [item for item in filtered_data if item['conversation_a'][0]["content"][:128] == prefix]
59
-
60
- # Create Gradio interface
61
- def update_winner_and_questions(language):
62
- filtered_data = filter_by_language(language)
63
- winners = ["anyone"] + sorted(set(item['winner'] for item in filtered_data))
64
- prefixes = [item['conversation_a'][0]["content"][:128] for item in filtered_data]
65
- return gr.update(choices=winners, value=winners[0]), gr.update(choices=prefixes, value=prefixes[0])
66
-
67
- def update_question_options(language, winner_model):
68
- filtered_data = filter_by_language(language)
69
- filtered_data = filter_by_winner(winner_model, filtered_data)
70
- prefixes = [item['conversation_a'][0]["content"][:128] for item in filtered_data]
71
- return gr.update(choices=prefixes, value=prefixes[0])
72
-
73
- def display_filtered_data(language, winner_model, prefix):
74
- filtered_data = filter_by_language(language)
75
- filtered_data = filter_by_winner(winner_model, filtered_data)
76
- filtered_data = filter_by_conversation_a_prefix(prefix, filtered_data)
77
- if len(filtered_data) == 0:
78
- return [""] * 16
79
- return pairwise_to_gradio_chat_mds(filtered_data[0])
80
-
81
- def next_question(language, winner_model, prefix):
82
- filtered_data = filter_by_language(language)
83
- filtered_data = filter_by_winner(winner_model, filtered_data)
84
- all_items = [item['conversation_a'][0]["content"][:128] for item in filtered_data]
85
- if prefix:
86
- i = all_items.index(prefix) + 1
87
- else:
88
- i = 0
89
-
90
- if i >= len(all_items):
91
- return gr.update(choices=all_items, value=all_items[-1])
92
- return gr.update(choices=all_items, value=all_items[i])
93
 
94
  if __name__ == "__main__":
95
- parser = argparse.ArgumentParser()
96
- parser.add_argument("--host", type=str, default="0.0.0.0")
97
- parser.add_argument("--port", type=int)
98
- parser.add_argument("--share", action="store_true")
99
- args = parser.parse_args()
100
-
101
- data = load_jsonl('gemini_battles.jsonl')
102
-
103
- default_lang = "English"
104
- filter_data = filter_by_language(language=default_lang)
105
- question_prefixes = [item['conversation_a'][0]["content"][:128] for item in filter_data]
106
- default_question = question_prefixes[2]
107
-
108
- with gr.Blocks() as demo:
109
- gr.Markdown(value="# Welcome to gemini-1.5-pro-api-0514 battles")
110
- with gr.Row():
111
- with gr.Column():
112
- languages = ["English"] + list(sorted(set([item['language'] for item in data if item['language'] != "English"])))
113
- language_dropdown = gr.Dropdown(label="Select Language", choices=languages, value=default_lang)
114
- with gr.Column():
115
- winners = ["anyone"] + sorted(set(item['winner'] for item in filter_data))
116
- winner_dropdown = gr.Dropdown(label="Winner Model", choices=winners, value="anyone")
117
-
118
- with gr.Row():
119
- with gr.Column(scale=5):
120
- question_dropdown = gr.Dropdown(label="Select Question", choices=question_prefixes, value=default_question)
121
- with gr.Column():
122
- next_button = gr.Button("Next Question")
123
-
124
- default_chat_mds = display_filtered_data(default_lang, "anyone", default_question)
125
- chat_mds = []
126
- for i in range(5):
127
- chat_mds.append(gr.Markdown(elem_id=f"user_question_{i+1}", value=default_chat_mds[len(chat_mds)]))
128
- with gr.Row():
129
- for j in range(2):
130
- with gr.Column(scale=100):
131
- chat_mds.append(gr.Markdown(value=default_chat_mds[len(chat_mds)]))
132
- if j == 0:
133
- with gr.Column(scale=1, min_width=8):
134
- gr.Markdown()
135
- chat_mds.append(gr.Markdown())
136
-
137
- language_dropdown.change(fn=update_winner_and_questions, inputs=language_dropdown, outputs=[winner_dropdown, question_dropdown])
138
- winner_dropdown.change(fn=update_question_options, inputs=[language_dropdown, winner_dropdown], outputs=question_dropdown)
139
- next_button.click(fn=next_question, inputs=[language_dropdown, winner_dropdown, question_dropdown], outputs=question_dropdown)
140
- question_dropdown.change(fn=display_filtered_data, inputs=[language_dropdown, winner_dropdown, question_dropdown], outputs=chat_mds)
141
-
142
- demo.launch(share=args.share)
 
1
  import json
2
+ import random
3
+ import html
4
+ import markdown
5
+ from typing import List, Dict, Any, Tuple
6
+
7
  import gradio as gr
8
+ import pandas as pd
9
+ from datasets import load_dataset
10
+
11
+ # df = pd.read_json("selected_battles.json")
12
+
13
+ # load arena battles
14
+ ds = load_dataset("lmarena-ai/arena-human-preference-100k", split="train")
15
+ battles = ds['train'].to_pandas()
16
+
17
+ # Expected columns in this dataset family:
18
+ # ['question_id','model_a','model_b','winner','conversation_a','conversation_b',
19
+ # 'turn','anony','language','tstamp','conv_metadata','is_code','is_refusal',
20
+ # 'dedup_tag','category_tag','judge_hash', ...]
21
+ # See HF card. ──> winner ∈ {model_a, model_b, tie, both_bad}; conversations are full threads. [oai_citation:1‡Hugging Face](https://huggingface.co/datasets/lmarena-ai/arena-human-preference-100k/blob/c9fe392b54cd08a0fd27777455318bac2e7b495c/README.md?utm_source=chatgpt.com)
22
+
23
+ # Dropdown options - sorted by frequency
24
+ def get_sorted_options(column_name):
25
+ if column_name not in df.columns:
26
+ return ["(Any)"]
27
+ value_counts = df[column_name].dropna().value_counts()
28
+ sorted_values = value_counts.index.tolist()
29
+ return ["(Any)"] + sorted_values
30
+
31
+ models_a = get_sorted_options("model_a")
32
+ models_b = get_sorted_options("model_b")
33
+ languages = get_sorted_options("language")
34
+
35
+ def _ensure_messages(x: Any) -> List[Dict[str, Any]]:
36
+ """
37
+ conversation_a / conversation_b can be:
38
+ - a Python list of {role, content} dicts
39
+ - a JSON string encoding that list
40
+ Normalize to a list of dicts with 'role' and 'content'.
41
+ """
42
+ if isinstance(x, list):
43
+ return x
44
+ if isinstance(x, str):
45
+ try:
46
+ val = json.loads(x)
47
+ if isinstance(val, list):
48
+ return val
49
+ except Exception:
50
+ pass
51
+ # Last resort: wrap as a single assistant message
52
+ return [{"role": "assistant", "content": str(x)}]
53
+
54
+ def _winner_text(row: pd.Series) -> str:
55
+ w = str(row.get("winner", "")).strip().lower()
56
+ mapping = {
57
+ "model_a": "Preference: Model A",
58
+ "model_b": "Preference: Model B",
59
+ "tie": "Preference: Tie",
60
+ "both_bad": "Preference: Tie (both bad)",
61
+ }
62
+ return mapping.get(w, "Preference: (unknown)")
63
+
64
+ def _bubble_html(messages: List[Dict[str, Any]], side_label: str) -> str:
65
+ """
66
+ Make a chat-like interface with proper user/assistant bubbles.
67
+ User messages are on the left, assistant messages on the right.
68
+ """
69
+ # Tailwind-like inline styles (no external CSS)
70
+ css = """
71
+ <style>
72
+ .chat-container {padding:12px; border-radius:16px; background:#fafafa; box-shadow:0 1px 3px rgba(0,0,0,.08);}
73
+ .model-label {font-weight:600; font-size:14px; margin-bottom:12px; opacity:.8; text-align:center;}
74
+ .message {margin:12px 0; display:flex; align-items:flex-start;}
75
+ .message.user {justify-content:flex-start;}
76
+ .message.assistant {justify-content:flex-end;}
77
+ .bubble {max-width:70%; padding:10px 14px; border-radius:18px; word-wrap:break-word;}
78
+ .bubble.user {background:#e9eef7; color:#2c3e50; margin-right:auto;}
79
+ .bubble.assistant {background:#eaf7ea; color:#2c3e50; margin-left:auto;}
80
+ .role-label {font-size:11px; font-weight:500; margin-bottom:4px; opacity:.7;}
81
+ .role-label.assistant {text-align:right;}
82
+ .bubble pre {background:#f5f5f5; padding:8px; border-radius:4px; overflow-x:auto; margin:8px 0;}
83
+ .bubble code {background:#f0f0f0; padding:2px 4px; border-radius:3px; font-family:monospace;}
84
+ .bubble p {margin:8px 0;}
85
+ .bubble ul, .bubble ol {margin:8px 0; padding-left:20px;}
86
+ .bubble blockquote {border-left:3px solid #ddd; padding-left:12px; margin:8px 0; color:#666;}
87
+ </style>
88
+ """
89
+ body = [f'<div class="chat-container">']
90
+
91
+ # Only show model label at top for User side
92
+ if side_label != "Assistant":
93
+ body.append(f'<div class="model-label">{side_label}</div>')
94
+
95
+ first_assistant_message = True
96
+ for m in messages:
97
+ role = (m.get("role") or "").lower()
98
+ content = str(m.get("content", "")).strip()
99
+ if not content:
100
+ continue
101
+
102
+ # Convert markdown to HTML
103
+ try:
104
+ rendered_content = markdown.markdown(content, extensions=['fenced_code', 'codehilite', 'tables'])
105
+ except:
106
+ # Fallback to escaped content if markdown rendering fails
107
+ rendered_content = html.escape(content)
108
+
109
+ if role in ("user", "system"):
110
+ role_display = "User" if role == "user" else "System"
111
+ body.append(f'''
112
+ <div class="message user">
113
+ <div>
114
+ <div class="role-label">{role_display}</div>
115
+ <div class="bubble user">{rendered_content}</div>
116
+ </div>
117
+ </div>
118
+ ''')
119
+ else:
120
+ # For assistant messages, include the model name in the first message
121
+ if first_assistant_message and side_label == "Assistant":
122
+ content = f"{side_label}: {content}"
123
+ try:
124
+ rendered_content = markdown.markdown(content, extensions=['fenced_code', 'codehilite', 'tables'])
125
+ except:
126
+ rendered_content = html.escape(content)
127
+ first_assistant_message = False
128
+
129
+ body.append(f'''
130
+ <div class="message assistant">
131
+ <div>
132
+ <div class="role-label assistant">Assistant</div>
133
+ <div class="bubble assistant">{rendered_content}</div>
134
+ </div>
135
+ </div>
136
+ ''')
137
+
138
+ body.append("</div>")
139
+ return css + "\n".join(body)
140
+
141
+ def filter_df(model_a_sel: str, model_b_sel: str, lang_sel: str) -> pd.DataFrame:
142
+ sub = df
143
+ if model_a_sel != "(Any)":
144
+ sub = sub[sub["model_a"] == model_a_sel]
145
+ if model_b_sel != "(Any)":
146
+ sub = sub[sub["model_b"] == model_b_sel]
147
+ if "language" in sub.columns and lang_sel != "(Any)":
148
+ sub = sub[sub["language"].astype(str) == lang_sel]
149
+ return sub.reset_index(drop=True)
150
+
151
+ def format_row(row: pd.Series) -> Tuple[str, str, str, str, str]:
152
+ # Prompt headline = first user message if present
153
+ msgs_a = _ensure_messages(row["conversation_a"])
154
+ msgs_b = _ensure_messages(row["conversation_b"])
155
+ first_user = ""
156
+ for m in msgs_a:
157
+ if (m.get("role") or "").lower() == "user":
158
+ first_user = str(m.get("content", "")).strip()
159
+ break
160
+
161
+ left = _bubble_html(msgs_a, f"Model A: {row['model_a']}")
162
+ right = _bubble_html(msgs_b, f"Model B: {row['model_b']}")
163
+
164
+ # Create a subtle preference footer with soft yellow background
165
+ preference_text = _winner_text(row)
166
+ footer_html = f"""
167
+ <div style="
168
+ background: #fff8e1;
169
+ color: #5d4037;
170
+ padding: 10px 16px;
171
+ margin: 12px 0;
172
+ border-radius: 6px;
173
+ font-weight: 600;
174
+ font-size: 14px;
175
+ text-align: center;
176
+ box-shadow: 0 1px 3px rgba(0,0,0,0.08);
177
+ border: 1px solid #ffcc02;
178
+ ">
179
+ {preference_text}
180
+ </div>
181
+ """
182
+
183
+ return "", left, right, footer_html, ""
184
+
185
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo")) as demo:
186
+ gr.Markdown("# Chatbot Arena Battle Viewer (100k)")
187
+ gr.Markdown(
188
+ "Filter by **Model A**, **Model B**, and **Language**, then browse side-by-side conversations. "
189
+ "Data: `lmarena-ai/arena-human-preference-100k`."
190
+ )
191
+
192
+ with gr.Row():
193
+ dd_a = gr.Dropdown(models_a, label="Model A", value="(Any)")
194
+ dd_b = gr.Dropdown(models_b, label="Model B", value="(Any)")
195
+ dd_l = gr.Dropdown(languages, label="Language", value=languages[0])
196
+
197
+ with gr.Row():
198
+ btn_rand = gr.Button("Random match")
199
+ btn_prev = gr.Button("◀ Prev")
200
+ btn_next = gr.Button("Next ▶")
201
+
202
+ st_indices = gr.State([])
203
+ st_ptr = gr.State(0)
204
+
205
+ header_md = gr.Markdown()
206
+ with gr.Row():
207
+ left_html = gr.HTML()
208
+ right_html = gr.HTML()
209
+ footer_md = gr.HTML()
210
+ meta_md = gr.Markdown()
211
+
212
+ def apply_filters(a, b, l):
213
+ sub = filter_df(a, b, l)
214
+ idxs = list(range(len(sub)))
215
+ ptr = 0 if idxs else -1
216
+ if ptr >= 0:
217
+ row = sub.iloc[ptr]
218
+ head, left, right, foot, meta = format_row(row)
219
+ else:
220
+ head = left = right = foot = meta = "_No rows match your filters._"
221
+ return idxs, ptr, head, left, right, foot, meta
222
+
223
+ def nav(a, b, l, indices, ptr, direction):
224
+ sub = filter_df(a, b, l)
225
+ if not len(sub):
226
+ return [], -1, "_No rows match your filters._", "", "", "", ""
227
+ idxs = list(range(len(sub)))
228
+ if ptr is None or ptr < 0 or ptr >= len(sub):
229
+ ptr = 0
230
+ if direction == "next":
231
+ ptr = (ptr + 1) % len(sub)
232
+ elif direction == "prev":
233
+ ptr = (ptr - 1) % len(sub)
234
+ row = sub.iloc[ptr]
235
+ head, left, right, foot, meta = format_row(row)
236
+ return idxs, ptr, head, left, right, foot, meta
237
+
238
+ def rand(a, b, l):
239
+ sub = filter_df(a, b, l)
240
+ if not len(sub):
241
+ return [], -1, "_No rows match your filters._", "", "", "", ""
242
+ r = random.randrange(len(sub))
243
+ row = sub.iloc[r]
244
+ head, left, right, foot, meta = format_row(row)
245
+ return list(range(len(sub))), r, head, left, right, foot, meta
246
+
247
+ # Auto-update when dropdowns change
248
+ dd_a.change(apply_filters, [dd_a, dd_b, dd_l],
249
+ [st_indices, st_ptr, header_md, left_html, right_html, footer_md, meta_md])
250
+ dd_b.change(apply_filters, [dd_a, dd_b, dd_l],
251
+ [st_indices, st_ptr, header_md, left_html, right_html, footer_md, meta_md])
252
+ dd_l.change(apply_filters, [dd_a, dd_b, dd_l],
253
+ [st_indices, st_ptr, header_md, left_html, right_html, footer_md, meta_md])
254
+
255
+ btn_next.click(nav, [dd_a, dd_b, dd_l, st_indices, st_ptr, gr.State("next")],
256
+ [st_indices, st_ptr, header_md, left_html, right_html, footer_md, meta_md])
257
+ btn_prev.click(nav, [dd_a, dd_b, dd_l, st_indices, st_ptr, gr.State("prev")],
258
+ [st_indices, st_ptr, header_md, left_html, right_html, footer_md, meta_md])
259
+ btn_rand.click(rand, [dd_a, dd_b, dd_l],
260
+ [st_indices, st_ptr, header_md, left_html, right_html, footer_md, meta_md])
261
 
262
+ gr.on([demo.load], apply_filters, [dd_a, dd_b, dd_l],
263
+ [st_indices, st_ptr, header_md, left_html, right_html, footer_md, meta_md])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
if __name__ == "__main__":
    # Start the Gradio server only when run as a script (not on import).
    demo.launch()