AdnanElAssadi commited on
Commit
9f8b4b9
·
verified ·
1 Parent(s): e1f1819

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +707 -261
app.py CHANGED
@@ -3,69 +3,41 @@ import json
3
  import os
4
  from pathlib import Path
5
 
6
- # Create a minimal demo with hardcoded sample data
7
- SAMPLE_TASK = {
8
- "task_name": "Demo Reranking Task",
9
- "task_type": "reranking",
10
- "instructions": "Rank the documents from most relevant to least relevant for the given query.",
11
- "samples": [
12
- {
13
- "id": "sample_1",
14
- "query": "How do I install Python on Windows?",
15
- "candidates": [
16
- "To install Python on Windows, go to python.org and download the latest installer. Run the installer and make sure to check 'Add Python to PATH' during installation.",
17
- "Python is a popular programming language used for web development, data analysis, and machine learning.",
18
- "Windows is an operating system developed by Microsoft. It's the most popular desktop operating system worldwide.",
19
- "Installing software on Windows typically involves downloading an installer and running it with administrator privileges.",
20
- "Programming languages like Python, JavaScript, and Java are essential tools for modern software development."
21
- ]
22
- },
23
- {
24
- "id": "sample_2",
25
- "query": "What are the benefits of exercise?",
26
- "candidates": [
27
- "Regular exercise improves cardiovascular health, strengthens muscles, and can help with weight management.",
28
- "Exercise releases endorphins which can improve mood and reduce feelings of depression and anxiety.",
29
- "A balanced diet is important for maintaining good health and providing energy for daily activities.",
30
- "Regular physical activity can reduce the risk of chronic diseases such as heart disease, diabetes, and certain cancers.",
31
- "Sleep is essential for recovery and overall health, with most adults needing 7-9 hours per night."
32
- ]
33
- }
34
- ]
35
- }
36
-
37
  def create_reranking_interface(task_data):
38
  """Create a Gradio interface for reranking evaluation."""
39
  samples = task_data["samples"]
40
  results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
41
  completed_samples = {s["id"]: False for s in samples}
42
 
43
- # Store the current document order for the active sample
44
- current_order = []
45
-
46
- def save_ranking(sample_id):
47
- """Save the current document ordering as rankings."""
48
  try:
49
- if not current_order:
50
- return "⚠️ No document ordering found", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
 
 
51
 
52
- # Convert document positions to rankings
53
- # First document (position 0) gets rank 1, etc.
54
- rankings = []
55
- for i, doc_idx in enumerate(current_order):
56
- rankings.append(i + 1) # Convert to 1-based ranks
57
 
 
 
 
 
58
  # Store this annotation in memory
59
  existing_idx = next((i for i, a in enumerate(results["annotations"]) if a["sample_id"] == sample_id), None)
60
  if existing_idx is not None:
61
  results["annotations"][existing_idx] = {
62
  "sample_id": sample_id,
63
- "rankings": rankings
64
  }
65
  else:
66
  results["annotations"].append({
67
  "sample_id": sample_id,
68
- "rankings": rankings
69
  })
70
 
71
  completed_samples[sample_id] = True
@@ -76,12 +48,10 @@ def create_reranking_interface(task_data):
76
  with open(output_path, "w") as f:
77
  json.dump(results, f, indent=2)
78
  return f"✅ Rankings saved successfully (in memory and to file)", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
79
- except Exception as write_error:
80
- print(f"Error writing results file: {str(write_error)}")
81
  # If file saving fails, still mark as success since we saved in memory
82
- return f"✅ Rankings saved in memory (file save failed: {str(write_error)})", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
83
  except Exception as e:
84
- print(f"Error in save_ranking: {str(e)}")
85
  # Return specific error message
86
  return f"Error: {str(e)}", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
87
 
@@ -96,11 +66,12 @@ def create_reranking_interface(task_data):
96
 
97
  ### How to use this interface:
98
  1. Read the query at the top
99
- 2. Review each document in the list
100
- 3. Use the "Move Up" and "Move Down" buttons to arrange documents by relevance
101
- (most relevant at the top, least relevant at the bottom)
102
- 4. Click "Submit Rankings" when you're done with the current query
103
- 5. Use "Previous" and "Next" to navigate between queries
 
104
  """.format(instructions=task_data["instructions"]))
105
 
106
  current_sample_id = gr.State(value=samples[0]["id"])
@@ -113,236 +84,711 @@ def create_reranking_interface(task_data):
113
  gr.Markdown("## Query:")
114
  query_text = gr.Textbox(value=samples[0]["query"], label="", interactive=False)
115
 
116
- gr.Markdown("## Documents (Arrange in order of relevance, most relevant at top):")
117
 
118
- # Create simple document list with move up/down buttons
119
- document_containers = []
 
120
 
121
- # Function to initialize the document list for a sample
122
- def initialize_document_list(sample_id):
123
- nonlocal current_order
124
- try:
125
- sample = next((s for s in samples if s["id"] == sample_id), None)
126
- if not sample:
127
- return "Query not found", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
128
-
129
- # Get the documents for this sample
130
- docs = sample["candidates"]
131
-
132
- # Initialize document order (0, 1, 2, ..., n-1)
133
- current_order = list(range(len(docs)))
134
-
135
- # Check if this sample has already been annotated to restore ordering
136
- existing_annotation = next((a for a in results["annotations"] if a["sample_id"] == sample_id), None)
137
- if existing_annotation and "rankings" in existing_annotation:
138
- # Create pairs of (doc_idx, rank)
139
- ranked_docs = []
140
- for doc_idx, rank in enumerate(existing_annotation["rankings"]):
141
- ranked_docs.append((doc_idx, rank))
142
-
143
- # Sort by rank (ascending)
144
- ranked_docs.sort(key=lambda x: x[1])
 
 
 
 
 
 
145
 
146
- # Extract document indices in rank order
147
- current_order = [doc[0] for doc in ranked_docs]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- # Update UI
150
- for i in range(len(document_containers)):
151
- if i < len(docs):
152
- doc_idx = current_order[i]
153
- document_containers[i].value = f"Document {doc_idx+1} (Rank {i+1}): {docs[doc_idx]}"
154
- else:
155
- document_containers[i].value = "" # Clear unused containers
 
156
 
157
- # Status message
158
- status = f"Viewing query {samples.index(sample) + 1} of {len(samples)}"
159
- if completed_samples[sample_id]:
160
- status += " (already completed)"
 
 
161
 
162
- return status, f"Progress: {sum(completed_samples.values())}/{len(samples)}"
163
- except Exception as e:
164
- print(f"Error in initialize_document_list: {str(e)}")
165
- return f"Error initializing documents: {str(e)}", "Progress: 0/0"
166
-
167
- # Create document display containers with up/down buttons
168
- with gr.Column():
169
- # Display up to 10 documents (or however many are in the largest sample)
170
- max_docs = max(len(s["candidates"]) for s in samples)
171
- for i in range(max_docs):
172
- with gr.Group():
173
- doc_text = gr.Textbox(label=f"Document {i+1}", interactive=False)
174
- document_containers.append(doc_text)
175
-
176
- with gr.Row():
177
- up_btn = gr.Button(f"⬆️ Move Up", size="sm")
178
- down_btn = gr.Button(f"⬇️ Move Down", size="sm")
179
 
180
- # Create closures for up/down buttons
181
- def make_up_handler(idx):
182
- def up_handler():
183
- nonlocal current_order
184
- if idx > 0:
185
- # Swap with document above
186
- current_order[idx], current_order[idx-1] = current_order[idx-1], current_order[idx]
187
-
188
- # Get current sample
189
- sample = next((s for s in samples if s["id"] == current_sample_id.value), None)
190
- if sample:
191
- docs = sample["candidates"]
192
- # Update document displays
193
- for j in range(len(document_containers)):
194
- if j < len(docs):
195
- doc_idx = current_order[j]
196
- document_containers[j].value = f"Document {doc_idx+1} (Rank {j+1}): {docs[doc_idx]}"
197
-
198
- # Return empty list since we update the containers directly
199
- return {}
200
- return up_handler
201
 
202
- def make_down_handler(idx):
203
- def down_handler():
204
- nonlocal current_order
205
- if idx < len(current_order) - 1:
206
- # Swap with document below
207
- current_order[idx], current_order[idx+1] = current_order[idx+1], current_order[idx]
208
-
209
- # Get current sample
210
- sample = next((s for s in samples if s["id"] == current_sample_id.value), None)
211
- if sample:
212
- docs = sample["candidates"]
213
- # Update document displays
214
- for j in range(len(document_containers)):
215
- if j < len(docs):
216
- doc_idx = current_order[j]
217
- document_containers[j].value = f"Document {doc_idx+1} (Rank {j+1}): {docs[doc_idx]}"
218
-
219
- # Return empty list since we update the containers directly
220
- return {}
221
- return down_handler
222
 
223
- # Connect buttons
224
- up_btn.click(
225
- fn=make_up_handler(i),
226
- inputs=None,
227
- outputs=None
228
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
- down_btn.click(
231
- fn=make_down_handler(i),
232
- inputs=None,
233
- outputs=None
234
- )
235
-
236
- with gr.Row():
237
- prev_btn = gr.Button("← Previous Query", size="sm")
238
- submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary")
239
- next_btn = gr.Button("Next Query →", size="sm")
240
-
241
- # Navigation functions
242
- def nav_to_prev(current_id):
243
- try:
244
- current_sample = next((s for s in samples if s["id"] == current_id), None)
245
- if not current_sample:
246
- return current_id
247
 
248
- current_idx = samples.index(current_sample)
249
- if current_idx > 0:
250
- prev_sample = samples[current_idx - 1]
251
- return prev_sample["id"]
252
- return current_id
253
- except Exception as e:
254
- print(f"Error in nav_to_prev: {str(e)}")
255
- return current_id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
- def nav_to_next(current_id):
258
- try:
259
- current_sample = next((s for s in samples if s["id"] == current_id), None)
260
- if not current_sample:
261
- return current_id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
 
263
- current_idx = samples.index(current_sample)
264
- if current_idx < len(samples) - 1:
265
- next_sample = samples[current_idx + 1]
266
- return next_sample["id"]
267
- return current_id
268
- except Exception as e:
269
- print(f"Error in nav_to_next: {str(e)}")
270
- return current_id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
- # Update query text
273
- def update_query(sample_id):
274
- try:
275
- sample = next((s for s in samples if s["id"] == sample_id), None)
276
- if not sample:
277
- return ""
278
- return sample["query"]
279
- except Exception as e:
280
- print(f"Error in update_query: {str(e)}")
281
- return "Error loading query"
282
-
283
- # Initialize the first sample
284
- initialize_document_list(samples[0]["id"])
285
-
286
- # Connect navigation buttons
287
- prev_btn.click(
288
- nav_to_prev,
289
- inputs=[current_sample_id],
290
- outputs=[current_sample_id]
291
- ).then(
292
- initialize_document_list,
293
- inputs=[current_sample_id],
294
- outputs=[status_box, progress_text]
295
- ).then(
296
- update_query,
297
- inputs=[current_sample_id],
298
- outputs=[query_text]
299
- )
300
 
301
- next_btn.click(
302
- nav_to_next,
303
- inputs=[current_sample_id],
304
- outputs=[current_sample_id]
305
- ).then(
306
- initialize_document_list,
307
- inputs=[current_sample_id],
308
- outputs=[status_box, progress_text]
309
- ).then(
310
- update_query,
311
- inputs=[current_sample_id],
312
- outputs=[query_text]
313
  )
314
 
315
- # Connect submit button
316
- submit_btn.click(
317
- save_ranking,
318
- inputs=[current_sample_id],
319
- outputs=[status_box, progress_text]
 
 
 
 
 
 
 
320
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
  return demo
323
 
324
- # Simple Gradio app with hardcoded sample data
325
- with gr.Blocks(theme=gr.themes.Soft()) as app:
326
  gr.Markdown("# MTEB Human Evaluation Demo")
327
- gr.Markdown("## Sample Reranking Task")
328
 
329
- # Create and display the demo interface
330
- demo = create_reranking_interface(SAMPLE_TASK)
331
-
332
- if __name__ == "__main__":
333
- try:
334
- print("Starting MTEB Human Evaluation App with hardcoded sample data...")
335
- print(f"Current directory: {os.getcwd()}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
 
337
- # Launch with hardcoded sample data
338
- app.launch(debug=True)
339
- except Exception as e:
340
- print(f"ERROR STARTING APP: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
- # Create an ultra-simple fallback app
343
- with gr.Blocks() as fallback_app:
344
- gr.Markdown("# MTEB Human Evaluation - Emergency Fallback Mode")
345
- gr.Markdown("There was an error loading the application. Please see details below.")
346
- gr.Textbox(value=str(e), label="Error", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
- fallback_app.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import os
4
  from pathlib import Path
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  def create_reranking_interface(task_data):
7
  """Create a Gradio interface for reranking evaluation."""
8
  samples = task_data["samples"]
9
  results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
10
  completed_samples = {s["id"]: False for s in samples}
11
 
12
+ def save_ranking(rankings, sample_id):
13
+ """Save the current set of rankings."""
 
 
 
14
  try:
15
+ # Check if all documents have rankings
16
+ all_ranked = all(r is not None and r != "" for r in rankings)
17
+ if not all_ranked:
18
+ return "⚠️ Please assign a rank to all documents before submitting", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
19
 
20
+ # Convert rankings to integers with better error handling
21
+ try:
22
+ processed_rankings = [int(r) for r in rankings]
23
+ except ValueError:
24
+ return "⚠️ Invalid ranking value. Please use only numbers.", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
25
 
26
+ # Check for duplicate rankings
27
+ if len(set(processed_rankings)) != len(processed_rankings):
28
+ return "⚠️ Each document must have a unique rank. Please review your rankings.", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
29
+
30
  # Store this annotation in memory
31
  existing_idx = next((i for i, a in enumerate(results["annotations"]) if a["sample_id"] == sample_id), None)
32
  if existing_idx is not None:
33
  results["annotations"][existing_idx] = {
34
  "sample_id": sample_id,
35
+ "rankings": processed_rankings
36
  }
37
  else:
38
  results["annotations"].append({
39
  "sample_id": sample_id,
40
+ "rankings": processed_rankings
41
  })
42
 
43
  completed_samples[sample_id] = True
 
48
  with open(output_path, "w") as f:
49
  json.dump(results, f, indent=2)
50
  return f"✅ Rankings saved successfully (in memory and to file)", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
51
+ except:
 
52
  # If file saving fails, still mark as success since we saved in memory
53
+ return f"✅ Rankings saved in memory (file save failed)", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
54
  except Exception as e:
 
55
  # Return specific error message
56
  return f"Error: {str(e)}", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
57
 
 
66
 
67
  ### How to use this interface:
68
  1. Read the query at the top
69
+ 2. Review each document carefully
70
+ 3. Assign a rank to each document (1 = most relevant, higher numbers = less relevant)
71
+ 4. Each document must have a unique rank
72
+ 5. Click "Submit Rankings" when you're done with the current query
73
+ 6. Use "Previous" and "Next" to navigate between queries
74
+ 7. Click "Save All Results" periodically to ensure your work is saved
75
  """.format(instructions=task_data["instructions"]))
76
 
77
  current_sample_id = gr.State(value=samples[0]["id"])
 
84
  gr.Markdown("## Query:")
85
  query_text = gr.Textbox(value=samples[0]["query"], label="", interactive=False)
86
 
87
+ gr.Markdown("## Documents to Rank:")
88
 
89
+ # Create document displays and ranking dropdowns in synchronized pairs
90
+ doc_containers = []
91
+ ranking_dropdowns = []
92
 
93
+ with gr.Column():
94
+ for i, doc in enumerate(samples[0]["candidates"]):
95
+ with gr.Row():
96
+ doc_box = gr.Textbox(
97
+ value=doc,
98
+ label=f"Document {i+1}",
99
+ interactive=False,
100
+ elem_classes="doc-box"
101
+ )
102
+ dropdown = gr.Dropdown(
103
+ choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
104
+ label=f"Rank",
105
+ value="",
106
+ elem_classes="ranking-dropdown"
107
+ )
108
+ # Add Quick Rank buttons for fast selection
109
+ with gr.Column(scale=1, min_width=120):
110
+ gr.Markdown(f"Quick Rank", elem_classes="quick-rank-label")
111
+ with gr.Row():
112
+ # Add first 5 rank buttons (or fewer if there are fewer candidates)
113
+ num_buttons = min(5, len(samples[0]["candidates"]))
114
+ for r in range(1, num_buttons + 1):
115
+ button = gr.Button(f"{r}", size="sm", elem_classes=f"quick-rank-btn quick-rank-btn-{i}-{r}")
116
+ # Use JavaScript to set the dropdown value when clicked
117
+ button.click(
118
+ None,
119
+ [],
120
+ [],
121
+ _js=f"() => {{ document.querySelectorAll('.ranking-dropdown')[{i}].value = '{r}'; return []; }}"
122
+ )
123
 
124
+ doc_containers.append(doc_box)
125
+ ranking_dropdowns.append(dropdown)
126
+
127
+ # Add keyboard shortcuts explanation
128
+ with gr.Accordion("Keyboard Shortcuts", open=False):
129
+ gr.Markdown("""
130
+ ### Keyboard Shortcuts
131
+ - When a document text box is focused:
132
+ - Press number keys (1-9) to assign rankings quickly
133
+ - Navigation:
134
+ - Press 'n' to go to the next query
135
+ - Press 'p' to go to the previous query
136
+ - Press 's' to submit the current rankings
137
+ """)
138
+
139
+ # Add JavaScript for keyboard shortcuts
140
+ gr.HTML("""
141
+ <script>
142
+ document.addEventListener('DOMContentLoaded', function() {
143
+ // Wait for Gradio elements to be fully loaded
144
+ setTimeout(() => {
145
+ // Get all document textboxes
146
+ const docBoxes = document.querySelectorAll('.doc-box');
147
+ const dropdowns = document.querySelectorAll('.ranking-dropdown');
148
 
149
+ // Add event listeners to document boxes
150
+ docBoxes.forEach((box, index) => {
151
+ box.addEventListener('click', function() {
152
+ // Mark this box as active for keyboard shortcuts
153
+ docBoxes.forEach(b => b.classList.remove('active-doc'));
154
+ box.classList.add('active-doc');
155
+ });
156
+ });
157
 
158
+ // Add event listeners to dropdowns for color coding
159
+ dropdowns.forEach((dropdown, index) => {
160
+ dropdown.addEventListener('change', function() {
161
+ updateDropdownColor(dropdown, docBoxes[index]);
162
+ });
163
+ });
164
 
165
+ // Function to update color based on rank
166
+ function updateDropdownColor(dropdown, docBox) {
167
+ const value = dropdown.value;
168
+ if (!value) return;
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
+ // Remove existing color classes
171
+ dropdown.classList.remove('rank-1', 'rank-2', 'rank-3', 'rank-4', 'rank-5', 'rank-high');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
+ // Add appropriate color class
174
+ if (value === '1') dropdown.classList.add('rank-1');
175
+ else if (value === '2') dropdown.classList.add('rank-2');
176
+ else if (value === '3') dropdown.classList.add('rank-3');
177
+ else if (value === '4') dropdown.classList.add('rank-4');
178
+ else if (value === '5') dropdown.classList.add('rank-5');
179
+ else dropdown.classList.add('rank-high');
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
+ // Add highlighting to document box
182
+ docBox.classList.add('ranked-doc');
183
+ }
184
+
185
+ // Add global keyboard listener
186
+ document.addEventListener('keydown', function(e) {
187
+ // Number keys 1-9 for ranking
188
+ if (e.key >= '1' && e.key <= '9') {
189
+ const activeDoc = document.querySelector('.active-doc');
190
+ if (activeDoc) {
191
+ const index = Array.from(docBoxes).indexOf(activeDoc);
192
+ const dropdown = document.querySelectorAll('.ranking-dropdown')[index];
193
+ if (dropdown) {
194
+ dropdown.value = e.key;
195
+ dropdown.dispatchEvent(new Event('change'));
196
+ updateDropdownColor(dropdown, activeDoc);
197
+ }
198
+ }
199
+ }
200
 
201
+ // Navigation shortcuts
202
+ if (e.key === 'n') {
203
+ // Next query
204
+ document.querySelector('#next-btn').click();
205
+ } else if (e.key === 'p') {
206
+ // Previous query
207
+ document.querySelector('#prev-btn').click();
208
+ } else if (e.key === 's') {
209
+ // Submit rankings
210
+ document.querySelector('#submit-btn').click();
211
+ }
212
+ });
 
 
 
 
 
213
 
214
+ // Add some CSS for active document
215
+ const style = document.createElement('style');
216
+ style.textContent = `
217
+ .active-doc {
218
+ border-left: 3px solid #3B82F6 !important;
219
+ background-color: rgba(59, 130, 246, 0.05) !important;
220
+ }
221
+ .ranked-doc {
222
+ border-bottom: 2px solid #4ADE80 !important;
223
+ }
224
+ .rank-1 {
225
+ background-color: rgba(74, 222, 128, 0.2) !important;
226
+ font-weight: bold !important;
227
+ }
228
+ .rank-2 {
229
+ background-color: rgba(74, 222, 128, 0.15) !important;
230
+ }
231
+ .rank-3 {
232
+ background-color: rgba(251, 191, 36, 0.15) !important;
233
+ }
234
+ .rank-4 {
235
+ background-color: rgba(251, 191, 36, 0.1) !important;
236
+ }
237
+ .rank-5 {
238
+ background-color: rgba(239, 68, 68, 0.1) !important;
239
+ }
240
+ .rank-high {
241
+ background-color: rgba(239, 68, 68, 0.05) !important;
242
+ }
243
+ .quick-rank-label {
244
+ margin-bottom: 0 !important;
245
+ font-size: 0.8rem !important;
246
+ opacity: 0.8;
247
+ }
248
+ .quick-rank-btn {
249
+ min-width: 20px !important;
250
+ height: 24px !important;
251
+ line-height: 1 !important;
252
+ padding: 2px 6px !important;
253
+ }
254
+ `;
255
+ document.head.appendChild(style);
256
+ }, 1000);
257
+ });
258
+ </script>
259
+ """)
260
 
261
+ # Add visual ranking mode option
262
+ with gr.Row():
263
+ visual_mode_btn = gr.Button("Toggle Visual Ranking Mode", size="sm")
264
+ reset_rankings_btn = gr.Button("Reset Rankings", size="sm", variant="secondary")
265
+
266
+ # Visual ranking display
267
+ with gr.Column(visible=False) as visual_ranking_container:
268
+ gr.Markdown("## Current Rankings (Most to Least Relevant)")
269
+ ranked_display = gr.HTML("No rankings yet")
270
+
271
+ # Function to toggle visual ranking mode
272
+ def toggle_visual_mode(visible):
273
+ return not visible
274
+
275
+ # Function to update visual ranking display
276
+ def update_visual_ranking(*rankings):
277
+ # Convert to integers with error handling
278
+ clean_rankings = []
279
+ for r in rankings:
280
+ try:
281
+ if r and r.strip():
282
+ clean_rankings.append(int(r))
283
+ else:
284
+ clean_rankings.append(None)
285
+ except ValueError:
286
+ clean_rankings.append(None)
287
+
288
+ # Check if any rankings exist
289
+ if not any(r is not None for r in clean_rankings):
290
+ return "<p>No rankings assigned yet.</p>"
291
+
292
+ # Create sorted order
293
+ ranked_indices = []
294
+ for rank in range(1, len(clean_rankings) + 1):
295
+ try:
296
+ idx = clean_rankings.index(rank)
297
+ ranked_indices.append(idx)
298
+ except ValueError:
299
+ pass
300
+
301
+ # Build HTML
302
+ html = "<div class='visual-ranking'>"
303
+ for i, idx in enumerate(ranked_indices):
304
+ rank = i + 1
305
+ doc_text = doc_containers[idx].value
306
 
307
+ # Apply color classes based on rank
308
+ rank_class = ""
309
+ if rank == 1:
310
+ rank_class = "visual-rank-1"
311
+ elif rank == 2:
312
+ rank_class = "visual-rank-2"
313
+ elif rank == 3:
314
+ rank_class = "visual-rank-3"
315
+ elif rank <= 5:
316
+ rank_class = "visual-rank-45"
317
+ else:
318
+ rank_class = "visual-rank-high"
319
+
320
+ html += f"""
321
+ <div class='visual-rank-item {rank_class}'>
322
+ <div class='visual-rank-number'>{rank}</div>
323
+ <div class='visual-rank-content'>{doc_text}</div>
324
+ </div>
325
+ """
326
+
327
+ # Add unranked items if any
328
+ unranked_indices = [i for i, r in enumerate(clean_rankings) if r is None]
329
+ if unranked_indices:
330
+ html += "<h3>Unranked Documents</h3>"
331
+ for idx in unranked_indices:
332
+ doc_text = doc_containers[idx].value
333
+ html += f"""
334
+ <div class='visual-rank-item visual-rank-unranked'>
335
+ <div class='visual-rank-number'>?</div>
336
+ <div class='visual-rank-content'>{doc_text}</div>
337
+ </div>
338
+ """
339
+
340
+ html += "</div>"
341
+
342
+ # Add CSS
343
+ html += """
344
+ <style>
345
+ .visual-ranking {
346
+ margin-top: 15px;
347
+ }
348
+ .visual-rank-item {
349
+ display: flex;
350
+ margin-bottom: 15px;
351
+ padding: 10px;
352
+ border-radius: 8px;
353
+ }
354
+ .visual-rank-number {
355
+ font-size: 18px;
356
+ font-weight: bold;
357
+ margin-right: 10px;
358
+ min-width: 30px;
359
+ height: 30px;
360
+ border-radius: 15px;
361
+ background-color: #e5e7eb;
362
+ display: flex;
363
+ align-items: center;
364
+ justify-content: center;
365
+ }
366
+ .visual-rank-content {
367
+ flex: 1;
368
+ }
369
+ .visual-rank-1 {
370
+ background-color: rgba(74, 222, 128, 0.2);
371
+ border-left: 4px solid #4ADE80;
372
+ }
373
+ .visual-rank-2 {
374
+ background-color: rgba(74, 222, 128, 0.15);
375
+ border-left: 3px solid #4ADE80;
376
+ }
377
+ .visual-rank-3 {
378
+ background-color: rgba(251, 191, 36, 0.15);
379
+ border-left: 3px solid #FBBF24;
380
+ }
381
+ .visual-rank-45 {
382
+ background-color: rgba(251, 191, 36, 0.1);
383
+ border-left: 2px solid #FBBF24;
384
+ }
385
+ .visual-rank-high {
386
+ background-color: rgba(239, 68, 68, 0.05);
387
+ border-left: 2px solid #EF4444;
388
+ }
389
+ .visual-rank-unranked {
390
+ background-color: #f9fafb;
391
+ border: 1px dashed #d1d5db;
392
+ }
393
+ .visual-rank-unranked .visual-rank-number {
394
+ background-color: #d1d5db;
395
+ }
396
+ </style>
397
+ """
398
+
399
+ return html
400
 
401
+ # Function to reset all rankings
402
+ def reset_rankings():
403
+ return ["" for _ in ranking_dropdowns]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
 
405
+ # Connect events
406
+ visual_mode_btn.click(
407
+ toggle_visual_mode,
408
+ inputs=[visual_ranking_container],
409
+ outputs=[visual_ranking_container]
 
 
 
 
 
 
 
410
  )
411
 
412
+ # Update visual ranking when any dropdown changes
413
+ for dropdown in ranking_dropdowns:
414
+ dropdown.change(
415
+ update_visual_ranking,
416
+ inputs=ranking_dropdowns,
417
+ outputs=[ranked_display]
418
+ )
419
+
420
+ # Reset rankings button
421
+ reset_rankings_btn.click(
422
+ reset_rankings,
423
+ outputs=ranking_dropdowns
424
  )
425
+
426
+ with gr.Row():
427
+ prev_btn = gr.Button("← Previous Query", size="sm", elem_id="prev-btn")
428
+ submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary", elem_id="submit-btn")
429
+ next_btn = gr.Button("Next Query →", size="sm", elem_id="next-btn")
430
+
431
+ save_btn = gr.Button("💾 Save All Results", variant="secondary")
432
+
433
+ def load_sample(sample_id):
434
+ """Load a specific sample into the interface."""
435
+ sample = next((s for s in samples if s["id"] == sample_id), None)
436
+ if not sample:
437
+ return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_dropdowns) + [current_sample_id.value, progress_text.value, status_box.value]
438
+
439
+ # Update query
440
+ new_query = sample["query"]
441
+
442
+ # Update documents
443
+ new_docs = []
444
+ for i, doc in enumerate(sample["candidates"]):
445
+ if i < len(doc_containers):
446
+ new_docs.append(doc)
447
+
448
+ # Initialize rankings
449
+ new_rankings = [""] * len(ranking_dropdowns)
450
+
451
+ # Check if this sample has already been annotated
452
+ existing_annotation = next((a for a in results["annotations"] if a["sample_id"] == sample_id), None)
453
+ if existing_annotation:
454
+ # Restore previous rankings
455
+ for i, rank in enumerate(existing_annotation["rankings"]):
456
+ if i < len(new_rankings) and rank is not None:
457
+ new_rankings[i] = str(rank)
458
+
459
+ # Update progress
460
+ current_idx = samples.index(sample)
461
+ new_progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
462
+
463
+ new_status = f"Viewing query {current_idx + 1} of {len(samples)}"
464
+ if completed_samples[sample_id]:
465
+ new_status += " (already completed)"
466
+
467
+ return [new_query] + new_docs + new_rankings + [sample["id"], new_progress, new_status]
468
+
469
+ def next_sample(current_id):
470
+ """Load the next sample."""
471
+ current_sample = next((s for s in samples if s["id"] == current_id), None)
472
+ if not current_sample:
473
+ return current_id
474
+
475
+ current_idx = samples.index(current_sample)
476
+ if current_idx < len(samples) - 1:
477
+ next_sample = samples[current_idx + 1]
478
+ return next_sample["id"]
479
+ return current_id
480
+
481
+ def prev_sample(current_id):
482
+ """Load the previous sample."""
483
+ current_sample = next((s for s in samples if s["id"] == current_id), None)
484
+ if not current_sample:
485
+ return current_id
486
+
487
+ current_idx = samples.index(current_sample)
488
+ if current_idx > 0:
489
+ prev_sample = samples[current_idx - 1]
490
+ return prev_sample["id"]
491
+ return current_id
492
+
493
+ def save_results():
494
+ """Save all collected results to a file."""
495
+ output_path = f"{task_data['task_name']}_human_results.json"
496
+ with open(output_path, "w") as f:
497
+ json.dump(results, f, indent=2)
498
+ return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"
499
+
500
+ # Define a wrapper function that collects all the dropdown values into a list
501
+ def save_ranking_wrapper(*args):
502
+ # The last argument is the sample_id, all others are rankings
503
+ rankings = args[:-1]
504
+ sample_id = args[-1]
505
+ return save_ranking(rankings, sample_id)
506
+
507
+ # Connect events
508
+ submit_btn.click(
509
+ save_ranking_wrapper,
510
+ inputs=ranking_dropdowns + [current_sample_id],
511
+ outputs=[status_box, progress_text]
512
+ ).then(
513
+ update_visual_ranking,
514
+ inputs=ranking_dropdowns,
515
+ outputs=[ranked_display]
516
+ )
517
+
518
+ next_btn.click(
519
+ next_sample,
520
+ inputs=[current_sample_id],
521
+ outputs=[current_sample_id]
522
+ ).then(
523
+ load_sample,
524
+ inputs=[current_sample_id],
525
+ outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
526
+ ).then(
527
+ update_visual_ranking,
528
+ inputs=ranking_dropdowns,
529
+ outputs=[ranked_display]
530
+ )
531
+
532
+ prev_btn.click(
533
+ prev_sample,
534
+ inputs=[current_sample_id],
535
+ outputs=[current_sample_id]
536
+ ).then(
537
+ load_sample,
538
+ inputs=[current_sample_id],
539
+ outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
540
+ ).then(
541
+ update_visual_ranking,
542
+ inputs=ranking_dropdowns,
543
+ outputs=[ranked_display]
544
+ )
545
+
546
+ save_btn.click(save_results, outputs=[status_box])
547
 
548
  return demo
549
 
550
# Main app with file upload capability.
# `demo` is the top-level Blocks app launched by the __main__ guard below.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# MTEB Human Evaluation Demo")

    with gr.Tabs():
        # Tab 1: evaluate the most recently uploaded (or bundled) task.
        with gr.TabItem("Demo"):
            gr.Markdown("""
            ## MTEB Human Evaluation Interface

            This interface allows you to evaluate the relevance of documents for reranking tasks.
            """)
561
+
562
# Pick which task file the Demo tab should display.
def get_latest_task_file():
    """Return the path of the most recently modified uploaded task file,
    or the bundled example task when nothing has been uploaded yet."""
    os.makedirs("uploaded_tasks", exist_ok=True)
    candidates = [
        os.path.join("uploaded_tasks", name)
        for name in os.listdir("uploaded_tasks")
        if name.endswith(".json")
    ]
    if candidates:
        # Most recent upload wins.
        return max(candidates, key=os.path.getmtime)
    # Fall back to the default example shipped with the app.
    return "AskUbuntuDupQuestions_human_eval.json"
575
+
576
# Load the task file chosen above and build the evaluation interface.
task_file = get_latest_task_file()

try:
    with open(task_file, "r") as f:
        task_data = json.load(f)

    # Show which task is currently loaded
    gr.Markdown(f"**Current Task: {task_data['task_name']}** ({len(task_data['samples'])} samples)")

    # Display the interface
    reranking_demo = create_reranking_interface(task_data)
except Exception as e:
    # Broad catch so a bad/missing task file degrades to an error banner
    # instead of crashing the whole app at startup.
    gr.Markdown(f"**Error loading task: {str(e)}**")
    gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
591
 
592
# Tab 2: upload a new task file and manage results.
with gr.TabItem("Upload & Evaluate"):
    gr.Markdown("""
    ## Upload Your Own Task File

    If you have a prepared task file, you can upload it here to create an evaluation interface.
    """)

    with gr.Row():
        # Left column: upload controls and file listings.
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload a task file (JSON)")
            load_btn = gr.Button("Load Task")
            message = gr.Textbox(label="Status", interactive=False)

            # Add task list for previously uploaded tasks
            gr.Markdown("### Previous Uploads")
607
+
608
# Function to list existing task files in the tasks directory
def list_task_files():
    """Return a markdown bullet list of uploaded task files.

    The previous version emitted `javascript:selectTask('...')` links, but
    no `selectTask` function is defined anywhere in this app and Gradio's
    markdown rendering does not execute javascript: URLs, so those links
    were dead.  Plain bullets are used instead.
    """
    # Ensure the directory exists so listdir can't fail on first run.
    os.makedirs("uploaded_tasks", exist_ok=True)
    tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
    if not tasks:
        return "No task files uploaded yet."
    return "\n".join(f"- {t}" for t in tasks)
615
+
616
# Markdown panel showing the uploaded-task list, plus a manual refresh.
task_list = gr.Markdown(list_task_files())
refresh_btn = gr.Button("Refresh List")

# Add results management section
gr.Markdown("### Results Management")
621
+
622
# Function to list existing result files
def list_result_files():
    """Return a markdown summary of every *_human_results.json file in the
    working directory, annotated with per-file stats when readable."""
    results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
    if not results:
        return "No result files available yet."

    result_links = []
    for r in results:
        # Calculate completion stats
        try:
            with open(r, "r") as f:
                result_data = json.load(f)
            annotation_count = len(result_data.get("annotations", []))
            task_name = result_data.get("task_name", "Unknown")
            result_links.append(f"- {r} ({annotation_count} annotations for {task_name})")
        except (OSError, json.JSONDecodeError):
            # Unreadable or corrupt file: still list it, just without stats.
            # (Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit and hid real bugs.)
            result_links.append(f"- {r}")

    return "\n".join(result_links)
641
+
642
# Markdown panel showing saved result files, plus a download trigger.
results_list = gr.Markdown(list_result_files())
download_results_btn = gr.Button("Download Results")

# Right side - will contain the actual interface
with gr.Column(scale=2):
    task_container = gr.HTML()
648
+
649
# Handle file upload and storage
def handle_upload(file):
    """Validate an uploaded task JSON, persist it under uploaded_tasks/,
    and report back to the UI.

    Returns a (status message, task-list markdown, task-container HTML)
    triple matching the outputs wired to load_btn.click.  On error paths
    the current task_list/task_container values are returned unchanged.
    """
    if not file:
        return "Please upload a task file", task_list.value, task_container.value

    try:
        # Create directory if it doesn't exist
        os.makedirs("uploaded_tasks", exist_ok=True)

        # Read the uploaded file (Gradio supplies a temp-file path in .name)
        with open(file.name, "r") as f:
            task_data = json.load(f)

        # Validate task format
        if "task_name" not in task_data or "samples" not in task_data:
            return "Invalid task file format. Must contain 'task_name' and 'samples' fields.", task_list.value, task_container.value

        # Save to a consistent location so get_latest_task_file() finds it
        task_filename = f"uploaded_tasks/{task_data['task_name']}_task.json"
        with open(task_filename, "w") as f:
            json.dump(task_data, f, indent=2)

        # Instead of trying to create the interface here,
        # we'll return a message with instructions
        return f"Task '{task_data['task_name']}' uploaded successfully with {len(task_data['samples'])} samples. Please refresh the app and use the Demo tab to evaluate it.", list_task_files(), f"""
        <div style="padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
            <h3>Task uploaded successfully!</h3>
            <p>Task Name: {task_data['task_name']}</p>
            <p>Samples: {len(task_data['samples'])}</p>
            <p>To evaluate this task:</p>
            <ol>
                <li>Refresh the app</li>
                <li>The Demo tab will now use your uploaded task</li>
                <li>Complete your evaluations</li>
                <li>Results will be saved as {task_data['task_name']}_human_results.json</li>
            </ol>
        </div>
        """
    except Exception as e:
        # Broad catch: any parse/IO failure becomes a status message.
        return f"Error processing task file: {str(e)}", task_list.value, task_container.value
689
+
690
# Bundle result files for the download button on the Upload tab.
def prepare_results_for_download():
    """Zip every *_human_results.json in the CWD into one archive.

    Returns the archive path, or None when there is nothing to bundle.
    """
    import zipfile

    result_names = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
    if not result_names:
        return None

    zip_path = "mteb_human_eval_results.zip"
    with zipfile.ZipFile(zip_path, 'w') as archive:
        for name in result_names:
            archive.write(name)
    return zip_path
704
+
705
# Connect events
load_btn.click(handle_upload, inputs=[file_input], outputs=[message, task_list, task_container])
refresh_btn.click(list_task_files, outputs=[task_list])
# NOTE(review): the gr.File here is constructed inside the event wiring
# rather than in the layout — presumably intended as a hidden download
# target; confirm it actually renders in this Gradio version.
download_results_btn.click(prepare_results_for_download, outputs=[gr.File(label="Download Results")])
709
 
710
# Tab 3: inspect and download collected evaluation results.
with gr.TabItem("Results Management"):
    gr.Markdown("""
    ## Manage Evaluation Results

    View, download, and analyze your evaluation results.
    """)
716
+
717
# Function to load and display result stats
def get_result_stats():
    """Build a markdown summary (one section per results file) covering
    annotation counts and, where the task file is available, completion."""
    result_files = [name for name in os.listdir(".") if name.endswith("_human_results.json")]
    if not result_files:
        return "No result files available yet."

    sections = []
    for r in result_files:
        try:
            with open(r, "r") as fh:
                result_data = json.load(fh)

            task_name = result_data.get("task_name", "Unknown")
            annotations = result_data.get("annotations", [])
            annotation_count = len(annotations)

            # Distinct samples annotated so far.
            sample_ids = {a.get("sample_id") for a in annotations}

            # If the originating task file is present, report x/y completion;
            # otherwise just report how many distinct samples were annotated.
            total_samples = 0
            task_file = f"uploaded_tasks/{task_name}_task.json"
            if os.path.exists(task_file):
                with open(task_file, "r") as fh:
                    task_data = json.load(fh)
                    total_samples = len(task_data.get("samples", []))

            if total_samples:
                completion = f"{len(sample_ids)}/{total_samples}"
            else:
                completion = f"{len(sample_ids)} samples"

            sections.append(f"### {task_name}\n- Annotations: {annotation_count}\n- Completion: {completion}\n- File: {r}")
        except Exception as e:
            # Keep the broken file visible with the error inline.
            sections.append(f"### {r}\n- Error loading results: {str(e)}")

    return "\n\n".join(sections)
751
+
752
# Stats panel plus a manual refresh button.
result_stats = gr.Markdown(get_result_stats())
refresh_results_btn = gr.Button("Refresh Results")

# Add download options
with gr.Row():
    download_all_btn = gr.Button("Download All Results (ZIP)")
    # Dropdown is populated once at build time; refresh re-populates it below.
    result_select = gr.Dropdown(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")], label="Select Result to Download")
    download_selected_btn = gr.Button("Download Selected")

# Add results visualization placeholder
gr.Markdown("### Results Visualization")
gr.Markdown("*Visualization features will be added in a future update.*")

# Connect events
refresh_results_btn.click(get_result_stats, outputs=[result_stats])
767
+
768
# Function to prepare all results for download as ZIP
def prepare_all_results():
    """Zip every *_human_results.json in the CWD and return the archive path.

    Unlike prepare_results_for_download, this always produces an archive,
    even when it ends up empty.
    """
    import zipfile

    zip_path = "mteb_human_eval_results.zip"
    result_names = (f for f in os.listdir(".") if f.endswith("_human_results.json"))
    with zipfile.ZipFile(zip_path, 'w') as archive:
        for name in result_names:
            archive.write(name)
    return zip_path
776
+
777
# Function to return a single result file
def get_selected_result(filename):
    """Return *filename* when it names an existing file, otherwise None.

    Used as the handler for the per-file download button.
    """
    if filename and os.path.exists(filename):
        return filename
    return None
784
+
785
# Update dropdown when refreshing results
def update_result_dropdown():
    """Re-scan the CWD and refresh the result-file dropdown choices.

    Uses gr.update(...) instead of gr.Dropdown.update(...): the per-component
    update classmethod was removed in Gradio 4.x, while gr.update works on
    both 3.x and 4.x.
    """
    choices = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
    return gr.update(choices=choices)
788
+
789
# Refresh also re-populates the download dropdown with current files.
refresh_results_btn.click(update_result_dropdown, outputs=[result_select])
download_all_btn.click(prepare_all_results, outputs=[gr.File(label="Download All Results")])
download_selected_btn.click(get_selected_result, inputs=[result_select], outputs=[gr.File(label="Download Selected Result")])

# Script entry point: launch the Gradio app.
if __name__ == "__main__":
    demo.launch()