wwydmanski committed
Commit e263dc5 · verified · 1 Parent(s): b07df7a

Update app.py

Files changed (1):
  1. app.py +223 -84
app.py CHANGED
@@ -4,8 +4,12 @@ from pathlib import Path
 import pandas as pd
 import numpy as np
 import gradio as gr
+from typing import Dict, List
+
+import re
 
 DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve())
+DEFAULT_FAILURE_CASES_JSON = str((Path(__file__).parent / "failure_cases.json").resolve())
 
 # Predefined parameter bins for filtering (in billions)
 PARAM_BIN_CHOICES: list[str] = [
@@ -228,17 +232,15 @@ def build_view(json_path: str, name_filter: str = "", param_bins: list[str] | No
     return table_value
 
 
-def build_view_and_tasks(
+def build_view_only(
     json_path: str,
     name_filter: str = "",
     param_bins: list[str] | None = None,
     excluded_tasks: list[str] | None = None,
 ):
-    """Return the table and an update object for the exclude-tasks control.
-
-    - The available task choices are derived from the columns of the prepared dataframe
-      by excluding meta columns and helper columns.
-    - The table excludes the selected tasks and recomputes avg_score from only the included tasks.
+    """Return only the table without updating the exclude-tasks control.
+
+    This prevents infinite loops when called from change handlers.
     """
     df = _prepare_dataframe(json_path)
 
@@ -305,97 +307,234 @@ def build_view_and_tasks(
     else:
         table_value = pd.DataFrame()
 
-    # Update object for the exclude tasks checkbox
-    tasks_update = gr.update(choices=tasks_all, value=excluded_valid)
+    return table_value
+
 
+def initialize_tasks_choices(json_path: str):
+    """Initialize the task choices for the exclude tasks checkbox.
+
+    This is separate from the table building to avoid infinite loops.
+    """
+    df = _prepare_dataframe(json_path)
+
+    # Determine all task-like columns
+    meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
+    tasks_all = [c for c in df.columns if c not in meta_cols_base]
+
+    # Return update for the exclude tasks checkbox with just the choices, no value change
+    tasks_update = gr.update(choices=tasks_all)
+
+    return tasks_update
+
+
+def build_view_and_tasks(
+    json_path: str,
+    name_filter: str = "",
+    param_bins: list[str] | None = None,
+    excluded_tasks: list[str] | None = None,
+):
+    """Return the table and an update object for the exclude-tasks control.
+
+    Used only for initial loading to set up the choices.
+    """
+    table_value = build_view_only(json_path, name_filter, param_bins, excluded_tasks)
+    tasks_update = initialize_tasks_choices(json_path)
+
     return table_value, tasks_update
 
 
+# ---------------------- Failure cases handling ----------------------
+
+def load_failure_cases_json(json_path: str) -> Dict[str, List[Dict[str, str]]]:
+    """Load failure cases from JSON file.
+
+    Returns dict mapping model_id -> list of failure cases.
+    """
+    path = Path(json_path)
+    if not path.exists() or not path.is_file():
+        return {}
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        if isinstance(data, dict):
+            return data
+        return {}
+    except Exception:
+        return {}
+
+
+def get_available_models(failure_cases_data: Dict[str, List[Dict[str, str]]]) -> List[str]:
+    """Get list of available models from failure cases data."""
+    return sorted(failure_cases_data.keys()) if failure_cases_data else []
+
+
+def render_failure_cases(
+    json_path: str,
+    selected_model: str
+) -> str:
+    """Render failure cases for selected model as JSON string."""
+    if not selected_model:
+        return "{}"
+
+    failure_cases_data = load_failure_cases_json(json_path)
+
+    if selected_model not in failure_cases_data:
+        return "{}"
+
+    cases = failure_cases_data[selected_model]
+    if not cases:
+        return "[]"
+
+    for case in cases:
+        score = re.search(r"(\d+\.\d+)", case["reasoning"])
+        if score:
+            case["score"] = float(score.group(1))
+
+    # Return formatted JSON string
+    return json.dumps(cases, ensure_ascii=False, indent=2)
+
+
+def initialize_failure_cases_dropdown(json_path: str):
+    """Initialize the model dropdown for failure cases."""
+    failure_cases_data = load_failure_cases_json(json_path)
+    models = get_available_models(failure_cases_data)
+
+    if models:
+        return gr.update(choices=models, value=models[0] if models else None)
+    else:
+        return gr.update(choices=[], value=None)
+
+
 def ui() -> gr.Blocks:
     with gr.Blocks(title="Model Leaderboard") as demo:
         gr.Markdown("""
         ### Polish Legal RAG Leaderboard
 
         Explore and compare model performance on Polish legal QA tasks.
-        - Use filters to narrow by name and parameter bins.
-        - Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly.
-        - Click column headers to sort; data updates automatically as filters change.
         """)
 
-        # Fixed internal state for the JSON path; users cannot change this
+        # Fixed internal state for the JSON paths
         json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)
-
-        # Filters
-        with gr.Row():
-            name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1)
-            param_bins_in = gr.CheckboxGroup(
-                label="Parameter bins",
-                choices=PARAM_BIN_CHOICES,
-                value=[],
-                info="Select one or more bins"
-            )
-            excluded_tasks_in = gr.CheckboxGroup(
-                label="Exclude tasks",
-                choices=[],
-                value=[],
-                info="Select tasks to hide; all are shown by default",
-            )
-
-        # Non-interactive so Pandas Styler is respected; header sorting remains available
-        leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False)
-
-        demo.load(
-            fn=build_view_and_tasks,
-            inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
-            outputs=[leaderboard_out, excluded_tasks_in],
-        )
-
-        # Recompute table on filter changes
-        name_filter_in.change(
-            fn=build_view_and_tasks,
-            inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
-            outputs=[leaderboard_out, excluded_tasks_in],
-        )
-        param_bins_in.change(
-            fn=build_view_and_tasks,
-            inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
-            outputs=[leaderboard_out, excluded_tasks_in],
-        )
-        excluded_tasks_in.change(
-            fn=build_view_and_tasks,
-            inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
-            outputs=[leaderboard_out, excluded_tasks_in],
-        )
-
-        gr.Markdown("""
-        ### Methodology
-        - **`src_clf`**: Source classification of a fragment.
-        - **`sum_rag`**: RAG-style QA strictly from provided passages. Answers are graded by a judge gpt-4o model on a 0-2 scale; we report F1 score.
-        - **`sum_rag_v2`**: Like `sum_rag` but harder - with longer, augmented contexts and strict deranged negatives built. Same generation and 0-2 judging; we report F1 score.
-        """)
-        gr.Markdown("""
-        ### Notes
-        - GPT-5-nano sometimes fails to answer, responding with an empty string.
-        - GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall.
-        - Llama-3-8B-Instruct family has limited context length (3 - 8k, 3.1 - 16k), so if the passages are too long, the model will not be able to answer (and will thus be given score 0).
-        - Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0. It wasn't trained for `src_clf` task.
-        """)
-        gr.Markdown("""
-        ### Language and RAG prompt
-        - All tasks, passages and questions are in Polish. The models are instructed to answer in Polish.
-
-        ```text
-        Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości.
-        Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko.
-        <relevant_info>
-        {passages}
-        </relevant_info>
-
-        Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści.
-        Jeżeli odpowiedź nie jest zawarta w <relevant_info>, odpowiedz że nie ma odpowiedzi w źródłach.
-        To jest kluczowe, że odpowiedź musi być oparta wyłącznie na <relevant_info>.
-        ```
-        """)
+        failure_cases_path_state = gr.State(value=DEFAULT_FAILURE_CASES_JSON)
+
+        with gr.Tabs():
+            with gr.Tab("Leaderboard"):
+                gr.Markdown("""
+                - Use filters to narrow by name and parameter bins.
+                - Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly.
+                - Click column headers to sort; data updates automatically as filters change.
+                """)
+
+                # Filters
+                with gr.Row():
+                    name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1)
+                    param_bins_in = gr.CheckboxGroup(
+                        label="Parameter bins",
+                        choices=PARAM_BIN_CHOICES,
+                        value=[],
+                        info="Select one or more bins"
+                    )
+                    excluded_tasks_in = gr.CheckboxGroup(
+                        label="Exclude tasks",
+                        choices=[],
+                        value=[],
+                        info="Select tasks to hide; all are shown by default",
+                    )
+
+                # Non-interactive so Pandas Styler is respected; header sorting remains available
+                leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False)
+
+                demo.load(
+                    fn=build_view_and_tasks,
+                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
+                    outputs=[leaderboard_out, excluded_tasks_in],
+                )
+
+                # Recompute table on filter changes
+                name_filter_in.change(
+                    fn=build_view_only,
+                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
+                    outputs=[leaderboard_out],
+                )
+                param_bins_in.change(
+                    fn=build_view_only,
+                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
+                    outputs=[leaderboard_out],
+                )
+                excluded_tasks_in.change(
+                    fn=build_view_only,
+                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
+                    outputs=[leaderboard_out],
+                )
+
+                gr.Markdown("""
+                ### Methodology
+                - **`src_clf`**: Source classification of a fragment.
+                - **`sum_rag`**: RAG-style QA strictly from provided passages. Answers are graded by a judge gpt-4o model on a 0-2 scale; we report F1 score.
+                - **`sum_rag_v2`**: Like `sum_rag` but harder - with longer, augmented contexts and strict deranged negatives built. Same generation and 0-2 judging; we report F1 score.
+                - **`rag_v2`**: Advanced legal reasoning dataset with multiple question types:
+                  - **Contradiction resolution**: Questions about resolving contradictions or ambiguities within legal texts, requiring analysis of conflicting rules or statements
+                  - **Legal inference**: Questions testing whether hypothetical situations meet specific legal criteria, requiring identification of legal prerequisites and exceptions
+                """)
+                gr.Markdown("""
+                ### Notes
+                - GPT-5-nano sometimes fails to answer, responding with an empty string.
+                - GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall.
+                - Llama-3-8B-Instruct family has limited context length (3 - 8k, 3.1 - 16k), so if the passages are too long, the model will not be able to answer (and will thus be given score 0).
+                - Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0. It wasn't trained for `src_clf` task.
+                """)
+                gr.Markdown("""
+                ### Language and RAG prompt
+                - All tasks, passages and questions are in Polish. The models are instructed to answer in Polish.
+
+                ```text
+                Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości.
+                Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko.
+                <relevant_info>
+                {passages}
+                </relevant_info>
+
+                Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści.
+                Jeżeli odpowiedź nie jest zawarta w <relevant_info>, odpowiedz że nie ma odpowiedzi w źródłach.
+                To jest kluczowe, że odpowiedź musi być oparta wyłącznie na <relevant_info>.
+                ```
+                """)
+
+            with gr.Tab("Failure Cases"):
+                gr.Markdown("""
+                ### Failure Cases Analysis
+
+                Explore failure cases by model to understand where models struggle.
+                """)
+
+                with gr.Row():
+                    model_dropdown = gr.Dropdown(
+                        label="Select Model",
+                        choices=[],
+                        value=None,
+                        info="Choose a model to view its failure cases"
+                    )
+
+                failure_cases_out = gr.Code(
+                    label="Failure Cases",
+                    language="json",
+                    interactive=False,
+                    lines=15
+                )
+
+                # Initialize dropdown and load data
+                demo.load(
+                    fn=initialize_failure_cases_dropdown,
+                    inputs=[failure_cases_path_state],
+                    outputs=[model_dropdown],
+                )
+
+                # Update failure cases when model selection changes
+                model_dropdown.change(
+                    fn=render_failure_cases,
+                    inputs=[failure_cases_path_state, model_dropdown],
+                    outputs=[failure_cases_out],
+                )
 
     return demo
 
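The core refactor splits `build_view_and_tasks` (wired only to `demo.load`) from `build_view_only` (wired to the `change` handlers), because the old handlers wrote a `gr.update(...)` back into `excluded_tasks_in`, which retriggers that component's own `change` event. A minimal sketch of the same pattern outside this app, with illustrative component names not taken from the commit:

```python
import gradio as gr

def table_only(excluded: list[str]) -> str:
    # Recompute the view; deliberately do not write back to the checkbox.
    return f"excluded tasks: {excluded}"

with gr.Blocks() as demo:
    excluded_in = gr.CheckboxGroup(label="Exclude tasks", choices=["task_a", "task_b"], value=[])
    view_out = gr.Textbox(label="View")
    # .change also fires on programmatic updates, so a handler that outputs
    # gr.update(...) back into excluded_in would retrigger itself; outputting
    # only view_out breaks the cycle.
    excluded_in.change(fn=table_only, inputs=[excluded_in], outputs=[view_out])

# demo.launch()  # uncomment to run locally
```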
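The new failure-cases helpers imply that `failure_cases.json` maps a model id to a list of case objects, each with a `reasoning` string that embeds a decimal score. A minimal sketch of that assumed shape and the score extraction; only `reasoning` is actually referenced in the diff, and all other field names and values here are made up for illustration:

```python
import json
import re

# Hypothetical failure_cases.json content, inferred from the loader's
# Dict[str, List[Dict[str, str]]] annotation; fields other than "reasoning"
# are illustrative.
failure_cases = {
    "example-model": [
        {
            "question": "Czy umowa wymaga formy pisemnej?",
            "answer": "Tak, pod rygorem nieważności.",
            "reasoning": "Judge score: 0.50. The answer omits the cited article.",
        }
    ]
}

cases = failure_cases["example-model"]
for case in cases:
    # Same extraction as render_failure_cases: first decimal number in "reasoning"
    match = re.search(r"(\d+\.\d+)", case["reasoning"])
    if match:
        case["score"] = float(match.group(1))  # here: 0.5

print(json.dumps(cases, ensure_ascii=False, indent=2))
```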
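The Leaderboard tab notes that avg_score updates when tasks are excluded. The recomputation itself lives in unchanged parts of app.py, so this is only the presumed behavior: a row-wise mean over the still-included task columns. A toy sketch with made-up scores:

```python
import pandas as pd

# Toy leaderboard; real column handling lives in _prepare_dataframe / build_view.
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "src_clf": [0.80, 0.60],
    "sum_rag": [0.50, 0.70],
    "sum_rag_v2": [0.40, 0.30],
})

excluded_tasks = ["sum_rag_v2"]
task_cols = [c for c in df.columns if c != "Model" and c not in excluded_tasks]

# Assumed: avg_score is recomputed from only the included tasks (row-wise mean)
df["avg_score"] = df[task_cols].mean(axis=1)
print(df[["Model", "avg_score"]])
```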