Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -4,8 +4,12 @@ from pathlib import Path
|
|
4 |
import pandas as pd
|
5 |
import numpy as np
|
6 |
import gradio as gr
|
|
|
|
|
|
|
7 |
|
8 |
DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve())
|
|
|
9 |
|
10 |
# Predefined parameter bins for filtering (in billions)
|
11 |
PARAM_BIN_CHOICES: list[str] = [
|
@@ -228,17 +232,15 @@ def build_view(json_path: str, name_filter: str = "", param_bins: list[str] | No
|
|
228 |
return table_value
|
229 |
|
230 |
|
231 |
-
def
|
232 |
json_path: str,
|
233 |
name_filter: str = "",
|
234 |
param_bins: list[str] | None = None,
|
235 |
excluded_tasks: list[str] | None = None,
|
236 |
):
|
237 |
-
"""Return the table
|
238 |
-
|
239 |
-
|
240 |
-
by excluding meta columns and helper columns.
|
241 |
-
- The table excludes the selected tasks and recomputes avg_score from only the included tasks.
|
242 |
"""
|
243 |
df = _prepare_dataframe(json_path)
|
244 |
|
@@ -305,97 +307,234 @@ def build_view_and_tasks(
|
|
305 |
else:
|
306 |
table_value = pd.DataFrame()
|
307 |
|
308 |
-
|
309 |
-
|
310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
return table_value, tasks_update
|
312 |
|
313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
314 |
def ui() -> gr.Blocks:
|
315 |
with gr.Blocks(title="Model Leaderboard") as demo:
|
316 |
gr.Markdown("""
|
317 |
### Polish Legal RAG Leaderboard
|
318 |
|
319 |
Explore and compare model performance on Polish legal QA tasks.
|
320 |
-
- Use filters to narrow by name and parameter bins.
|
321 |
-
- Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly.
|
322 |
-
- Click column headers to sort; data updates automatically as filters change.
|
323 |
""")
|
324 |
|
325 |
-
# Fixed internal state for the JSON
|
326 |
json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)
|
327 |
-
|
328 |
-
|
329 |
-
with gr.
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
399 |
|
400 |
return demo
|
401 |
|
|
|
4 |
import pandas as pd
|
5 |
import numpy as np
|
6 |
import gradio as gr
|
7 |
+
from typing import Dict, List
|
8 |
+
|
9 |
+
import re
|
10 |
|
11 |
DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve())
|
12 |
+
DEFAULT_FAILURE_CASES_JSON = str((Path(__file__).parent / "failure_cases.json").resolve())
|
13 |
|
14 |
# Predefined parameter bins for filtering (in billions)
|
15 |
PARAM_BIN_CHOICES: list[str] = [
|
|
|
232 |
return table_value
|
233 |
|
234 |
|
235 |
+
def build_view_only(
|
236 |
json_path: str,
|
237 |
name_filter: str = "",
|
238 |
param_bins: list[str] | None = None,
|
239 |
excluded_tasks: list[str] | None = None,
|
240 |
):
|
241 |
+
"""Return only the table without updating the exclude-tasks control.
|
242 |
+
|
243 |
+
This prevents infinite loops when called from change handlers.
|
|
|
|
|
244 |
"""
|
245 |
df = _prepare_dataframe(json_path)
|
246 |
|
|
|
307 |
else:
|
308 |
table_value = pd.DataFrame()
|
309 |
|
310 |
+
return table_value
|
311 |
+
|
312 |
|
313 |
+
def initialize_tasks_choices(json_path: str):
    """Build a Gradio update carrying the task choices for the exclude-tasks checkbox.

    Kept separate from table construction so that refreshing the checkbox
    choices does not re-trigger the table change handlers (avoids infinite
    loops).
    """
    frame = _prepare_dataframe(json_path)

    # Everything that is not a known meta column counts as a task metric.
    known_meta = {"Model", "Provider", "Parameters", "—", "avg_score"}
    task_columns = [col for col in frame.columns if col not in known_meta]

    # Refresh only the choice list; the currently selected values are untouched.
    return gr.update(choices=task_columns)
|
328 |
+
|
329 |
+
|
330 |
+
def build_view_and_tasks(
    json_path: str,
    name_filter: str = "",
    param_bins: list[str] | None = None,
    excluded_tasks: list[str] | None = None,
):
    """Return the filtered leaderboard table and an update for the
    exclude-tasks control.

    Intended for the initial page load only, where the table and the
    checkbox choices must be populated in a single callback.
    """
    return (
        build_view_only(json_path, name_filter, param_bins, excluded_tasks),
        initialize_tasks_choices(json_path),
    )
|
344 |
|
345 |
|
346 |
+
# ---------------------- Failure cases handling ----------------------
|
347 |
+
|
348 |
+
def load_failure_cases_json(json_path: str) -> Dict[str, List[Dict[str, str]]]:
    """Load failure cases from a JSON file.

    Args:
        json_path: Path to the failure-cases JSON file.

    Returns:
        Dict mapping model_id -> list of failure cases. Best effort, never
        raises: an empty dict is returned when the file is missing,
        unreadable, not valid JSON, or not a JSON object at the top level.
    """
    path = Path(json_path)
    # is_file() already implies existence, so one check replaces the old
    # `not exists() or not is_file()` pair.
    if not path.is_file():
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    # Narrowed from a bare `except Exception`: only I/O and parse errors are
    # expected here; anything else would be a real bug worth surfacing.
    except (OSError, ValueError):
        return {}
    # Only a top-level object has the model_id -> cases shape callers expect.
    return data if isinstance(data, dict) else {}
|
364 |
+
|
365 |
+
|
366 |
+
def get_available_models(failure_cases_data: Dict[str, List[Dict[str, str]]]) -> List[str]:
    """Return the model ids present in *failure_cases_data*, sorted alphabetically."""
    if not failure_cases_data:
        # Covers both an empty dict and a None/falsy input.
        return []
    # Iterating the dict yields its keys; .keys() is unnecessary.
    return sorted(failure_cases_data)
|
369 |
+
|
370 |
+
|
371 |
+
def render_failure_cases(
    json_path: str,
    selected_model: str
) -> str:
    """Render the failure cases of *selected_model* as a formatted JSON string.

    Args:
        json_path: Path to the failure-cases JSON file.
        selected_model: Model id chosen in the dropdown (may be empty or None).

    Returns:
        "{}" when no model is selected or the model is unknown, "[]" when the
        model has no recorded cases, otherwise the pretty-printed list of
        cases, each augmented with a numeric "score" field when one can be
        extracted from its "reasoning" text.
    """
    if not selected_model:
        return "{}"

    failure_cases_data = load_failure_cases_json(json_path)

    if selected_model not in failure_cases_data:
        return "{}"

    cases = failure_cases_data[selected_model]
    if not cases:
        return "[]"

    # Best effort: surface the first decimal number (e.g. "0.5") found in the
    # judge's reasoning as a structured "score" field.
    for case in cases:
        # .get() guards against malformed cases missing the "reasoning" key;
        # the original indexed directly and raised KeyError on such data.
        match = re.search(r"(\d+\.\d+)", case.get("reasoning", ""))
        if match:
            case["score"] = float(match.group(1))

    # Return formatted JSON string
    return json.dumps(cases, ensure_ascii=False, indent=2)
|
395 |
+
|
396 |
+
|
397 |
+
def initialize_failure_cases_dropdown(json_path: str):
    """Create the initial state of the failure-cases model dropdown.

    Populates the choices from the failure-cases file and preselects the
    first model; an empty file yields an empty dropdown with no selection.
    """
    data = load_failure_cases_json(json_path)
    models = get_available_models(data)

    # When the model list is empty, choices=[] and value=None fall out
    # naturally, so both branches of the original collapse into one update.
    first = models[0] if models else None
    return gr.update(choices=models, value=first)
|
406 |
+
|
407 |
+
|
408 |
def ui() -> gr.Blocks:
    """Assemble the Gradio app: a leaderboard tab and a failure-cases tab.

    The leaderboard tab wires three filter controls to table rebuilds; the
    failure-cases tab wires a model dropdown to a JSON viewer. Returns the
    un-launched Blocks object.
    """
    with gr.Blocks(title="Model Leaderboard") as demo:
        gr.Markdown("""
### Polish Legal RAG Leaderboard

Explore and compare model performance on Polish legal QA tasks.
""")

        # Fixed internal state for the JSON paths
        json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)
        failure_cases_path_state = gr.State(value=DEFAULT_FAILURE_CASES_JSON)

        with gr.Tabs():
            with gr.Tab("Leaderboard"):
                gr.Markdown("""
- Use filters to narrow by name and parameter bins.
- Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly.
- Click column headers to sort; data updates automatically as filters change.
""")

                # Filters
                with gr.Row():
                    name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1)
                    param_bins_in = gr.CheckboxGroup(
                        label="Parameter bins",
                        choices=PARAM_BIN_CHOICES,
                        value=[],
                        info="Select one or more bins"
                    )
                    excluded_tasks_in = gr.CheckboxGroup(
                        label="Exclude tasks",
                        choices=[],
                        value=[],
                        info="Select tasks to hide; all are shown by default",
                    )

                # Non-interactive so Pandas Styler is respected; header sorting remains available
                leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False)

                # Initial load populates both the table and the exclude-tasks
                # choices in one callback (build_view_and_tasks).
                demo.load(
                    fn=build_view_and_tasks,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out, excluded_tasks_in],
                )

                # Recompute table on filter changes; build_view_only does not
                # touch the checkbox, which prevents change-handler loops.
                name_filter_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
                param_bins_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
                excluded_tasks_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )

                gr.Markdown("""
### Methodology
- **`src_clf`**: Source classification of a fragment.
- **`sum_rag`**: RAG-style QA strictly from provided passages. Answers are graded by a judge gpt-4o model on a 0-2 scale; we report F1 score.
- **`sum_rag_v2`**: Like `sum_rag` but harder - with longer, augmented contexts and strict deranged negatives built. Same generation and 0-2 judging; we report F1 score.
- **`rag_v2`**: Advanced legal reasoning dataset with multiple question types:
  - **Contradiction resolution**: Questions about resolving contradictions or ambiguities within legal texts, requiring analysis of conflicting rules or statements
  - **Legal inference**: Questions testing whether hypothetical situations meet specific legal criteria, requiring identification of legal prerequisites and exceptions
""")
                gr.Markdown("""
### Notes
- GPT-5-nano sometimes fails to answer, responding with an empty string.
- GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall.
- Llama-3-8B-Instruct family has limited context length (3 - 8k, 3.1 - 16k), so if the passages are too long, the model will not be able to answer (and will thus be given score 0).
- Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0. It wasn't trained for `src_clf` task.
""")
                gr.Markdown("""
### Language and RAG prompt
- All tasks, passages and questions are in Polish. The models are instructed to answer in Polish.

```text
Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości.
Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko.
<relevant_info>
{passages}
</relevant_info>

Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści.
Jeżeli odpowiedź nie jest zawarta w <relevant_info>, odpowiedz że nie ma odpowiedzi w źródłach.
To jest kluczowe, że odpowiedź musi być oparta wyłącznie na <relevant_info>.
```
""")

            with gr.Tab("Failure Cases"):
                gr.Markdown("""
### Failure Cases Analysis

Explore failure cases by model to understand where models struggle.
""")

                with gr.Row():
                    model_dropdown = gr.Dropdown(
                        label="Select Model",
                        choices=[],
                        value=None,
                        info="Choose a model to view its failure cases"
                    )

                failure_cases_out = gr.Code(
                    label="Failure Cases",
                    language="json",
                    interactive=False,
                    lines=15
                )

                # Initialize dropdown and load data
                demo.load(
                    fn=initialize_failure_cases_dropdown,
                    inputs=[failure_cases_path_state],
                    outputs=[model_dropdown],
                )

                # Update failure cases when model selection changes
                model_dropdown.change(
                    fn=render_failure_cases,
                    inputs=[failure_cases_path_state, model_dropdown],
                    outputs=[failure_cases_out],
                )

    return demo
|
540 |
|