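"""Gradio app for the Polish Legal RAG Leaderboard.

Loads model scores from leaderboard.json into a filterable, color-styled
table, and shows per-model failure cases from failure_cases.json.
"""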
import json
from pathlib import Path
import pandas as pd
import numpy as np
import gradio as gr
from typing import Dict, List
import re
DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve())
DEFAULT_FAILURE_CASES_JSON = str((Path(__file__).parent / "failure_cases.json").resolve())
# Predefined parameter bins for filtering (in billions)
PARAM_BIN_CHOICES: list[str] = [
"<10B",
"10B-25B",
"25B-50B",
"50B-100B",
"100B+",
]
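# Expected leaderboard.json shape (illustrative sketch; the metric columns are
# whatever tasks were evaluated, e.g. src_clf / sum_rag / sum_rag_v2):
# [
#   {"Model": "...", "Provider": "...", "Parameters": 8000000000,
#    "src_clf": 0.91, "sum_rag": 0.78, "sum_rag_v2": 0.64},
#   ...
# ]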
def load_leaderboard_json(json_path: str) -> pd.DataFrame:
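    """Read leaderboard records from JSON into a DataFrame; empty DataFrame on any error."""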
path = Path(json_path)
if not path.exists() or not path.is_file():
return pd.DataFrame()
try:
with open(path, "r", encoding="utf-8") as f:
records = json.load(f)
# records should be a list of dicts; fallback if dict
if isinstance(records, dict):
# If wrapped, try to unwrap common keys
for key in ["data", "records", "items", "leaderboard"]:
if key in records and isinstance(records[key], list):
records = records[key]
break
if not isinstance(records, list):
return pd.DataFrame()
return pd.DataFrame.from_records(records)
except Exception:
return pd.DataFrame()
def _hex_from_rgb(r: float, g: float, b: float) -> str:
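    """Clamp RGB components to [0, 255] and format them as a #rrggbb hex string."""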
r = max(0, min(255, int(round(r))))
g = max(0, min(255, int(round(g))))
b = max(0, min(255, int(round(b))))
return f"#{r:02x}{g:02x}{b:02x}"
def _bg_color_from_t(t: float) -> str:
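    """Map t in [0, 1] to a CSS background color from green (t=0) to red (t=1)."""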
t = max(0.0, min(1.0, float(t)))
# Green (small) -> Red (big)
g_start = (34, 197, 94) # #22c55e
r_end = (239, 68, 68) # #ef4444
r = g_start[0] + t * (r_end[0] - g_start[0])
g = g_start[1] + t * (r_end[1] - g_start[1])
b = g_start[2] + t * (r_end[2] - g_start[2])
return f"background-color: {_hex_from_rgb(r, g, b)}"
def _style_parameters(series: pd.Series) -> list[str]:
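    """Color the Parameters column on a log10 scale; missing/non-positive cells get no style."""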
s = pd.to_numeric(series, errors="coerce")
s_pos = s[s > 0]
if s_pos.empty:
return [""] * len(series)
logs = np.log10(s_pos)
lmin = float(np.nanmin(logs))
lmax = float(np.nanmax(logs))
if not np.isfinite(lmin) or not np.isfinite(lmax):
return [""] * len(series)
colors: list[str] = []
for v in s:
if pd.isna(v) or v <= 0:
colors.append("")
else:
lv = np.log10(v)
if lmax == lmin:
t = 0.0
else:
t = (lv - lmin) / (lmax - lmin)
colors.append(_bg_color_from_t(float(t)))
return colors
def _format_value_minimal(v) -> str:
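    """Render a cell compactly: near-integer floats as ints, other floats without trailing zeros."""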
if pd.isna(v):
return ""
if isinstance(v, str):
return v
if isinstance(v, (int, np.integer)):
return str(int(v))
if isinstance(v, (float, np.floating)):
if abs(v - round(v)) < 1e-9:
return str(int(round(v)))
s = f"{float(v):.6f}".rstrip("0").rstrip(".")
return s
def _prepare_dataframe(json_path: str) -> pd.DataFrame:
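    """Load and clean the leaderboard: drop excluded columns, compute avg_score, sort and reorder."""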
df = load_leaderboard_json(json_path)
if df.empty:
return df
# Remove columns not to be displayed per schema (Quantization, any *_time or time)
columns_to_exclude = [
c for c in df.columns
if c.lower() == "quantization" or c.lower().endswith("_time") or c.lower() == "time"
]
df = df.drop(columns=columns_to_exclude, errors="ignore")
# Normalize types
if "Parameters" in df.columns:
df["Parameters"] = pd.to_numeric(df["Parameters"], errors="coerce")
if "src_clf" in df.columns:
df["src_clf"] = pd.to_numeric(df["src_clf"], errors="coerce")
# Compute avg_score across numeric metric columns (exclude meta)
meta_cols = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
metric_candidates = [c for c in df.columns if c not in meta_cols]
if metric_candidates:
numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in metric_candidates})
df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)
# Sort by avg_score descending by default if present
if "avg_score" in df.columns:
df = df.sort_values(by="avg_score", ascending=False, na_position="last")
# Preferred column order
preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
remaining_cols = [c for c in df.columns if c not in preferred_order]
# Ensure avg_score is first among metric columns
if "avg_score" in remaining_cols:
remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
if preferred_order:
df = df[preferred_order + remaining_cols]
# Insert a visual separator column after Parameters to split meta from scores
if "Parameters" in df.columns:
sep_col_name = "—"
insert_at = df.columns.get_loc("Parameters") + 1
df.insert(insert_at, sep_col_name, "")
return df
def _param_bins_mask(param_series: pd.Series, selected_bins: list[str] | None) -> pd.Series:
"""Build a boolean mask for selected parameter bins.
    Bins are in billions: <10B, 10B-25B, 25B-50B, 50B-100B, 100B+.
Automatically converts raw counts to billions if values look large.
"""
if not selected_bins:
return pd.Series(True, index=param_series.index)
# Ensure numeric
s = pd.to_numeric(param_series, errors="coerce")
# Heuristic: if median is large, assume raw parameter counts and convert to billions
median_val = s.dropna().median()
if pd.notna(median_val) and median_val > 1e6:
s_b = s / 1e9
else:
s_b = s
bin_map: dict[str, tuple[float, float | None]] = {
"<10B": (0.0, 10.0),
"10B-25B": (10.0, 25.0),
"25B-50B": (25.0, 50.0),
"50B-100B": (50.0, 100.0),
"100B+": (100.0, None),
}
mask = pd.Series(False, index=s_b.index)
for label in selected_bins:
if label not in bin_map:
continue
low, high = bin_map[label]
if high is None:
mask |= s_b >= low
else:
mask |= (s_b >= low) & (s_b < high)
# Drop NaNs from consideration
mask &= s_b.notna()
return mask
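# Example (values traced through the logic above): raw counts are converted
# to billions by the median heuristic, so
#   _param_bins_mask(pd.Series([7e9, 40e9]), ["<10B"]) -> [True, False]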
def _apply_filters(df: pd.DataFrame, name_filter: str | None, param_bins: list[str] | None) -> pd.DataFrame:
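    """Filter rows by a case-insensitive Model substring and/or parameter-size bins."""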
if df.empty:
return df
mask = pd.Series(True, index=df.index)
# Name filter (case-insensitive substring match on Model)
if name_filter:
col = "Model" if "Model" in df.columns else None
if col is not None:
name_mask = df[col].astype(str).str.contains(name_filter, case=False, na=False)
mask &= name_mask
# Parameter bins filter
if param_bins and "Parameters" in df.columns:
bins_mask = _param_bins_mask(df["Parameters"], param_bins)
mask &= bins_mask
return df[mask]
def build_view(json_path: str, name_filter: str = "", param_bins: list[str] | None = None) -> object:
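    """Build the styled leaderboard table for the basic view (no task exclusion)."""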
df = _prepare_dataframe(json_path)
    # Keep only rows that have all required task scores (skip columns that are absent)
    required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in df.columns]
    if required_cols:
        df = df.dropna(subset=required_cols)
# Apply filters if provided
df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)
# Produce a styled DataFrame (log-scale colors on Parameters, minimal decimals formatting)
if isinstance(df, pd.DataFrame) and not df.empty:
styler = df.style
if "Parameters" in df.columns:
styler = styler.apply(_style_parameters, subset=["Parameters"]) # type: ignore
styler = styler.format(_format_value_minimal)
table_value: object = styler
else:
# Empty DataFrame fallback
table_value = pd.DataFrame()
return table_value
def build_view_only(
json_path: str,
name_filter: str = "",
param_bins: list[str] | None = None,
excluded_tasks: list[str] | None = None,
):
"""Return only the table without updating the exclude-tasks control.
This prevents infinite loops when called from change handlers.
"""
df = _prepare_dataframe(json_path)
# Determine all task-like columns (before exclusion)
meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
tasks_all = [c for c in df.columns if c not in meta_cols_base]
excluded_set = set(excluded_tasks or [])
# Keep only tasks that actually exist
excluded_valid = [t for t in excluded_set if t in tasks_all]
included_tasks = [c for c in tasks_all if c not in excluded_set]
# Drop rows that are missing values for required tasks (only those that are included)
required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in included_tasks]
if required_cols:
df = df.dropna(subset=required_cols, axis=0)
# Apply filters
df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)
# Remove excluded task columns from view
if excluded_valid:
df = df.drop(columns=[c for c in excluded_valid if c in df.columns], errors="ignore")
# Recompute avg_score from only included tasks
# Determine tasks present in df after exclusion
meta_cols_after = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
current_metric_cols = [c for c in df.columns if c not in meta_cols_after]
# Drop existing avg_score before recomputation
if "avg_score" in df.columns:
df = df.drop(columns=["avg_score"]) # will be re-added below
if current_metric_cols:
numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in current_metric_cols})
df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)
else:
# No metrics left; fill avg_score with NaN to keep schema consistent
df["avg_score"] = np.nan
# Sort and reorder columns similar to _prepare_dataframe
if "avg_score" in df.columns:
df = df.sort_values(by="avg_score", ascending=False, na_position="last")
preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
remaining_cols = [c for c in df.columns if c not in preferred_order]
if "avg_score" in remaining_cols:
remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
if preferred_order:
df = df[preferred_order + remaining_cols]
# Ensure separator column exists right after Parameters
if "Parameters" in df.columns and "—" not in df.columns:
insert_at = df.columns.get_loc("Parameters") + 1
df.insert(insert_at, "—", "")
# Style for display
if isinstance(df, pd.DataFrame) and not df.empty:
styler = df.style
if "Parameters" in df.columns:
styler = styler.apply(_style_parameters, subset=["Parameters"]) # type: ignore
styler = styler.format(_format_value_minimal)
table_value: object = styler
else:
table_value = pd.DataFrame()
return table_value
def initialize_tasks_choices(json_path: str):
"""Initialize the task choices for the exclude tasks checkbox.
This is separate from the table building to avoid infinite loops.
"""
df = _prepare_dataframe(json_path)
# Determine all task-like columns
meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
tasks_all = [c for c in df.columns if c not in meta_cols_base]
# Return update for the exclude tasks checkbox with just the choices, no value change
tasks_update = gr.update(choices=tasks_all)
return tasks_update
def build_view_and_tasks(
json_path: str,
name_filter: str = "",
param_bins: list[str] | None = None,
excluded_tasks: list[str] | None = None,
):
"""Return the table and an update object for the exclude-tasks control.
Used only for initial loading to set up the choices.
"""
table_value = build_view_only(json_path, name_filter, param_bins, excluded_tasks)
tasks_update = initialize_tasks_choices(json_path)
return table_value, tasks_update
# ---------------------- Failure cases handling ----------------------
def load_failure_cases_json(json_path: str) -> Dict[str, List[Dict[str, str]]]:
"""Load failure cases from JSON file.
Returns dict mapping model_id -> list of failure cases.
"""
path = Path(json_path)
if not path.exists() or not path.is_file():
return {}
try:
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)
if isinstance(data, dict):
return data
return {}
except Exception:
return {}
def get_available_models(failure_cases_data: Dict[str, List[Dict[str, str]]]) -> List[str]:
"""Get list of available models from failure cases data."""
return sorted(failure_cases_data.keys()) if failure_cases_data else []
def render_failure_cases(
json_path: str,
selected_model: str
) -> str:
"""Render failure cases for selected model as JSON string."""
if not selected_model:
return "{}"
failure_cases_data = load_failure_cases_json(json_path)
if selected_model not in failure_cases_data:
return "{}"
cases = failure_cases_data[selected_model]
if not cases:
return "[]"
    # Pull a numeric judge score out of the free-text reasoning, when present
    for case in cases:
        score = re.search(r"(\d+\.\d+)", case.get("reasoning", ""))
        if score:
            case["score"] = float(score.group(1))
# Return formatted JSON string
return json.dumps(cases, ensure_ascii=False, indent=2)
def initialize_failure_cases_dropdown(json_path: str):
"""Initialize the model dropdown for failure cases."""
failure_cases_data = load_failure_cases_json(json_path)
models = get_available_models(failure_cases_data)
    if models:
        return gr.update(choices=models, value=models[0])
    return gr.update(choices=[], value=None)
def ui() -> gr.Blocks:
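    """Assemble the Gradio Blocks UI: a leaderboard tab with filters plus a failure-cases tab."""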
with gr.Blocks(title="Model Leaderboard") as demo:
gr.Markdown("""
### Polish Legal RAG Leaderboard
Explore and compare model performance on Polish legal QA tasks.
""")
# Fixed internal state for the JSON paths
json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)
failure_cases_path_state = gr.State(value=DEFAULT_FAILURE_CASES_JSON)
with gr.Tabs():
with gr.Tab("Leaderboard"):
gr.Markdown("""
- Use filters to narrow by name and parameter bins.
- Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly.
- Click column headers to sort; data updates automatically as filters change.
""")
# Filters
with gr.Row():
name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1)
param_bins_in = gr.CheckboxGroup(
label="Parameter bins",
choices=PARAM_BIN_CHOICES,
value=[],
info="Select one or more bins"
)
excluded_tasks_in = gr.CheckboxGroup(
label="Exclude tasks",
choices=[],
value=[],
info="Select tasks to hide; all are shown by default",
)
# Non-interactive so Pandas Styler is respected; header sorting remains available
leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False)
demo.load(
fn=build_view_and_tasks,
inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
outputs=[leaderboard_out, excluded_tasks_in],
)
# Recompute table on filter changes
name_filter_in.change(
fn=build_view_only,
inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
outputs=[leaderboard_out],
)
param_bins_in.change(
fn=build_view_only,
inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
outputs=[leaderboard_out],
)
excluded_tasks_in.change(
fn=build_view_only,
inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
outputs=[leaderboard_out],
)
gr.Markdown("""
### Methodology
- **`src_clf`**: Source classification of a fragment.
- **`sum_rag`**: RAG-style QA strictly from the provided passages. Answers are graded by a gpt-4o judge on a 0-2 scale; we report the F1 score.
- **`sum_rag_v2`**: Advanced legal reasoning dataset with multiple question types:
- **Contradiction resolution**: Questions about resolving contradictions or ambiguities within legal texts, requiring analysis of conflicting rules or statements
- **Legal inference**: Questions testing whether hypothetical situations meet specific legal criteria, requiring identification of legal prerequisites and exceptions
""")
gr.Markdown("""
### Notes
- GPT-5-nano sometimes fails to answer, responding with an empty string.
- GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall.
- The Llama-3-8B-Instruct family has a limited context window (Llama 3: 8k, Llama 3.1: 16k), so when the passages are too long the model cannot answer and is given a score of 0.
- The Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0. It was not trained for the `src_clf` task.
""")
gr.Markdown("""
### Language and RAG prompt
- All tasks, passages and questions are in Polish. The models are instructed to answer in Polish.
```text
Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości.
Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko.
<relevant_info>
{passages}
</relevant_info>
Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści.
Jeżeli odpowiedź nie jest zawarta w <relevant_info>, odpowiedz że nie ma odpowiedzi w źródłach.
To jest kluczowe, że odpowiedź musi być oparta wyłącznie na <relevant_info>.
```
""")
with gr.Tab("Failure Cases"):
gr.Markdown("""
### Failure Cases Analysis
Explore failure cases by model to understand where models struggle.
""")
with gr.Row():
model_dropdown = gr.Dropdown(
label="Select Model",
choices=[],
value=None,
info="Choose a model to view its failure cases"
)
failure_cases_out = gr.Code(
label="Failure Cases",
language="json",
interactive=False,
lines=15
)
# Initialize dropdown and load data
demo.load(
fn=initialize_failure_cases_dropdown,
inputs=[failure_cases_path_state],
outputs=[model_dropdown],
)
# Update failure cases when model selection changes
model_dropdown.change(
fn=render_failure_cases,
inputs=[failure_cases_path_state, model_dropdown],
outputs=[failure_cases_out],
)
return demo
if __name__ == "__main__":
app = ui()
app.queue().launch(server_name="0.0.0.0", server_port=7860, show_api=False) |