import json
import re
from pathlib import Path
from typing import Dict, List

import gradio as gr
import numpy as np
import pandas as pd
DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve())
DEFAULT_FAILURE_CASES_JSON = str((Path(__file__).parent / "failure_cases.json").resolve())
# Predefined parameter bins for filtering (in billions)
PARAM_BIN_CHOICES: list[str] = [
"<10B",
"10B-25B",
"25B-50B",
"50B-100B",
"100B+",
]
def load_leaderboard_json(json_path: str) -> pd.DataFrame:
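    """Load leaderboard records from a JSON file into a DataFrame.

    Returns an empty DataFrame if the file is missing, unreadable, or not a
    list of records (optionally wrapped under a common key).
    """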
path = Path(json_path)
if not path.exists() or not path.is_file():
return pd.DataFrame()
try:
with open(path, "r", encoding="utf-8") as f:
records = json.load(f)
# records should be a list of dicts; fallback if dict
if isinstance(records, dict):
# If wrapped, try to unwrap common keys
for key in ["data", "records", "items", "leaderboard"]:
if key in records and isinstance(records[key], list):
records = records[key]
break
if not isinstance(records, list):
return pd.DataFrame()
return pd.DataFrame.from_records(records)
except Exception:
return pd.DataFrame()
def _hex_from_rgb(r: float, g: float, b: float) -> str:
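    """Clamp RGB components to 0-255 and return a #rrggbb hex string."""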
r = max(0, min(255, int(round(r))))
g = max(0, min(255, int(round(g))))
b = max(0, min(255, int(round(b))))
return f"#{r:02x}{g:02x}{b:02x}"
def _bg_color_from_t(t: float) -> str:
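    """Map t in [0, 1] to a CSS background-color on a green-to-red gradient."""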
t = max(0.0, min(1.0, float(t)))
# Green (small) -> Red (big)
g_start = (34, 197, 94) # #22c55e
r_end = (239, 68, 68) # #ef4444
r = g_start[0] + t * (r_end[0] - g_start[0])
g = g_start[1] + t * (r_end[1] - g_start[1])
b = g_start[2] + t * (r_end[2] - g_start[2])
return f"background-color: {_hex_from_rgb(r, g, b)}"
def _style_parameters(series: pd.Series) -> list[str]:
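    """Return per-cell CSS backgrounds for the Parameters column, colored on a log scale."""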
s = pd.to_numeric(series, errors="coerce")
s_pos = s[s > 0]
if s_pos.empty:
return [""] * len(series)
logs = np.log10(s_pos)
lmin = float(np.nanmin(logs))
lmax = float(np.nanmax(logs))
if not np.isfinite(lmin) or not np.isfinite(lmax):
return [""] * len(series)
colors: list[str] = []
for v in s:
if pd.isna(v) or v <= 0:
colors.append("")
else:
lv = np.log10(v)
if lmax == lmin:
t = 0.0
else:
t = (lv - lmin) / (lmax - lmin)
colors.append(_bg_color_from_t(float(t)))
return colors
def _format_value_minimal(v) -> str:
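    """Format a cell value compactly: integers without decimals, floats without trailing zeros."""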
if pd.isna(v):
return ""
if isinstance(v, str):
return v
if isinstance(v, (int, np.integer)):
return str(int(v))
    if isinstance(v, (float, np.floating)):
        if abs(v - round(v)) < 1e-9:
            return str(int(round(v)))
        return f"{float(v):.6f}".rstrip("0").rstrip(".")
    # Fall back to plain string conversion for any other type
    return str(v)
def _prepare_dataframe(json_path: str) -> pd.DataFrame:
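    """Load the leaderboard JSON and normalize it for display.

    Drops excluded columns, coerces numeric types, computes avg_score,
    sorts by it, and inserts a separator column after Parameters.
    """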
df = load_leaderboard_json(json_path)
if df.empty:
return df
# Remove columns not to be displayed per schema (Quantization, any *_time or time)
columns_to_exclude = [
c for c in df.columns
if c.lower() == "quantization" or c.lower().endswith("_time") or c.lower() == "time"
]
df = df.drop(columns=columns_to_exclude, errors="ignore")
# Normalize types
if "Parameters" in df.columns:
df["Parameters"] = pd.to_numeric(df["Parameters"], errors="coerce")
if "src_clf" in df.columns:
df["src_clf"] = pd.to_numeric(df["src_clf"], errors="coerce")
# Compute avg_score across numeric metric columns (exclude meta)
meta_cols = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
metric_candidates = [c for c in df.columns if c not in meta_cols]
if metric_candidates:
numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in metric_candidates})
df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)
# Sort by avg_score descending by default if present
if "avg_score" in df.columns:
df = df.sort_values(by="avg_score", ascending=False, na_position="last")
# Preferred column order
preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
remaining_cols = [c for c in df.columns if c not in preferred_order]
# Ensure avg_score is first among metric columns
if "avg_score" in remaining_cols:
remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
if preferred_order:
df = df[preferred_order + remaining_cols]
# Insert a visual separator column after Parameters to split meta from scores
if "Parameters" in df.columns:
sep_col_name = "—"
insert_at = df.columns.get_loc("Parameters") + 1
df.insert(insert_at, sep_col_name, "")
return df
def _param_bins_mask(param_series: pd.Series, selected_bins: list[str] | None) -> pd.Series:
"""Build a boolean mask for selected parameter bins.
    Bins are in billions: <10B, 10B-25B, 25B-50B, 50B-100B, 100B+.
Automatically converts raw counts to billions if values look large.
"""
if not selected_bins:
return pd.Series(True, index=param_series.index)
# Ensure numeric
s = pd.to_numeric(param_series, errors="coerce")
# Heuristic: if median is large, assume raw parameter counts and convert to billions
median_val = s.dropna().median()
if pd.notna(median_val) and median_val > 1e6:
s_b = s / 1e9
else:
s_b = s
bin_map: dict[str, tuple[float, float | None]] = {
"<10B": (0.0, 10.0),
"10B-25B": (10.0, 25.0),
"25B-50B": (25.0, 50.0),
"50B-100B": (50.0, 100.0),
"100B+": (100.0, None),
}
mask = pd.Series(False, index=s_b.index)
for label in selected_bins:
if label not in bin_map:
continue
low, high = bin_map[label]
if high is None:
mask |= s_b >= low
else:
mask |= (s_b >= low) & (s_b < high)
# Drop NaNs from consideration
mask &= s_b.notna()
return mask
def _apply_filters(df: pd.DataFrame, name_filter: str | None, param_bins: list[str] | None) -> pd.DataFrame:
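    """Filter rows by case-insensitive model-name substring and selected parameter bins."""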
if df.empty:
return df
mask = pd.Series(True, index=df.index)
# Name filter (case-insensitive substring match on Model)
if name_filter:
col = "Model" if "Model" in df.columns else None
if col is not None:
name_mask = df[col].astype(str).str.contains(name_filter, case=False, na=False)
mask &= name_mask
# Parameter bins filter
if param_bins and "Parameters" in df.columns:
bins_mask = _param_bins_mask(df["Parameters"], param_bins)
mask &= bins_mask
return df[mask]
def build_view(json_path: str, name_filter: str = "", param_bins: list[str] | None = None) -> object:
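    """Build the styled leaderboard table for the given filters."""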
df = _prepare_dataframe(json_path)
    # Drop rows missing required task scores (guard against absent columns)
    required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in df.columns]
    if required_cols:
        df = df.dropna(subset=required_cols)
# Apply filters if provided
df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)
# Produce a styled DataFrame (log-scale colors on Parameters, minimal decimals formatting)
if isinstance(df, pd.DataFrame) and not df.empty:
styler = df.style
if "Parameters" in df.columns:
styler = styler.apply(_style_parameters, subset=["Parameters"]) # type: ignore
styler = styler.format(_format_value_minimal)
table_value: object = styler
else:
# Empty DataFrame fallback
table_value = pd.DataFrame()
return table_value
def build_view_only(
json_path: str,
name_filter: str = "",
param_bins: list[str] | None = None,
excluded_tasks: list[str] | None = None,
):
"""Return only the table without updating the exclude-tasks control.
This prevents infinite loops when called from change handlers.
"""
df = _prepare_dataframe(json_path)
# Determine all task-like columns (before exclusion)
meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
tasks_all = [c for c in df.columns if c not in meta_cols_base]
excluded_set = set(excluded_tasks or [])
# Keep only tasks that actually exist
excluded_valid = [t for t in excluded_set if t in tasks_all]
included_tasks = [c for c in tasks_all if c not in excluded_set]
# Drop rows that are missing values for required tasks (only those that are included)
required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in included_tasks]
if required_cols:
df = df.dropna(subset=required_cols, axis=0)
# Apply filters
df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)
# Remove excluded task columns from view
if excluded_valid:
df = df.drop(columns=[c for c in excluded_valid if c in df.columns], errors="ignore")
# Recompute avg_score from only included tasks
# Determine tasks present in df after exclusion
meta_cols_after = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
current_metric_cols = [c for c in df.columns if c not in meta_cols_after]
# Drop existing avg_score before recomputation
if "avg_score" in df.columns:
df = df.drop(columns=["avg_score"]) # will be re-added below
if current_metric_cols:
numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in current_metric_cols})
df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)
else:
# No metrics left; fill avg_score with NaN to keep schema consistent
df["avg_score"] = np.nan
# Sort and reorder columns similar to _prepare_dataframe
if "avg_score" in df.columns:
df = df.sort_values(by="avg_score", ascending=False, na_position="last")
preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
remaining_cols = [c for c in df.columns if c not in preferred_order]
if "avg_score" in remaining_cols:
remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
if preferred_order:
df = df[preferred_order + remaining_cols]
# Ensure separator column exists right after Parameters
if "Parameters" in df.columns and "—" not in df.columns:
insert_at = df.columns.get_loc("Parameters") + 1
df.insert(insert_at, "—", "")
# Style for display
if isinstance(df, pd.DataFrame) and not df.empty:
styler = df.style
if "Parameters" in df.columns:
styler = styler.apply(_style_parameters, subset=["Parameters"]) # type: ignore
styler = styler.format(_format_value_minimal)
table_value: object = styler
else:
table_value = pd.DataFrame()
return table_value
def initialize_tasks_choices(json_path: str):
"""Initialize the task choices for the exclude tasks checkbox.
This is separate from the table building to avoid infinite loops.
"""
df = _prepare_dataframe(json_path)
# Determine all task-like columns
meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
tasks_all = [c for c in df.columns if c not in meta_cols_base]
# Return update for the exclude tasks checkbox with just the choices, no value change
tasks_update = gr.update(choices=tasks_all)
return tasks_update
def build_view_and_tasks(
json_path: str,
name_filter: str = "",
param_bins: list[str] | None = None,
excluded_tasks: list[str] | None = None,
):
"""Return the table and an update object for the exclude-tasks control.
Used only for initial loading to set up the choices.
"""
table_value = build_view_only(json_path, name_filter, param_bins, excluded_tasks)
tasks_update = initialize_tasks_choices(json_path)
return table_value, tasks_update
# ---------------------- Failure cases handling ----------------------
def load_failure_cases_json(json_path: str) -> Dict[str, List[Dict[str, str]]]:
"""Load failure cases from JSON file.
Returns dict mapping model_id -> list of failure cases.
"""
path = Path(json_path)
if not path.exists() or not path.is_file():
return {}
try:
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)
if isinstance(data, dict):
return data
return {}
except Exception:
return {}
def get_available_models(failure_cases_data: Dict[str, List[Dict[str, str]]]) -> List[str]:
"""Get list of available models from failure cases data."""
return sorted(failure_cases_data.keys()) if failure_cases_data else []
def render_failure_cases(
json_path: str,
selected_model: str
) -> str:
"""Render failure cases for selected model as JSON string."""
if not selected_model:
return "{}"
failure_cases_data = load_failure_cases_json(json_path)
if selected_model not in failure_cases_data:
return "{}"
cases = failure_cases_data[selected_model]
if not cases:
return "[]"
    for case in cases:
        # Pull the numeric judge score out of the reasoning text, if present
        match = re.search(r"(\d+\.\d+)", case.get("reasoning", ""))
        if match:
            case["score"] = float(match.group(1))
# Return formatted JSON string
return json.dumps(cases, ensure_ascii=False, indent=2)
def initialize_failure_cases_dropdown(json_path: str):
"""Initialize the model dropdown for failure cases."""
failure_cases_data = load_failure_cases_json(json_path)
models = get_available_models(failure_cases_data)
    if models:
        return gr.update(choices=models, value=models[0])
    return gr.update(choices=[], value=None)
def ui() -> gr.Blocks:
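    """Assemble the Gradio Blocks app with Leaderboard and Failure Cases tabs."""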
with gr.Blocks(title="Model Leaderboard") as demo:
gr.Markdown("""
### Polish Legal RAG Leaderboard
Explore and compare model performance on Polish legal QA tasks.
""")
# Fixed internal state for the JSON paths
json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)
failure_cases_path_state = gr.State(value=DEFAULT_FAILURE_CASES_JSON)
with gr.Tabs():
with gr.Tab("Leaderboard"):
gr.Markdown("""
- Use filters to narrow by name and parameter bins.
- Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly.
- Click column headers to sort; data updates automatically as filters change.
""")
# Filters
with gr.Row():
name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1)
param_bins_in = gr.CheckboxGroup(
label="Parameter bins",
choices=PARAM_BIN_CHOICES,
value=[],
info="Select one or more bins"
)
excluded_tasks_in = gr.CheckboxGroup(
label="Exclude tasks",
choices=[],
value=[],
info="Select tasks to hide; all are shown by default",
)
# Non-interactive so Pandas Styler is respected; header sorting remains available
leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False)
demo.load(
fn=build_view_and_tasks,
inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
outputs=[leaderboard_out, excluded_tasks_in],
)
# Recompute table on filter changes
name_filter_in.change(
fn=build_view_only,
inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
outputs=[leaderboard_out],
)
param_bins_in.change(
fn=build_view_only,
inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
outputs=[leaderboard_out],
)
excluded_tasks_in.change(
fn=build_view_only,
inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
outputs=[leaderboard_out],
)
gr.Markdown("""
### Methodology
- **`src_clf`**: Source classification of a fragment.
- **`sum_rag`**: RAG-style QA answered strictly from the provided passages. Answers are graded by a GPT-4o judge on a 0-2 scale; we report the F1 score.
- **`sum_rag_v2`**: Advanced legal reasoning dataset with multiple question types:
- **Contradiction resolution**: Questions about resolving contradictions or ambiguities within legal texts, requiring analysis of conflicting rules or statements
- **Legal inference**: Questions testing whether hypothetical situations meet specific legal criteria, requiring identification of legal prerequisites and exceptions
""")
gr.Markdown("""
### Notes
- GPT-5-nano sometimes fails to answer, responding with an empty string.
- GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall.
- The Llama-3-8B-Instruct family has a limited context length (Llama 3: 8k, Llama 3.1: 16k), so if the passages are too long the model cannot answer and is given a score of 0.
- The Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0. It wasn't trained for the `src_clf` task.
""")
gr.Markdown("""
### Language and RAG prompt
- All tasks, passages and questions are in Polish. The models are instructed to answer in Polish.
```text
Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości.
Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko.
{passages}
Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści.
Jeżeli odpowiedź nie jest zawarta w źródłach, odpowiedz, że nie ma odpowiedzi w źródłach.
To jest kluczowe, że odpowiedź musi być oparta wyłącznie na źródłach.
```
""")
with gr.Tab("Failure Cases"):
gr.Markdown("""
### Failure Cases Analysis
Explore failure cases by model to understand where models struggle.
""")
with gr.Row():
model_dropdown = gr.Dropdown(
label="Select Model",
choices=[],
value=None,
info="Choose a model to view its failure cases"
)
failure_cases_out = gr.Code(
label="Failure Cases",
language="json",
interactive=False,
lines=15
)
# Initialize dropdown and load data
demo.load(
fn=initialize_failure_cases_dropdown,
inputs=[failure_cases_path_state],
outputs=[model_dropdown],
)
# Update failure cases when model selection changes
model_dropdown.change(
fn=render_failure_cases,
inputs=[failure_cases_path_state, model_dropdown],
outputs=[failure_cases_out],
)
return demo
if __name__ == "__main__":
app = ui()
app.queue().launch(server_name="0.0.0.0", server_port=7860, show_api=False)