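"""Gradio app for the Polish Legal RAG leaderboard (pl-legal-rag)."""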
import json
from pathlib import Path
import pandas as pd
import numpy as np
import gradio as gr
from typing import Dict, List
import re
DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve())
DEFAULT_FAILURE_CASES_JSON = str((Path(__file__).parent / "failure_cases.json").resolve())
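# Expected file shapes, as handled by the loaders below:
# - leaderboard.json: a list of per-model records, or a dict wrapping such a list
#   under a key like "data", "records", "items", or "leaderboard".
# - failure_cases.json: a dict mapping model id -> list of failure-case dicts.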
# Predefined parameter bins for filtering (in billions)
PARAM_BIN_CHOICES: list[str] = [
"<10B",
"10B-25B",
"25B-50B",
"50B-100B",
"100B+",
]
def load_leaderboard_json(json_path: str) -> pd.DataFrame:
path = Path(json_path)
if not path.exists() or not path.is_file():
return pd.DataFrame()
try:
with open(path, "r", encoding="utf-8") as f:
records = json.load(f)
# records should be a list of dicts; fallback if dict
if isinstance(records, dict):
# If wrapped, try to unwrap common keys
for key in ["data", "records", "items", "leaderboard"]:
if key in records and isinstance(records[key], list):
records = records[key]
break
if not isinstance(records, list):
return pd.DataFrame()
return pd.DataFrame.from_records(records)
except Exception:
return pd.DataFrame()
def _hex_from_rgb(r: float, g: float, b: float) -> str:
r = max(0, min(255, int(round(r))))
g = max(0, min(255, int(round(g))))
b = max(0, min(255, int(round(b))))
return f"#{r:02x}{g:02x}{b:02x}"
def _bg_color_from_t(t: float) -> str:
t = max(0.0, min(1.0, float(t)))
# Green (small) -> Red (big)
g_start = (34, 197, 94) # #22c55e
r_end = (239, 68, 68) # #ef4444
r = g_start[0] + t * (r_end[0] - g_start[0])
g = g_start[1] + t * (r_end[1] - g_start[1])
b = g_start[2] + t * (r_end[2] - g_start[2])
return f"background-color: {_hex_from_rgb(r, g, b)}"
def _style_parameters(series: pd.Series) -> list[str]:
s = pd.to_numeric(series, errors="coerce")
s_pos = s[s > 0]
if s_pos.empty:
return [""] * len(series)
logs = np.log10(s_pos)
lmin = float(np.nanmin(logs))
lmax = float(np.nanmax(logs))
if not np.isfinite(lmin) or not np.isfinite(lmax):
return [""] * len(series)
colors: list[str] = []
for v in s:
if pd.isna(v) or v <= 0:
colors.append("")
else:
lv = np.log10(v)
if lmax == lmin:
t = 0.0
else:
t = (lv - lmin) / (lmax - lmin)
colors.append(_bg_color_from_t(float(t)))
return colors
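# e.g. Parameters of 8e9, 7e10 and 4e11 map to t ≈ 0.0, 0.55 and 1.0 on the log
# scale, shading the cells from green (smallest) to red (largest).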
def _format_value_minimal(v) -> str:
if pd.isna(v):
return ""
if isinstance(v, str):
return v
if isinstance(v, (int, np.integer)):
return str(int(v))
    if isinstance(v, (float, np.floating)):
        if abs(v - round(v)) < 1e-9:
            return str(int(round(v)))
        return f"{float(v):.6f}".rstrip("0").rstrip(".")
    # Fallback so the declared "-> str" contract holds for any other input type
    return str(v)
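# e.g. 7.0 -> "7", 3.140000 -> "3.14", NaN -> "".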
def _prepare_dataframe(json_path: str) -> pd.DataFrame:
df = load_leaderboard_json(json_path)
if df.empty:
return df
# Remove columns not to be displayed per schema (Quantization, any *_time or time)
columns_to_exclude = [
c for c in df.columns
if c.lower() == "quantization" or c.lower().endswith("_time") or c.lower() == "time"
]
df = df.drop(columns=columns_to_exclude, errors="ignore")
# Normalize types
if "Parameters" in df.columns:
df["Parameters"] = pd.to_numeric(df["Parameters"], errors="coerce")
if "src_clf" in df.columns:
df["src_clf"] = pd.to_numeric(df["src_clf"], errors="coerce")
# Compute avg_score across numeric metric columns (exclude meta)
meta_cols = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
metric_candidates = [c for c in df.columns if c not in meta_cols]
if metric_candidates:
numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in metric_candidates})
df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)
# Sort by avg_score descending by default if present
if "avg_score" in df.columns:
df = df.sort_values(by="avg_score", ascending=False, na_position="last")
# Preferred column order
preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
remaining_cols = [c for c in df.columns if c not in preferred_order]
# Ensure avg_score is first among metric columns
if "avg_score" in remaining_cols:
remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
if preferred_order:
df = df[preferred_order + remaining_cols]
# Insert a visual separator column after Parameters to split meta from scores
if "Parameters" in df.columns:
sep_col_name = "—"
insert_at = df.columns.get_loc("Parameters") + 1
df.insert(insert_at, sep_col_name, "")
return df
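# Resulting column layout: Model | Provider | Parameters | "—" | avg_score | <task columns>,
# sorted by avg_score in descending order.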
def _param_bins_mask(param_series: pd.Series, selected_bins: list[str] | None) -> pd.Series:
"""Build a boolean mask for selected parameter bins.
Bins are in billions: <10B, 10B-25B, 25B-50B, 50B-100B, 100B-200B, 200B-300B, 300B+
Automatically converts raw counts to billions if values look large.
"""
if not selected_bins:
return pd.Series(True, index=param_series.index)
# Ensure numeric
s = pd.to_numeric(param_series, errors="coerce")
# Heuristic: if median is large, assume raw parameter counts and convert to billions
median_val = s.dropna().median()
if pd.notna(median_val) and median_val > 1e6:
s_b = s / 1e9
else:
s_b = s
bin_map: dict[str, tuple[float, float | None]] = {
"<10B": (0.0, 10.0),
"10B-25B": (10.0, 25.0),
"25B-50B": (25.0, 50.0),
"50B-100B": (50.0, 100.0),
"100B+": (100.0, None),
}
mask = pd.Series(False, index=s_b.index)
for label in selected_bins:
if label not in bin_map:
continue
low, high = bin_map[label]
if high is None:
mask |= s_b >= low
else:
mask |= (s_b >= low) & (s_b < high)
# Drop NaNs from consideration
mask &= s_b.notna()
return mask
def _apply_filters(df: pd.DataFrame, name_filter: str | None, param_bins: list[str] | None) -> pd.DataFrame:
if df.empty:
return df
mask = pd.Series(True, index=df.index)
# Name filter (case-insensitive substring match on Model)
if name_filter:
col = "Model" if "Model" in df.columns else None
if col is not None:
name_mask = df[col].astype(str).str.contains(name_filter, case=False, na=False)
mask &= name_mask
# Parameter bins filter
if param_bins and "Parameters" in df.columns:
bins_mask = _param_bins_mask(df["Parameters"], param_bins)
mask &= bins_mask
return df[mask]
def build_view(json_path: str, name_filter: str = "", param_bins: list[str] | None = None) -> object:
    df = _prepare_dataframe(json_path)
    # Drop rows missing any of the required task scores (only for columns that exist,
    # so an empty or partial leaderboard does not raise a KeyError)
    required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in df.columns]
    if required_cols:
        df = df.dropna(subset=required_cols)
# Apply filters if provided
df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)
# Produce a styled DataFrame (log-scale colors on Parameters, minimal decimals formatting)
if isinstance(df, pd.DataFrame) and not df.empty:
styler = df.style
if "Parameters" in df.columns:
styler = styler.apply(_style_parameters, subset=["Parameters"]) # type: ignore
styler = styler.format(_format_value_minimal)
table_value: object = styler
else:
# Empty DataFrame fallback
table_value = pd.DataFrame()
return table_value
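# Note: build_view is a simpler variant without task exclusion; the UI below wires
# build_view_only / build_view_and_tasks into its event handlers instead.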
def build_view_only(
json_path: str,
name_filter: str = "",
param_bins: list[str] | None = None,
excluded_tasks: list[str] | None = None,
):
"""Return only the table without updating the exclude-tasks control.
This prevents infinite loops when called from change handlers.
"""
df = _prepare_dataframe(json_path)
# Determine all task-like columns (before exclusion)
meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
tasks_all = [c for c in df.columns if c not in meta_cols_base]
excluded_set = set(excluded_tasks or [])
# Keep only tasks that actually exist
excluded_valid = [t for t in excluded_set if t in tasks_all]
included_tasks = [c for c in tasks_all if c not in excluded_set]
# Drop rows that are missing values for required tasks (only those that are included)
required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in included_tasks]
if required_cols:
df = df.dropna(subset=required_cols, axis=0)
# Apply filters
df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)
# Remove excluded task columns from view
if excluded_valid:
df = df.drop(columns=[c for c in excluded_valid if c in df.columns], errors="ignore")
# Recompute avg_score from only included tasks
# Determine tasks present in df after exclusion
meta_cols_after = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
current_metric_cols = [c for c in df.columns if c not in meta_cols_after]
# Drop existing avg_score before recomputation
if "avg_score" in df.columns:
df = df.drop(columns=["avg_score"]) # will be re-added below
if current_metric_cols:
numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in current_metric_cols})
df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)
else:
# No metrics left; fill avg_score with NaN to keep schema consistent
df["avg_score"] = np.nan
# Sort and reorder columns similar to _prepare_dataframe
if "avg_score" in df.columns:
df = df.sort_values(by="avg_score", ascending=False, na_position="last")
preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
remaining_cols = [c for c in df.columns if c not in preferred_order]
if "avg_score" in remaining_cols:
remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
if preferred_order:
df = df[preferred_order + remaining_cols]
# Ensure separator column exists right after Parameters
if "Parameters" in df.columns and "—" not in df.columns:
insert_at = df.columns.get_loc("Parameters") + 1
df.insert(insert_at, "—", "")
# Style for display
if isinstance(df, pd.DataFrame) and not df.empty:
styler = df.style
if "Parameters" in df.columns:
styler = styler.apply(_style_parameters, subset=["Parameters"]) # type: ignore
styler = styler.format(_format_value_minimal)
table_value: object = styler
else:
table_value = pd.DataFrame()
return table_value
def initialize_tasks_choices(json_path: str):
"""Initialize the task choices for the exclude tasks checkbox.
This is separate from the table building to avoid infinite loops.
"""
df = _prepare_dataframe(json_path)
# Determine all task-like columns
meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
tasks_all = [c for c in df.columns if c not in meta_cols_base]
# Return update for the exclude tasks checkbox with just the choices, no value change
tasks_update = gr.update(choices=tasks_all)
return tasks_update
def build_view_and_tasks(
json_path: str,
name_filter: str = "",
param_bins: list[str] | None = None,
excluded_tasks: list[str] | None = None,
):
"""Return the table and an update object for the exclude-tasks control.
Used only for initial loading to set up the choices.
"""
table_value = build_view_only(json_path, name_filter, param_bins, excluded_tasks)
tasks_update = initialize_tasks_choices(json_path)
return table_value, tasks_update
# ---------------------- Failure cases handling ----------------------
def load_failure_cases_json(json_path: str) -> Dict[str, List[Dict[str, str]]]:
"""Load failure cases from JSON file.
Returns dict mapping model_id -> list of failure cases.
"""
path = Path(json_path)
if not path.exists() or not path.is_file():
return {}
try:
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)
if isinstance(data, dict):
return data
return {}
except Exception:
return {}
def get_available_models(failure_cases_data: Dict[str, List[Dict[str, str]]]) -> List[str]:
"""Get list of available models from failure cases data."""
return sorted(failure_cases_data.keys()) if failure_cases_data else []
def render_failure_cases(
json_path: str,
selected_model: str
) -> str:
"""Render failure cases for selected model as JSON string."""
if not selected_model:
return "{}"
failure_cases_data = load_failure_cases_json(json_path)
if selected_model not in failure_cases_data:
return "{}"
cases = failure_cases_data[selected_model]
if not cases:
return "[]"
    for case in cases:
        score = re.search(r"(\d+\.\d+)", case.get("reasoning", ""))
        if score:
            case["score"] = float(score.group(1))
# Return formatted JSON string
return json.dumps(cases, ensure_ascii=False, indent=2)
def initialize_failure_cases_dropdown(json_path: str):
"""Initialize the model dropdown for failure cases."""
failure_cases_data = load_failure_cases_json(json_path)
models = get_available_models(failure_cases_data)
    if models:
        return gr.update(choices=models, value=models[0])
    return gr.update(choices=[], value=None)
def ui() -> gr.Blocks:
with gr.Blocks(title="Model Leaderboard") as demo:
gr.Markdown("""
### Polish Legal RAG Leaderboard
Explore and compare model performance on Polish legal QA tasks.
""")
# Fixed internal state for the JSON paths
json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)
failure_cases_path_state = gr.State(value=DEFAULT_FAILURE_CASES_JSON)
with gr.Tabs():
with gr.Tab("Leaderboard"):
gr.Markdown("""
- Use filters to narrow by name and parameter bins.
- Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly.
- Click column headers to sort; data updates automatically as filters change.
""")
# Filters
with gr.Row():
name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1)
param_bins_in = gr.CheckboxGroup(
label="Parameter bins",
choices=PARAM_BIN_CHOICES,
value=[],
info="Select one or more bins"
)
excluded_tasks_in = gr.CheckboxGroup(
label="Exclude tasks",
choices=[],
value=[],
info="Select tasks to hide; all are shown by default",
)
# Non-interactive so Pandas Styler is respected; header sorting remains available
leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False)
demo.load(
fn=build_view_and_tasks,
inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
outputs=[leaderboard_out, excluded_tasks_in],
)
# Recompute table on filter changes
name_filter_in.change(
fn=build_view_only,
inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
outputs=[leaderboard_out],
)
param_bins_in.change(
fn=build_view_only,
inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
outputs=[leaderboard_out],
)
excluded_tasks_in.change(
fn=build_view_only,
inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
outputs=[leaderboard_out],
)
gr.Markdown("""
### Methodology
- **`src_clf`**: Source classification of a fragment.
                - **`sum_rag`**: RAG-style QA strictly from the provided passages. Answers are graded by a gpt-4o judge model on a 0-2 scale; we report the F1 score.
- **`sum_rag_v2`**: Advanced legal reasoning dataset with multiple question types:
- **Contradiction resolution**: Questions about resolving contradictions or ambiguities within legal texts, requiring analysis of conflicting rules or statements
- **Legal inference**: Questions testing whether hypothetical situations meet specific legal criteria, requiring identification of legal prerequisites and exceptions
""")
gr.Markdown("""
### Notes
- GPT-5-nano sometimes fails to answer, responding with an empty string.
- GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall.
                - The Llama-3-8B-Instruct family has a limited context length (Llama 3: 8k, Llama 3.1: 16k), so if the passages are too long the model cannot answer and is given a score of 0.
                - The Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0. It was not trained for the `src_clf` task.
""")
gr.Markdown("""
### Language and RAG prompt
- All tasks, passages and questions are in Polish. The models are instructed to answer in Polish.
```text
Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości.
Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko.
<relevant_info>
{passages}
</relevant_info>
Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści.
Jeżeli odpowiedź nie jest zawarta w <relevant_info>, odpowiedz że nie ma odpowiedzi w źródłach.
To jest kluczowe, że odpowiedź musi być oparta wyłącznie na <relevant_info>.
```
""")
with gr.Tab("Failure Cases"):
gr.Markdown("""
### Failure Cases Analysis
Explore failure cases by model to understand where models struggle.
""")
with gr.Row():
model_dropdown = gr.Dropdown(
label="Select Model",
choices=[],
value=None,
info="Choose a model to view its failure cases"
)
failure_cases_out = gr.Code(
label="Failure Cases",
language="json",
interactive=False,
lines=15
)
# Initialize dropdown and load data
demo.load(
fn=initialize_failure_cases_dropdown,
inputs=[failure_cases_path_state],
outputs=[model_dropdown],
)
# Update failure cases when model selection changes
model_dropdown.change(
fn=render_failure_cases,
inputs=[failure_cases_path_state, model_dropdown],
outputs=[failure_cases_out],
)
return demo
if __name__ == "__main__":
app = ui()
app.queue().launch(server_name="0.0.0.0", server_port=7860, show_api=False)