import json
import re
from pathlib import Path
from typing import Dict, List

import gradio as gr
import numpy as np
import pandas as pd

DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve())
DEFAULT_FAILURE_CASES_JSON = str((Path(__file__).parent / "failure_cases.json").resolve())

# Predefined parameter bins for filtering (in billions)
PARAM_BIN_CHOICES: list[str] = [
    "<10B",
    "10B-25B",
    "25B-50B",
    "50B-100B",
    "100B+",
]

def load_leaderboard_json(json_path: str) -> pd.DataFrame:
    path = Path(json_path)
    if not path.exists() or not path.is_file():
        return pd.DataFrame()
    try:
        with open(path, "r", encoding="utf-8") as f:
            records = json.load(f)
        # records should be a list of dicts; fall back gracefully if it is a dict
        if isinstance(records, dict):
            # If the list is wrapped, try to unwrap common keys
            for key in ["data", "records", "items", "leaderboard"]:
                if key in records and isinstance(records[key], list):
                    records = records[key]
                    break
        if not isinstance(records, list):
            return pd.DataFrame()
        return pd.DataFrame.from_records(records)
    except Exception:
        return pd.DataFrame()

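# A minimal sketch of the expected leaderboard.json payload (assumed shape, not
# taken from the actual data file): a list of per-model records, optionally
# wrapped under a key such as "data". Any column besides Model/Provider/Parameters
# is treated as a metric column downstream; the record below is purely illustrative.
#
#   [
#     {"Model": "example-model", "Provider": "ExampleOrg", "Parameters": 8000000000,
#      "src_clf": 0.71, "sum_rag": 0.64, "sum_rag_v2": 0.58}
#   ]
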
def _hex_from_rgb(r: float, g: float, b: float) -> str:
    r = max(0, min(255, int(round(r))))
    g = max(0, min(255, int(round(g))))
    b = max(0, min(255, int(round(b))))
    return f"#{r:02x}{g:02x}{b:02x}"


def _bg_color_from_t(t: float) -> str:
    t = max(0.0, min(1.0, float(t)))
    # Interpolate green (small) -> red (large)
    g_start = (34, 197, 94)  # #22c55e
    r_end = (239, 68, 68)    # #ef4444
    r = g_start[0] + t * (r_end[0] - g_start[0])
    g = g_start[1] + t * (r_end[1] - g_start[1])
    b = g_start[2] + t * (r_end[2] - g_start[2])
    return f"background-color: {_hex_from_rgb(r, g, b)}"

def _style_parameters(series: pd.Series) -> list[str]:
    s = pd.to_numeric(series, errors="coerce")
    s_pos = s[s > 0]
    if s_pos.empty:
        return [""] * len(series)
    logs = np.log10(s_pos)
    lmin = float(np.nanmin(logs))
    lmax = float(np.nanmax(logs))
    if not np.isfinite(lmin) or not np.isfinite(lmax):
        return [""] * len(series)
    colors: list[str] = []
    for v in s:
        if pd.isna(v) or v <= 0:
            colors.append("")
        else:
            lv = np.log10(v)
            if lmax == lmin:
                t = 0.0
            else:
                t = (lv - lmin) / (lmax - lmin)
            colors.append(_bg_color_from_t(float(t)))
    return colors

def _format_value_minimal(v) -> str:
    if pd.isna(v):
        return ""
    if isinstance(v, str):
        return v
    if isinstance(v, (int, np.integer)):
        return str(int(v))
    if isinstance(v, (float, np.floating)):
        if abs(v - round(v)) < 1e-9:
            return str(int(round(v)))
        s = f"{float(v):.6f}".rstrip("0").rstrip(".")
        return s
    # Fallback for any other type so the Styler always receives a string
    return str(v)

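# Formatting behaviour on a few representative values (illustrative only):
# >>> _format_value_minimal(3.0)
# '3'
# >>> _format_value_minimal(0.50)
# '0.5'
# >>> _format_value_minimal(float("nan"))
# ''
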
def _prepare_dataframe(json_path: str) -> pd.DataFrame:
    df = load_leaderboard_json(json_path)
    if df.empty:
        return df
    # Remove columns not to be displayed per schema (Quantization, any *_time or time)
    columns_to_exclude = [
        c for c in df.columns
        if c.lower() == "quantization" or c.lower().endswith("_time") or c.lower() == "time"
    ]
    df = df.drop(columns=columns_to_exclude, errors="ignore")
    # Normalize types
    if "Parameters" in df.columns:
        df["Parameters"] = pd.to_numeric(df["Parameters"], errors="coerce")
    if "src_clf" in df.columns:
        df["src_clf"] = pd.to_numeric(df["src_clf"], errors="coerce")
    # Compute avg_score across numeric metric columns (excluding meta columns)
    meta_cols = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
    metric_candidates = [c for c in df.columns if c not in meta_cols]
    if metric_candidates:
        numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in metric_candidates})
        df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)
    # Sort by avg_score descending by default if present
    if "avg_score" in df.columns:
        df = df.sort_values(by="avg_score", ascending=False, na_position="last")
    # Preferred column order: meta columns first, then avg_score, then the metrics
    preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
    remaining_cols = [c for c in df.columns if c not in preferred_order]
    if "avg_score" in remaining_cols:
        remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
    if preferred_order:
        df = df[preferred_order + remaining_cols]
    # Insert a visual separator column after Parameters to split meta from scores
    if "Parameters" in df.columns:
        sep_col_name = "—"
        insert_at = df.columns.get_loc("Parameters") + 1
        df.insert(insert_at, sep_col_name, "")
    return df

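# After preparation the columns are laid out roughly as:
#   Model | Provider | Parameters | — | avg_score | <metric columns...>
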
def _param_bins_mask(param_series: pd.Series, selected_bins: list[str] | None) -> pd.Series:
    """Build a boolean mask for the selected parameter bins.

    Bins are in billions: <10B, 10B-25B, 25B-50B, 50B-100B, 100B+.
    Raw parameter counts are automatically converted to billions if the values look large.
    """
    if not selected_bins:
        return pd.Series(True, index=param_series.index)
    # Ensure numeric
    s = pd.to_numeric(param_series, errors="coerce")
    # Heuristic: if the median is large, assume raw parameter counts and convert to billions
    median_val = s.dropna().median()
    if pd.notna(median_val) and median_val > 1e6:
        s_b = s / 1e9
    else:
        s_b = s
    bin_map: dict[str, tuple[float, float | None]] = {
        "<10B": (0.0, 10.0),
        "10B-25B": (10.0, 25.0),
        "25B-50B": (25.0, 50.0),
        "50B-100B": (50.0, 100.0),
        "100B+": (100.0, None),
    }
    mask = pd.Series(False, index=s_b.index)
    for label in selected_bins:
        if label not in bin_map:
            continue
        low, high = bin_map[label]
        if high is None:
            mask |= s_b >= low
        else:
            mask |= (s_b >= low) & (s_b < high)
    # Drop NaNs from consideration
    mask &= s_b.notna()
    return mask

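# Illustrative use with hypothetical values: raw parameter counts are converted
# to billions before binning, so 8e9 lands in "<10B" while 70e9 does not.
# >>> _param_bins_mask(pd.Series([8e9, 70e9]), ["<10B"]).tolist()
# [True, False]
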
def _apply_filters(df: pd.DataFrame, name_filter: str | None, param_bins: list[str] | None) -> pd.DataFrame:
    if df.empty:
        return df
    mask = pd.Series(True, index=df.index)
    # Name filter (case-insensitive, literal substring match on Model)
    if name_filter:
        col = "Model" if "Model" in df.columns else None
        if col is not None:
            name_mask = df[col].astype(str).str.contains(name_filter, case=False, na=False, regex=False)
            mask &= name_mask
    # Parameter bins filter
    if param_bins and "Parameters" in df.columns:
        bins_mask = _param_bins_mask(df["Parameters"], param_bins)
        mask &= bins_mask
    return df[mask]

def build_view(json_path: str, name_filter: str = "", param_bins: list[str] | None = None) -> object:
    df = _prepare_dataframe(json_path)
    # Drop rows missing values for the required tasks (guard against absent columns)
    required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in df.columns]
    if required_cols:
        df = df.dropna(subset=required_cols)
    # Apply filters if provided
    df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)
    # Produce a styled DataFrame (log-scale colors on Parameters, minimal-decimal formatting)
    if isinstance(df, pd.DataFrame) and not df.empty:
        styler = df.style
        if "Parameters" in df.columns:
            styler = styler.apply(_style_parameters, subset=["Parameters"])  # type: ignore
        styler = styler.format(_format_value_minimal)
        table_value: object = styler
    else:
        # Empty DataFrame fallback
        table_value = pd.DataFrame()
    return table_value

def build_view_only(
    json_path: str,
    name_filter: str = "",
    param_bins: list[str] | None = None,
    excluded_tasks: list[str] | None = None,
):
    """Return only the table, without updating the exclude-tasks control.

    This prevents infinite loops when called from change handlers.
    """
    df = _prepare_dataframe(json_path)
    # Determine all task-like columns (before exclusion)
    meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
    tasks_all = [c for c in df.columns if c not in meta_cols_base]
    excluded_set = set(excluded_tasks or [])
    # Keep only excluded tasks that actually exist
    excluded_valid = [t for t in excluded_set if t in tasks_all]
    included_tasks = [c for c in tasks_all if c not in excluded_set]
    # Drop rows missing values for required tasks (only those that are included)
    required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in included_tasks]
    if required_cols:
        df = df.dropna(subset=required_cols)
    # Apply filters
    df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)
    # Remove excluded task columns from the view
    if excluded_valid:
        df = df.drop(columns=[c for c in excluded_valid if c in df.columns], errors="ignore")
    # Recompute avg_score from only the included tasks.
    # Determine the tasks present in df after exclusion.
    meta_cols_after = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
    current_metric_cols = [c for c in df.columns if c not in meta_cols_after]
    # Drop the existing avg_score before recomputation
    if "avg_score" in df.columns:
        df = df.drop(columns=["avg_score"])  # will be re-added below
    if current_metric_cols:
        numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in current_metric_cols})
        df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)
    else:
        # No metrics left; fill avg_score with NaN to keep the schema consistent
        df["avg_score"] = np.nan
    # Sort and reorder columns, mirroring _prepare_dataframe
    if "avg_score" in df.columns:
        df = df.sort_values(by="avg_score", ascending=False, na_position="last")
    preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
    remaining_cols = [c for c in df.columns if c not in preferred_order]
    if "avg_score" in remaining_cols:
        remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
    if preferred_order:
        df = df[preferred_order + remaining_cols]
    # Ensure the separator column exists right after Parameters
    if "Parameters" in df.columns and "—" not in df.columns:
        insert_at = df.columns.get_loc("Parameters") + 1
        df.insert(insert_at, "—", "")
    # Style for display
    if isinstance(df, pd.DataFrame) and not df.empty:
        styler = df.style
        if "Parameters" in df.columns:
            styler = styler.apply(_style_parameters, subset=["Parameters"])  # type: ignore
        styler = styler.format(_format_value_minimal)
        table_value: object = styler
    else:
        table_value = pd.DataFrame()
    return table_value

def initialize_tasks_choices(json_path: str):
    """Initialize the task choices for the exclude-tasks checkbox group.

    This is kept separate from table building to avoid infinite loops.
    """
    df = _prepare_dataframe(json_path)
    # Determine all task-like columns
    meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
    tasks_all = [c for c in df.columns if c not in meta_cols_base]
    # Return an update for the exclude-tasks checkbox with just the choices, no value change
    tasks_update = gr.update(choices=tasks_all)
    return tasks_update

def build_view_and_tasks(
    json_path: str,
    name_filter: str = "",
    param_bins: list[str] | None = None,
    excluded_tasks: list[str] | None = None,
):
    """Return the table and an update object for the exclude-tasks control.

    Used only on initial load to set up the choices.
    """
    table_value = build_view_only(json_path, name_filter, param_bins, excluded_tasks)
    tasks_update = initialize_tasks_choices(json_path)
    return table_value, tasks_update

# ---------------------- Failure cases handling ----------------------

def load_failure_cases_json(json_path: str) -> Dict[str, List[Dict[str, str]]]:
    """Load failure cases from a JSON file.

    Returns a dict mapping model_id -> list of failure cases.
    """
    path = Path(json_path)
    if not path.exists() or not path.is_file():
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, dict):
            return data
        return {}
    except Exception:
        return {}

def get_available_models(failure_cases_data: Dict[str, List[Dict[str, str]]]) -> List[str]:
    """Get the list of available models from the failure cases data."""
    return sorted(failure_cases_data.keys()) if failure_cases_data else []

def render_failure_cases(
    json_path: str,
    selected_model: str,
) -> str:
    """Render the failure cases for the selected model as a JSON string."""
    if not selected_model:
        return "{}"
    failure_cases_data = load_failure_cases_json(json_path)
    if selected_model not in failure_cases_data:
        return "{}"
    cases = failure_cases_data[selected_model]
    if not cases:
        return "[]"
    # Surface the numeric judge score embedded in the free-text reasoning, if present
    for case in cases:
        score = re.search(r"(\d+\.\d+)", case.get("reasoning", ""))
        if score:
            case["score"] = float(score.group(1))
    # Return a formatted JSON string
    return json.dumps(cases, ensure_ascii=False, indent=2)

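# Assumed failure_cases.json layout (illustrative; only the "reasoning" key is
# actually relied on above): a mapping from model id to a list of case dicts, e.g.
#   {"some-model": [{"question": "...", "answer": "...", "reasoning": "... 0.5 ..."}]}
# When a decimal number appears in "reasoning", it is surfaced as a "score" field.
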
def initialize_failure_cases_dropdown(json_path: str):
    """Initialize the model dropdown for failure cases."""
    failure_cases_data = load_failure_cases_json(json_path)
    models = get_available_models(failure_cases_data)
    if models:
        return gr.update(choices=models, value=models[0])
    return gr.update(choices=[], value=None)

def ui() -> gr.Blocks:
    with gr.Blocks(title="Model Leaderboard") as demo:
        gr.Markdown("""
        ### Polish Legal RAG Leaderboard
        Explore and compare model performance on Polish legal QA tasks.
        """)
        # Fixed internal state for the JSON paths
        json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)
        failure_cases_path_state = gr.State(value=DEFAULT_FAILURE_CASES_JSON)

        with gr.Tabs():
            with gr.Tab("Leaderboard"):
                gr.Markdown("""
                - Use the filters to narrow by name and parameter bins.
                - Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly.
                - Click column headers to sort; the table updates automatically as filters change.
                """)
                # Filters
                with gr.Row():
                    name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1)
                    param_bins_in = gr.CheckboxGroup(
                        label="Parameter bins",
                        choices=PARAM_BIN_CHOICES,
                        value=[],
                        info="Select one or more bins",
                    )
                    excluded_tasks_in = gr.CheckboxGroup(
                        label="Exclude tasks",
                        choices=[],
                        value=[],
                        info="Select tasks to hide; all are shown by default",
                    )
                # Non-interactive so the pandas Styler is respected; header sorting remains available
                leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False)

                demo.load(
                    fn=build_view_and_tasks,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out, excluded_tasks_in],
                )
                # Recompute the table on filter changes
                name_filter_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
                param_bins_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
                excluded_tasks_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
gr.Markdown(""" | |
### Methodology | |
- **`src_clf`**: Source classification of a fragment. | |
- **`sum_rag`**: RAG-style QA strictly from provided passages. Answers are graded by a judge gpt-4o model on a 0-2 scale; we report F1 score. | |
- **`sum_rag_v2`**: Advanced legal reasoning dataset with multiple question types: | |
- **Contradiction resolution**: Questions about resolving contradictions or ambiguities within legal texts, requiring analysis of conflicting rules or statements | |
- **Legal inference**: Questions testing whether hypothetical situations meet specific legal criteria, requiring identification of legal prerequisites and exceptions | |
""") | |
gr.Markdown(""" | |
### Notes | |
- GPT-5-nano sometimes fails to answer, responding with an empty string. | |
- GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall. | |
- Llama-3-8B-Instruct family has limited context length (3 - 8k, 3.1 - 16k), so if the passages are too long, the model will not be able to answer (and will thus be given score 0). | |
- Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0. It wasn't trained for `src_clf` task. | |
""") | |
gr.Markdown(""" | |
### Language and RAG prompt | |
- All tasks, passages and questions are in Polish. The models are instructed to answer in Polish. | |
```text | |
Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości. | |
Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko. | |
<relevant_info> | |
{passages} | |
</relevant_info> | |
Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści. | |
Jeżeli odpowiedź nie jest zawarta w <relevant_info>, odpowiedz że nie ma odpowiedzi w źródłach. | |
To jest kluczowe, że odpowiedź musi być oparta wyłącznie na <relevant_info>. | |
``` | |
""") | |
with gr.Tab("Failure Cases"): | |
gr.Markdown(""" | |
### Failure Cases Analysis | |
Explore failure cases by model to understand where models struggle. | |
""") | |
with gr.Row(): | |
model_dropdown = gr.Dropdown( | |
label="Select Model", | |
choices=[], | |
value=None, | |
info="Choose a model to view its failure cases" | |
) | |
failure_cases_out = gr.Code( | |
label="Failure Cases", | |
language="json", | |
interactive=False, | |
lines=15 | |
) | |
# Initialize dropdown and load data | |
demo.load( | |
fn=initialize_failure_cases_dropdown, | |
inputs=[failure_cases_path_state], | |
outputs=[model_dropdown], | |
) | |
# Update failure cases when model selection changes | |
model_dropdown.change( | |
fn=render_failure_cases, | |
inputs=[failure_cases_path_state, model_dropdown], | |
outputs=[failure_cases_out], | |
) | |
return demo | |
if __name__ == "__main__": | |
app = ui() | |
app.queue().launch(server_name="0.0.0.0", server_port=7860, show_api=False) |