import json
import re
from pathlib import Path

import gradio as gr
import numpy as np
import pandas as pd

DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve())
DEFAULT_FAILURE_CASES_JSON = str((Path(__file__).parent / "failure_cases.json").resolve())

# Predefined parameter bins for filtering (in billions)
PARAM_BIN_CHOICES: list[str] = [
    "<10B",
    "10B-25B",
    "25B-50B",
    "50B-100B",
    "100B+",
]


def load_leaderboard_json(json_path: str) -> pd.DataFrame:
    path = Path(json_path)
    if not path.exists() or not path.is_file():
        return pd.DataFrame()
    try:
        with open(path, "r", encoding="utf-8") as f:
            records = json.load(f)
        # records should be a list of dicts; fallback if dict
        if isinstance(records, dict):
            # If wrapped, try to unwrap common keys
            for key in ["data", "records", "items", "leaderboard"]:
                if key in records and isinstance(records[key], list):
                    records = records[key]
                    break
        if not isinstance(records, list):
            return pd.DataFrame()
        return pd.DataFrame.from_records(records)
    except Exception:
        return pd.DataFrame()
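
# A hedged sketch of the leaderboard.json shape this loader expects (the field
# values below are hypothetical; columns such as Quantization and *_time are
# dropped later in _prepare_dataframe):
#
#   [
#     {"Model": "example-model-7b", "Provider": "example", "Parameters": 7.0,
#      "src_clf": 0.81, "sum_rag": 0.74, "sum_rag_v2": 0.69},
#     ...
#   ]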


def _hex_from_rgb(r: float, g: float, b: float) -> str:
    r = max(0, min(255, int(round(r))))
    g = max(0, min(255, int(round(g))))
    b = max(0, min(255, int(round(b))))
    return f"#{r:02x}{g:02x}{b:02x}"


def _bg_color_from_t(t: float) -> str:
    t = max(0.0, min(1.0, float(t)))
    # Green (small) -> Red (big)
    g_start = (34, 197, 94)   # #22c55e
    r_end = (239, 68, 68)     # #ef4444
    r = g_start[0] + t * (r_end[0] - g_start[0])
    g = g_start[1] + t * (r_end[1] - g_start[1])
    b = g_start[2] + t * (r_end[2] - g_start[2])
    return f"background-color: {_hex_from_rgb(r, g, b)}"


def _style_parameters(series: pd.Series) -> list[str]:
    s = pd.to_numeric(series, errors="coerce")
    s_pos = s[s > 0]
    if s_pos.empty:
        return [""] * len(series)
    logs = np.log10(s_pos)
    lmin = float(np.nanmin(logs))
    lmax = float(np.nanmax(logs))
    if not np.isfinite(lmin) or not np.isfinite(lmax):
        return [""] * len(series)

    colors: list[str] = []
    for v in s:
        if pd.isna(v) or v <= 0:
            colors.append("")
        else:
            lv = np.log10(v)
            if lmax == lmin:
                t = 0.0
            else:
                t = (lv - lmin) / (lmax - lmin)
            colors.append(_bg_color_from_t(float(t)))
    return colors
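
# Illustrative: a single positive value degenerates to t = 0.0, i.e. green:
#
#   >>> _style_parameters(pd.Series([7.0]))
#   ['background-color: #22c55e']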


def _format_value_minimal(v) -> str:
    if pd.isna(v):
        return ""
    if isinstance(v, str):
        return v
    if isinstance(v, (int, np.integer)):
        return str(int(v))
    if isinstance(v, (float, np.floating)):
        if abs(v - round(v)) < 1e-9:
            return str(int(round(v)))
    s = f"{float(v):.6f}".rstrip("0").rstrip(".")
    return s
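
# A few illustrative inputs/outputs:
#
#   >>> _format_value_minimal(3.0)
#   '3'
#   >>> _format_value_minimal(0.125)
#   '0.125'
#   >>> _format_value_minimal(float("nan"))
#   ''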



def _prepare_dataframe(json_path: str) -> pd.DataFrame:
    df = load_leaderboard_json(json_path)
    if df.empty:
        return df

    # Remove columns not to be displayed per schema (Quantization, any *_time or time)
    columns_to_exclude = [
        c for c in df.columns
        if c.lower() == "quantization" or c.lower().endswith("_time") or c.lower() == "time"
    ]
    df = df.drop(columns=columns_to_exclude, errors="ignore")

    # Normalize types
    if "Parameters" in df.columns:
        df["Parameters"] = pd.to_numeric(df["Parameters"], errors="coerce")
    if "src_clf" in df.columns:
        df["src_clf"] = pd.to_numeric(df["src_clf"], errors="coerce")

    # Compute avg_score across numeric metric columns (exclude meta)
    meta_cols = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
    metric_candidates = [c for c in df.columns if c not in meta_cols]
    if metric_candidates:
        numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in metric_candidates})
        df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)

    # Sort by avg_score descending by default if present
    if "avg_score" in df.columns:
        df = df.sort_values(by="avg_score", ascending=False, na_position="last")

    # Preferred column order
    preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
    remaining_cols = [c for c in df.columns if c not in preferred_order]
    # Ensure avg_score is first among metric columns
    if "avg_score" in remaining_cols:
        remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
    if preferred_order:
        df = df[preferred_order + remaining_cols]

    # Insert a visual separator column after Parameters to split meta from scores
    if "Parameters" in df.columns:
        sep_col_name = "—"
        insert_at = df.columns.get_loc("Parameters") + 1
        df.insert(insert_at, sep_col_name, "")

    return df


def _param_bins_mask(param_series: pd.Series, selected_bins: list[str] | None) -> pd.Series:
    """Build a boolean mask for selected parameter bins.

    Bins are in billions: <10B, 10B-25B, 25B-50B, 50B-100B, 100B+.
    Automatically converts raw counts to billions if values look large.
    """
    if not selected_bins:
        return pd.Series(True, index=param_series.index)

    # Ensure numeric
    s = pd.to_numeric(param_series, errors="coerce")

    # Heuristic: if median is large, assume raw parameter counts and convert to billions
    median_val = s.dropna().median()
    if pd.notna(median_val) and median_val > 1e6:
        s_b = s / 1e9
    else:
        s_b = s

    bin_map: dict[str, tuple[float, float | None]] = {
        "<10B": (0.0, 10.0),
        "10B-25B": (10.0, 25.0),
        "25B-50B": (25.0, 50.0),
        "50B-100B": (50.0, 100.0),
        "100B+": (100.0, None),
    }

    mask = pd.Series(False, index=s_b.index)
    for label in selected_bins:
        if label not in bin_map:
            continue
        low, high = bin_map[label]
        if high is None:
            mask |= s_b >= low
        else:
            mask |= (s_b >= low) & (s_b < high)
    # Drop NaNs from consideration
    mask &= s_b.notna()
    return mask
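
# A minimal sketch of the bin mask (hypothetical values; the series median is
# small here, so values are treated as billions and not rescaled):
#
#   >>> _param_bins_mask(pd.Series([8.0, 30.0, 120.0]), ["<10B", "100B+"]).tolist()
#   [True, False, True]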


def _apply_filters(df: pd.DataFrame, name_filter: str | None, param_bins: list[str] | None) -> pd.DataFrame:
    if df.empty:
        return df

    mask = pd.Series(True, index=df.index)

    # Name filter (case-insensitive substring match on Model)
    if name_filter:
        col = "Model" if "Model" in df.columns else None
        if col is not None:
            name_mask = df[col].astype(str).str.contains(name_filter, case=False, na=False)
            mask &= name_mask

    # Parameter bins filter
    if param_bins and "Parameters" in df.columns:
        bins_mask = _param_bins_mask(df["Parameters"], param_bins)
        mask &= bins_mask

    return df[mask]


def build_view(json_path: str, name_filter: str = "", param_bins: list[str] | None = None) -> object:
    df = _prepare_dataframe(json_path)

    # Drop rows missing any of the required task scores; guard against absent columns
    required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in df.columns]
    if required_cols:
        df = df.dropna(subset=required_cols)

    # Apply filters if provided
    df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)

    # Produce a styled DataFrame (log-scale colors on Parameters, minimal decimals formatting)
    if isinstance(df, pd.DataFrame) and not df.empty:
        styler = df.style
        if "Parameters" in df.columns:
            styler = styler.apply(_style_parameters, subset=["Parameters"])  # type: ignore
        styler = styler.format(_format_value_minimal)
        table_value: object = styler
    else:
        # Empty DataFrame fallback
        table_value = pd.DataFrame()

    return table_value


def build_view_only(
    json_path: str,
    name_filter: str = "",
    param_bins: list[str] | None = None,
    excluded_tasks: list[str] | None = None,
):
    """Return only the table without updating the exclude-tasks control.
    
    This prevents infinite loops when called from change handlers.
    """
    df = _prepare_dataframe(json_path)

    # Determine all task-like columns (before exclusion)
    meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
    tasks_all = [c for c in df.columns if c not in meta_cols_base]

    excluded_set = set(excluded_tasks or [])
    # Keep only excluded tasks that actually exist in the data
    excluded_valid = [t for t in excluded_set if t in tasks_all]
    included_tasks = [c for c in tasks_all if c not in excluded_set]

    # Drop rows that are missing values for required tasks (only those that are included)
    required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in included_tasks]
    if required_cols:
        df = df.dropna(subset=required_cols, axis=0)

    # Apply filters
    df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)

    # Remove excluded task columns from view
    if excluded_valid:
        df = df.drop(columns=[c for c in excluded_valid if c in df.columns], errors="ignore")

    # Recompute avg_score from only included tasks
    # Determine tasks present in df after exclusion
    meta_cols_after = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
    current_metric_cols = [c for c in df.columns if c not in meta_cols_after]

    # Drop existing avg_score before recomputation
    if "avg_score" in df.columns:
        df = df.drop(columns=["avg_score"])  # will be re-added below

    if current_metric_cols:
        numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in current_metric_cols})
        df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)
    else:
        # No metrics left; fill avg_score with NaN to keep schema consistent
        df["avg_score"] = np.nan

    # Sort and reorder columns similar to _prepare_dataframe
    if "avg_score" in df.columns:
        df = df.sort_values(by="avg_score", ascending=False, na_position="last")

    preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
    remaining_cols = [c for c in df.columns if c not in preferred_order]
    if "avg_score" in remaining_cols:
        remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
    if preferred_order:
        df = df[preferred_order + remaining_cols]

    # Ensure separator column exists right after Parameters
    if "Parameters" in df.columns and "—" not in df.columns:
        insert_at = df.columns.get_loc("Parameters") + 1
        df.insert(insert_at, "—", "")

    # Style for display
    if isinstance(df, pd.DataFrame) and not df.empty:
        styler = df.style
        if "Parameters" in df.columns:
            styler = styler.apply(_style_parameters, subset=["Parameters"])  # type: ignore
        styler = styler.format(_format_value_minimal)
        table_value: object = styler
    else:
        table_value = pd.DataFrame()

    return table_value


def initialize_tasks_choices(json_path: str):
    """Initialize the task choices for the exclude tasks checkbox.
    
    This is separate from the table building to avoid infinite loops.
    """
    df = _prepare_dataframe(json_path)
    
    # Determine all task-like columns
    meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
    tasks_all = [c for c in df.columns if c not in meta_cols_base]
    
    # Return update for the exclude tasks checkbox with just the choices, no value change
    tasks_update = gr.update(choices=tasks_all)
    
    return tasks_update


def build_view_and_tasks(
    json_path: str,
    name_filter: str = "",
    param_bins: list[str] | None = None,
    excluded_tasks: list[str] | None = None,
):
    """Return the table and an update object for the exclude-tasks control.
    
    Used only for initial loading to set up the choices.
    """
    table_value = build_view_only(json_path, name_filter, param_bins, excluded_tasks)
    tasks_update = initialize_tasks_choices(json_path)
    
    return table_value, tasks_update


# ---------------------- Failure cases handling ----------------------

def load_failure_cases_json(json_path: str) -> dict[str, list[dict[str, str]]]:
    """Load failure cases from JSON file.
    
    Returns dict mapping model_id -> list of failure cases.
    """
    path = Path(json_path)
    if not path.exists() or not path.is_file():
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, dict):
            return data
        return {}
    except Exception:
        return {}
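
# A hedged sketch of the failure_cases.json shape (keys other than "reasoning"
# are hypothetical; render_failure_cases below only parses "reasoning"):
#
#   {
#     "example-model-7b": [
#       {"question": "...", "answer": "...",
#        "reasoning": "Judge score 0.50: answer not grounded in the sources."}
#     ]
#   }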


def get_available_models(failure_cases_data: dict[str, list[dict[str, str]]]) -> list[str]:
    """Get list of available models from failure cases data."""
    return sorted(failure_cases_data.keys()) if failure_cases_data else []


def render_failure_cases(json_path: str, selected_model: str) -> str:
    """Render failure cases for selected model as JSON string."""
    if not selected_model:
        return "{}"
    
    failure_cases_data = load_failure_cases_json(json_path)
    
    if selected_model not in failure_cases_data:
        return "{}"
    
    cases = failure_cases_data[selected_model]
    if not cases:
        return "[]"
    
    # Extract a numeric score from the judge's reasoning text, if present
    for case in cases:
        score = re.search(r"(\d+\.\d+)", case.get("reasoning", ""))
        if score:
            case["score"] = float(score.group(1))
    
    # Return formatted JSON string
    return json.dumps(cases, ensure_ascii=False, indent=2)


def initialize_failure_cases_dropdown(json_path: str):
    """Initialize the model dropdown for failure cases."""
    failure_cases_data = load_failure_cases_json(json_path)
    models = get_available_models(failure_cases_data)
    
    if models:
        return gr.update(choices=models, value=models[0] if models else None)
    else:
        return gr.update(choices=[], value=None)


def ui() -> gr.Blocks:
    with gr.Blocks(title="Model Leaderboard") as demo:
        gr.Markdown("""
        ### Polish Legal RAG Leaderboard

        Explore and compare model performance on Polish legal QA tasks.
        """)

        # Fixed internal state for the JSON paths
        json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)
        failure_cases_path_state = gr.State(value=DEFAULT_FAILURE_CASES_JSON)

        with gr.Tabs():
            with gr.Tab("Leaderboard"):
                gr.Markdown("""
                - Use filters to narrow by name and parameter bins.
                - Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly.
                - Click column headers to sort; data updates automatically as filters change.
                """)

                # Filters
                with gr.Row():
                    name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1)
                    param_bins_in = gr.CheckboxGroup(
                        label="Parameter bins",
                        choices=PARAM_BIN_CHOICES,
                        value=[],
                        info="Select one or more bins"
                    )
                    excluded_tasks_in = gr.CheckboxGroup(
                        label="Exclude tasks",
                        choices=[],
                        value=[],
                        info="Select tasks to hide; all are shown by default",
                    )

                # Non-interactive so Pandas Styler is respected; header sorting remains available
                leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False)

                demo.load(
                    fn=build_view_and_tasks,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out, excluded_tasks_in],
                )

                # Recompute table on filter changes
                name_filter_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
                param_bins_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
                excluded_tasks_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )

                gr.Markdown("""
                ### Methodology
                - **`src_clf`**: Source classification of a fragment.
                - **`sum_rag`**: RAG-style QA strictly from provided passages. Answers are graded by a gpt-4o judge model on a 0-2 scale; we report the F1 score.
                - **`sum_rag_v2`**: Advanced legal reasoning dataset with multiple question types:
                  - **Contradiction resolution**: Questions about resolving contradictions or ambiguities within legal texts, requiring analysis of conflicting rules or statements
                  - **Legal inference**: Questions testing whether hypothetical situations meet specific legal criteria, requiring identification of legal prerequisites and exceptions
                """)
                gr.Markdown("""
                ### Notes
                - GPT-5-nano sometimes fails to answer, responding with an empty string.
                - GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall.
                - The Llama-3-8B-Instruct family has limited context length (Llama 3: 8k, Llama 3.1: 16k), so if the passages are too long, the model cannot answer and is given a score of 0.
                - The Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0. It was not trained for the `src_clf` task.
                """)
                gr.Markdown("""
                ### Language and RAG prompt
                - All tasks, passages and questions are in Polish. The models are instructed to answer in Polish.

                ```text
                Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości.
                Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko.
                <relevant_info>
                {passages}
                </relevant_info>

                Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści.
                Jeżeli odpowiedź nie jest zawarta w <relevant_info>, odpowiedz że nie ma odpowiedzi w źródłach.
                To jest kluczowe, że odpowiedź musi być oparta wyłącznie na <relevant_info>.
                ```

                (In English, the prompt instructs the model to answer only in Polish, strictly on the basis of the sources in `<relevant_info>`, to give all relevant information with supporting arguments and quotations, never to rely on its own knowledge, and to state briefly that the sources contain no answer when they do not.)
                """)

            with gr.Tab("Failure Cases"):
                gr.Markdown("""
                ### Failure Cases Analysis
                
                Explore failure cases by model to understand where models struggle.
                """)

                with gr.Row():
                    model_dropdown = gr.Dropdown(
                        label="Select Model",
                        choices=[],
                        value=None,
                        info="Choose a model to view its failure cases"
                    )

                failure_cases_out = gr.Code(
                    label="Failure Cases", 
                    language="json",
                    interactive=False,
                    lines=15
                )

                # Initialize dropdown and load data
                demo.load(
                    fn=initialize_failure_cases_dropdown,
                    inputs=[failure_cases_path_state],
                    outputs=[model_dropdown],
                )

                # Update failure cases when model selection changes
                model_dropdown.change(
                    fn=render_failure_cases,
                    inputs=[failure_cases_path_state, model_dropdown],
                    outputs=[failure_cases_out],
                )

    return demo


if __name__ == "__main__":
    app = ui()
    app.queue().launch(server_name="0.0.0.0", server_port=7860, show_api=False)