import json from pathlib import Path import pandas as pd import numpy as np import gradio as gr from typing import Dict, List import re DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve()) DEFAULT_FAILURE_CASES_JSON = str((Path(__file__).parent / "failure_cases.json").resolve()) # Predefined parameter bins for filtering (in billions) PARAM_BIN_CHOICES: list[str] = [ "<10B", "10B-25B", "25B-50B", "50B-100B", "100B+", ] def load_leaderboard_json(json_path: str) -> pd.DataFrame: path = Path(json_path) if not path.exists() or not path.is_file(): return pd.DataFrame() try: with open(path, "r", encoding="utf-8") as f: records = json.load(f) # records should be a list of dicts; fallback if dict if isinstance(records, dict): # If wrapped, try to unwrap common keys for key in ["data", "records", "items", "leaderboard"]: if key in records and isinstance(records[key], list): records = records[key] break if not isinstance(records, list): return pd.DataFrame() return pd.DataFrame.from_records(records) except Exception: return pd.DataFrame() def _hex_from_rgb(r: float, g: float, b: float) -> str: r = max(0, min(255, int(round(r)))) g = max(0, min(255, int(round(g)))) b = max(0, min(255, int(round(b)))) return f"#{r:02x}{g:02x}{b:02x}" def _bg_color_from_t(t: float) -> str: t = max(0.0, min(1.0, float(t))) # Green (small) -> Red (big) g_start = (34, 197, 94) # #22c55e r_end = (239, 68, 68) # #ef4444 r = g_start[0] + t * (r_end[0] - g_start[0]) g = g_start[1] + t * (r_end[1] - g_start[1]) b = g_start[2] + t * (r_end[2] - g_start[2]) return f"background-color: {_hex_from_rgb(r, g, b)}" def _style_parameters(series: pd.Series) -> list[str]: s = pd.to_numeric(series, errors="coerce") s_pos = s[s > 0] if s_pos.empty: return [""] * len(series) logs = np.log10(s_pos) lmin = float(np.nanmin(logs)) lmax = float(np.nanmax(logs)) if not np.isfinite(lmin) or not np.isfinite(lmax): return [""] * len(series) colors: list[str] = [] for v in s: if pd.isna(v) or v <= 0: colors.append("") else: lv = np.log10(v) if lmax == lmin: t = 0.0 else: t = (lv - lmin) / (lmax - lmin) colors.append(_bg_color_from_t(float(t))) return colors def _format_value_minimal(v) -> str: if pd.isna(v): return "" if isinstance(v, str): return v if isinstance(v, (int, np.integer)): return str(int(v)) if isinstance(v, (float, np.floating)): if abs(v - round(v)) < 1e-9: return str(int(round(v))) s = f"{float(v):.6f}".rstrip("0").rstrip(".") return s def _prepare_dataframe(json_path: str) -> pd.DataFrame: df = load_leaderboard_json(json_path) if df.empty: return df # Remove columns not to be displayed per schema (Quantization, any *_time or time) columns_to_exclude = [ c for c in df.columns if c.lower() == "quantization" or c.lower().endswith("_time") or c.lower() == "time" ] df = df.drop(columns=columns_to_exclude, errors="ignore") # Normalize types if "Parameters" in df.columns: df["Parameters"] = pd.to_numeric(df["Parameters"], errors="coerce") if "src_clf" in df.columns: df["src_clf"] = pd.to_numeric(df["src_clf"], errors="coerce") # Compute avg_score across numeric metric columns (exclude meta) meta_cols = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns] metric_candidates = [c for c in df.columns if c not in meta_cols] if metric_candidates: numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in metric_candidates}) df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2) # Sort by avg_score descending by default if present if "avg_score" in df.columns: df = df.sort_values(by="avg_score", ascending=False, na_position="last") # Preferred column order preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns] remaining_cols = [c for c in df.columns if c not in preferred_order] # Ensure avg_score is first among metric columns if "avg_score" in remaining_cols: remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"] if preferred_order: df = df[preferred_order + remaining_cols] # Insert a visual separator column after Parameters to split meta from scores if "Parameters" in df.columns: sep_col_name = "—" insert_at = df.columns.get_loc("Parameters") + 1 df.insert(insert_at, sep_col_name, "") return df def _param_bins_mask(param_series: pd.Series, selected_bins: list[str] | None) -> pd.Series: """Build a boolean mask for selected parameter bins. Bins are in billions: <10B, 10B-25B, 25B-50B, 50B-100B, 100B-200B, 200B-300B, 300B+ Automatically converts raw counts to billions if values look large. """ if not selected_bins: return pd.Series(True, index=param_series.index) # Ensure numeric s = pd.to_numeric(param_series, errors="coerce") # Heuristic: if median is large, assume raw parameter counts and convert to billions median_val = s.dropna().median() if pd.notna(median_val) and median_val > 1e6: s_b = s / 1e9 else: s_b = s bin_map: dict[str, tuple[float, float | None]] = { "<10B": (0.0, 10.0), "10B-25B": (10.0, 25.0), "25B-50B": (25.0, 50.0), "50B-100B": (50.0, 100.0), "100B+": (100.0, None), } mask = pd.Series(False, index=s_b.index) for label in selected_bins: if label not in bin_map: continue low, high = bin_map[label] if high is None: mask |= s_b >= low else: mask |= (s_b >= low) & (s_b < high) # Drop NaNs from consideration mask &= s_b.notna() return mask def _apply_filters(df: pd.DataFrame, name_filter: str | None, param_bins: list[str] | None) -> pd.DataFrame: if df.empty: return df mask = pd.Series(True, index=df.index) # Name filter (case-insensitive substring match on Model) if name_filter: col = "Model" if "Model" in df.columns else None if col is not None: name_mask = df[col].astype(str).str.contains(name_filter, case=False, na=False) mask &= name_mask # Parameter bins filter if param_bins and "Parameters" in df.columns: bins_mask = _param_bins_mask(df["Parameters"], param_bins) mask &= bins_mask return df[mask] def build_view(json_path: str, name_filter: str = "", param_bins: list[str] | None = None) -> object: df = _prepare_dataframe(json_path) df = df.dropna(subset=["src_clf", "sum_rag", "sum_rag_v2"], axis=0) # Apply filters if provided df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins) # Produce a styled DataFrame (log-scale colors on Parameters, minimal decimals formatting) if isinstance(df, pd.DataFrame) and not df.empty: styler = df.style if "Parameters" in df.columns: styler = styler.apply(_style_parameters, subset=["Parameters"]) # type: ignore styler = styler.format(_format_value_minimal) table_value: object = styler else: # Empty DataFrame fallback table_value = pd.DataFrame() return table_value def build_view_only( json_path: str, name_filter: str = "", param_bins: list[str] | None = None, excluded_tasks: list[str] | None = None, ): """Return only the table without updating the exclude-tasks control. This prevents infinite loops when called from change handlers. """ df = _prepare_dataframe(json_path) # Determine all task-like columns (before exclusion) meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns] tasks_all = [c for c in df.columns if c not in meta_cols_base] excluded_set = set(excluded_tasks or []) # Keep only tasks that actually exist excluded_valid = [t for t in excluded_set if t in tasks_all] included_tasks = [c for c in tasks_all if c not in excluded_set] # Drop rows that are missing values for required tasks (only those that are included) required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in included_tasks] if required_cols: df = df.dropna(subset=required_cols, axis=0) # Apply filters df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins) # Remove excluded task columns from view if excluded_valid: df = df.drop(columns=[c for c in excluded_valid if c in df.columns], errors="ignore") # Recompute avg_score from only included tasks # Determine tasks present in df after exclusion meta_cols_after = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns] current_metric_cols = [c for c in df.columns if c not in meta_cols_after] # Drop existing avg_score before recomputation if "avg_score" in df.columns: df = df.drop(columns=["avg_score"]) # will be re-added below if current_metric_cols: numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in current_metric_cols}) df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2) else: # No metrics left; fill avg_score with NaN to keep schema consistent df["avg_score"] = np.nan # Sort and reorder columns similar to _prepare_dataframe if "avg_score" in df.columns: df = df.sort_values(by="avg_score", ascending=False, na_position="last") preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns] remaining_cols = [c for c in df.columns if c not in preferred_order] if "avg_score" in remaining_cols: remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"] if preferred_order: df = df[preferred_order + remaining_cols] # Ensure separator column exists right after Parameters if "Parameters" in df.columns and "—" not in df.columns: insert_at = df.columns.get_loc("Parameters") + 1 df.insert(insert_at, "—", "") # Style for display if isinstance(df, pd.DataFrame) and not df.empty: styler = df.style if "Parameters" in df.columns: styler = styler.apply(_style_parameters, subset=["Parameters"]) # type: ignore styler = styler.format(_format_value_minimal) table_value: object = styler else: table_value = pd.DataFrame() return table_value def initialize_tasks_choices(json_path: str): """Initialize the task choices for the exclude tasks checkbox. This is separate from the table building to avoid infinite loops. """ df = _prepare_dataframe(json_path) # Determine all task-like columns meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns] tasks_all = [c for c in df.columns if c not in meta_cols_base] # Return update for the exclude tasks checkbox with just the choices, no value change tasks_update = gr.update(choices=tasks_all) return tasks_update def build_view_and_tasks( json_path: str, name_filter: str = "", param_bins: list[str] | None = None, excluded_tasks: list[str] | None = None, ): """Return the table and an update object for the exclude-tasks control. Used only for initial loading to set up the choices. """ table_value = build_view_only(json_path, name_filter, param_bins, excluded_tasks) tasks_update = initialize_tasks_choices(json_path) return table_value, tasks_update # ---------------------- Failure cases handling ---------------------- def load_failure_cases_json(json_path: str) -> Dict[str, List[Dict[str, str]]]: """Load failure cases from JSON file. Returns dict mapping model_id -> list of failure cases. """ path = Path(json_path) if not path.exists() or not path.is_file(): return {} try: with open(path, "r", encoding="utf-8") as f: data = json.load(f) if isinstance(data, dict): return data return {} except Exception: return {} def get_available_models(failure_cases_data: Dict[str, List[Dict[str, str]]]) -> List[str]: """Get list of available models from failure cases data.""" return sorted(failure_cases_data.keys()) if failure_cases_data else [] def render_failure_cases( json_path: str, selected_model: str ) -> str: """Render failure cases for selected model as JSON string.""" if not selected_model: return "{}" failure_cases_data = load_failure_cases_json(json_path) if selected_model not in failure_cases_data: return "{}" cases = failure_cases_data[selected_model] if not cases: return "[]" for case in cases: score = re.search(r"(\d+\.\d+)", case["reasoning"]) if score: case["score"] = float(score.group(1)) # Return formatted JSON string return json.dumps(cases, ensure_ascii=False, indent=2) def initialize_failure_cases_dropdown(json_path: str): """Initialize the model dropdown for failure cases.""" failure_cases_data = load_failure_cases_json(json_path) models = get_available_models(failure_cases_data) if models: return gr.update(choices=models, value=models[0] if models else None) else: return gr.update(choices=[], value=None) def ui() -> gr.Blocks: with gr.Blocks(title="Model Leaderboard") as demo: gr.Markdown(""" ### Polish Legal RAG Leaderboard Explore and compare model performance on Polish legal QA tasks. """) # Fixed internal state for the JSON paths json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON) failure_cases_path_state = gr.State(value=DEFAULT_FAILURE_CASES_JSON) with gr.Tabs(): with gr.Tab("Leaderboard"): gr.Markdown(""" - Use filters to narrow by name and parameter bins. - Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly. - Click column headers to sort; data updates automatically as filters change. """) # Filters with gr.Row(): name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1) param_bins_in = gr.CheckboxGroup( label="Parameter bins", choices=PARAM_BIN_CHOICES, value=[], info="Select one or more bins" ) excluded_tasks_in = gr.CheckboxGroup( label="Exclude tasks", choices=[], value=[], info="Select tasks to hide; all are shown by default", ) # Non-interactive so Pandas Styler is respected; header sorting remains available leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False) demo.load( fn=build_view_and_tasks, inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in], outputs=[leaderboard_out, excluded_tasks_in], ) # Recompute table on filter changes name_filter_in.change( fn=build_view_only, inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in], outputs=[leaderboard_out], ) param_bins_in.change( fn=build_view_only, inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in], outputs=[leaderboard_out], ) excluded_tasks_in.change( fn=build_view_only, inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in], outputs=[leaderboard_out], ) gr.Markdown(""" ### Methodology - **`src_clf`**: Source classification of a fragment. - **`sum_rag`**: RAG-style QA strictly from provided passages. Answers are graded by a judge gpt-4o model on a 0-2 scale; we report F1 score. - **`sum_rag_v2`**: Advanced legal reasoning dataset with multiple question types: - **Contradiction resolution**: Questions about resolving contradictions or ambiguities within legal texts, requiring analysis of conflicting rules or statements - **Legal inference**: Questions testing whether hypothetical situations meet specific legal criteria, requiring identification of legal prerequisites and exceptions """) gr.Markdown(""" ### Notes - GPT-5-nano sometimes fails to answer, responding with an empty string. - GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall. - Llama-3-8B-Instruct family has limited context length (3 - 8k, 3.1 - 16k), so if the passages are too long, the model will not be able to answer (and will thus be given score 0). - Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0. It wasn't trained for `src_clf` task. """) gr.Markdown(""" ### Language and RAG prompt - All tasks, passages and questions are in Polish. The models are instructed to answer in Polish. ```text Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości. Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko. {passages} Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści. Jeżeli odpowiedź nie jest zawarta w , odpowiedz że nie ma odpowiedzi w źródłach. To jest kluczowe, że odpowiedź musi być oparta wyłącznie na . ``` """) with gr.Tab("Failure Cases"): gr.Markdown(""" ### Failure Cases Analysis Explore failure cases by model to understand where models struggle. """) with gr.Row(): model_dropdown = gr.Dropdown( label="Select Model", choices=[], value=None, info="Choose a model to view its failure cases" ) failure_cases_out = gr.Code( label="Failure Cases", language="json", interactive=False, lines=15 ) # Initialize dropdown and load data demo.load( fn=initialize_failure_cases_dropdown, inputs=[failure_cases_path_state], outputs=[model_dropdown], ) # Update failure cases when model selection changes model_dropdown.change( fn=render_failure_cases, inputs=[failure_cases_path_state, model_dropdown], outputs=[failure_cases_out], ) return demo if __name__ == "__main__": app = ui() app.queue().launch(server_name="0.0.0.0", server_port=7860, show_api=False)