Spaces:

gaius-lex
/

pl-legal-rag

Running

App Files Files Community

wwydmanski committed on about 1 month ago

Commit

d42c7ce

verified ·

1 Parent(s): cf4603d

Create app.py

Browse files

Files changed (1) hide show

app.py +310 -0

app.py ADDED Viewed

	@@ -0,0 +1,310 @@

import json
import logging
from pathlib import Path

import gradio as gr
import numpy as np
import pandas as pd
# Absolute path of the leaderboard data file located next to this module.
DEFAULT_OUTPUT_JSON = str(Path(__file__).parent.joinpath("leaderboard.json").resolve())

# Labels of the parameter-count buckets (in billions) offered by the filter UI.
PARAM_BIN_CHOICES: list[str] = ["<10B", "10B-25B", "25B-50B", "50B-100B", "100B+"]
def load_leaderboard_json(json_path: str) -> pd.DataFrame:
    """Load leaderboard records from a JSON file into a DataFrame.

    The file is expected to hold a list of record dicts. A dict that wraps
    such a list under one of the keys ``data``, ``records``, ``items`` or
    ``leaderboard`` (checked in that order) is unwrapped automatically.

    Loading is best-effort: if the path is not a regular file, the file
    cannot be read or parsed, or the payload has an unexpected shape, an
    empty DataFrame is returned. Read/parse/conversion failures are logged
    as warnings instead of being swallowed silently.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        One row per record, or an empty DataFrame on any failure.
    """
    log = logging.getLogger(__name__)
    path = Path(json_path)
    # is_file() is already False for non-existent paths, so no exists() check.
    if not path.is_file():
        return pd.DataFrame()

    try:
        with open(path, "r", encoding="utf-8") as f:
            records = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        log.warning("Could not read leaderboard JSON %s: %s", path, exc)
        return pd.DataFrame()

    if isinstance(records, dict):
        # The list may be wrapped; take the first well-known key holding a list.
        for key in ("data", "records", "items", "leaderboard"):
            if isinstance(records.get(key), list):
                records = records[key]
                break
    if not isinstance(records, list):
        return pd.DataFrame()

    try:
        return pd.DataFrame.from_records(records)
    except Exception as exc:
        # Deliberately broad: the caller treats any unusable payload as
        # "no data", but the reason is now recorded.
        log.warning("Leaderboard JSON %s has an unexpected record layout: %s", path, exc)
        return pd.DataFrame()
+def _hex_from_rgb(r: float, g: float, b: float) -> str:
+    r = max(0, min(255, int(round(r))))
+    g = max(0, min(255, int(round(g))))
+    b = max(0, min(255, int(round(b))))
+    return f"#{r:02x}{g:02x}{b:02x}"
def _bg_color_from_t(t: float) -> str:
    """Return a CSS ``background-color`` declaration for a position ``t``.

    ``t`` is clamped to [0, 1] and used to interpolate linearly, channel by
    channel, from green (#22c55e at 0) to red (#ef4444 at 1).
    """
    fraction = max(0.0, min(1.0, float(t)))
    green = (34, 197, 94)   # #22c55e, used for the smallest values
    red = (239, 68, 68)     # #ef4444, used for the largest values
    r, g, b = (low + fraction * (high - low) for low, high in zip(green, red))
    return f"background-color: {_hex_from_rgb(r, g, b)}"
def _style_parameters(series: pd.Series) -> list[str]:
    """Compute per-cell CSS for a parameter-count column on a log scale.

    Values are coerced to numbers. Positive values are placed between the
    column's smallest and largest positive value on a log10 scale and mapped
    to a background color; missing or non-positive values get no style.

    Returns:
        One CSS string per element of ``series``; empty strings everywhere
        when there is no positive value or the log range is not finite.
    """
    numeric = pd.to_numeric(series, errors="coerce")
    unstyled = [""] * len(series)

    positive = numeric[numeric > 0]
    if positive.empty:
        return unstyled

    log_values = np.log10(positive)
    log_min = float(np.nanmin(log_values))
    log_max = float(np.nanmax(log_values))
    if not (np.isfinite(log_min) and np.isfinite(log_max)):
        return unstyled

    def _css_for(value) -> str:
        # Cells without a usable positive number stay unstyled.
        if pd.isna(value) or value <= 0:
            return ""
        if log_max == log_min:
            # Degenerate range: every positive value shares the low-end color.
            position = 0.0
        else:
            position = (np.log10(value) - log_min) / (log_max - log_min)
        return _bg_color_from_t(float(position))

    return [_css_for(value) for value in numeric]
+def _format_value_minimal(v) -> str:
+    if pd.isna(v):
+        return ""
+    if isinstance(v, str):
+        return v
+    if isinstance(v, (int, np.integer)):
+        return str(int(v))
+    if isinstance(v, (float, np.floating)):
+        if abs(v - round(v)) < 1e-9:
+            return str(int(round(v)))
+        s = f"{float(v):.6f}".rstrip("0").rstrip(".")
+        return s
+    try:
+        return str(v)
+    except Exception:
+        return ""
def _prepare_dataframe(json_path: str) -> pd.DataFrame:
    """Load the leaderboard and shape it for display.

    Steps, in order:
      1. load the records via ``load_leaderboard_json`` (an empty result is
         returned untouched);
      2. drop the ``Quantization`` column and any ``time`` / ``*_time``
         column (case-insensitive);
      3. coerce ``Parameters`` and ``src_clf`` to numbers;
      4. add ``avg_score``: the row-wise mean (NaNs skipped, rounded to 2
         decimals) of every non-meta column coerced to numeric;
      5. sort by ``avg_score`` descending, NaNs last;
      6. order columns as Model, Provider, Parameters, avg_score, then the
         rest (only when at least one meta column exists);
      7. insert an empty "—" separator column right after ``Parameters``.
    """
    frame = load_leaderboard_json(json_path)
    if frame.empty:
        return frame

    def _is_hidden(column) -> bool:
        lowered = column.lower()
        return lowered in ("quantization", "time") or lowered.endswith("_time")

    hidden = [column for column in frame.columns if _is_hidden(column)]
    frame = frame.drop(columns=hidden, errors="ignore")

    # Normalize the columns that must be numeric for styling and filtering.
    for numeric_col in ("Parameters", "src_clf"):
        if numeric_col in frame.columns:
            frame[numeric_col] = pd.to_numeric(frame[numeric_col], errors="coerce")

    # Everything that is not a meta column is treated as a metric candidate.
    meta = [column for column in ("Model", "Provider", "Parameters") if column in frame.columns]
    metrics = [column for column in frame.columns if column not in meta]
    if metrics:
        coerced = pd.DataFrame(
            {column: pd.to_numeric(frame[column], errors="coerce") for column in metrics}
        )
        frame["avg_score"] = coerced.mean(axis=1, skipna=True).round(2)

    if "avg_score" in frame.columns:
        frame = frame.sort_values(by="avg_score", ascending=False, na_position="last")

    # Meta columns first, then avg_score, then the remaining columns.
    others = [column for column in frame.columns if column not in meta]
    if "avg_score" in others:
        others = ["avg_score"] + [column for column in others if column != "avg_score"]
    if meta:
        frame = frame[meta + others]

    # Visual separator between the meta block and the score block.
    if "Parameters" in frame.columns:
        frame.insert(frame.columns.get_loc("Parameters") + 1, "—", "")
    return frame
+def _param_bins_mask(param_series: pd.Series, selected_bins: list[str] | None) -> pd.Series:
+    """Build a boolean mask for selected parameter bins.
+    Bins are in billions: <10B, 10B-25B, 25B-50B, 50B-100B, 100B-200B, 200B-300B, 300B+
+    Automatically converts raw counts to billions if values look large.
+    """
+    if not selected_bins:
+        return pd.Series(True, index=param_series.index)
+    # Ensure numeric
+    s = pd.to_numeric(param_series, errors="coerce")
+    # Heuristic: if median is large, assume raw parameter counts and convert to billions
+    median_val = s.dropna().median()
+    if pd.notna(median_val) and median_val > 1e6:
+        s_b = s / 1e9
+    else:
+        s_b = s
+    bin_map: dict[str, tuple[float, float | None]] = {
+        "<10B": (0.0, 10.0),
+        "10B-25B": (10.0, 25.0),
+        "25B-50B": (25.0, 50.0),
+        "50B-100B": (50.0, 100.0),
+        "100B+": (100.0, None),
+    }
+    mask = pd.Series(False, index=s_b.index)
+    for label in selected_bins:
+        if label not in bin_map:
+            continue
+        low, high = bin_map[label]
+        if high is None:
+            mask |= s_b >= low
+        else:
+            mask |= (s_b >= low) & (s_b < high)
+    # Drop NaNs from consideration
+    mask &= s_b.notna()
+    return mask
+def _apply_filters(df: pd.DataFrame, name_filter: str | None, param_bins: list[str] | None) -> pd.DataFrame:
+    if df.empty:
+        return df
+    mask = pd.Series(True, index=df.index)
+    # Name filter (case-insensitive substring match on Model)
+    if name_filter:
+        col = "Model" if "Model" in df.columns else None
+        if col is not None:
+            name_mask = df[col].astype(str).str.contains(name_filter, case=False, na=False)
+            mask &= name_mask
+    # Parameter bins filter
+    if param_bins and "Parameters" in df.columns:
+        bins_mask = _param_bins_mask(df["Parameters"], param_bins)
+        mask &= bins_mask
+    return df[mask]
def build_view(json_path: str, name_filter: str = "", param_bins: list[str] | None = None) -> object:
    """Build the value shown in the leaderboard table.

    The leaderboard is loaded and prepared, rows lacking any of the
    ``src_clf``, ``sum_rag`` or ``sum_rag_v2`` scores are dropped, the UI
    filters are applied and the result is wrapped in a pandas Styler
    (log-scale background colors on ``Parameters``, minimal number
    formatting).

    Args:
        json_path: Path of the leaderboard JSON file.
        name_filter: Substring filter on the model name ("" disables it).
        param_bins: Selected parameter-bin labels (None/empty disables it).

    Returns:
        A pandas Styler when there are rows to show, otherwise an empty
        DataFrame.
    """
    df = _prepare_dataframe(json_path)

    # dropna(subset=...) raises KeyError for columns that are not present,
    # which is the case for an empty (failed) load or a leaderboard lacking
    # one of the metrics, so restrict the check to the columns that exist.
    required_scores = [c for c in ("src_clf", "sum_rag", "sum_rag_v2") if c in df.columns]
    if required_scores:
        df = df.dropna(subset=required_scores, axis=0)

    df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)
    if df.empty:
        # Nothing to style; hand the table component an empty frame.
        return pd.DataFrame()

    styler = df.style
    if "Parameters" in df.columns:
        styler = styler.apply(_style_parameters, subset=["Parameters"])  # type: ignore
    return styler.format(_format_value_minimal)
def ui() -> gr.Blocks:
    """Assemble the Gradio Blocks app for the leaderboard.

    Layout: an intro text, a row with the name and parameter-bin filters, the
    read-only leaderboard table, then the methodology, notes and prompt
    sections. The table is rebuilt with ``build_view`` on page load and
    whenever either filter changes.
    """
    with gr.Blocks(title="Model Leaderboard") as demo:
        gr.Markdown("""
        ### Leaderboard
        Displays scores from a prepared JSON leaderboard file. Columns are read dynamically from the JSON.
        """)

        # The JSON path is fixed internal state; it is not exposed to users.
        json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)

        with gr.Row():
            name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1)
            param_bins_in = gr.CheckboxGroup(
                label="Parameter bins",
                choices=PARAM_BIN_CHOICES,
                value=[],
                info="Select one or more bins"
            )

        # Non-interactive so Pandas Styler is respected; header sorting remains available
        leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False)

        # Same refresh wiring for the initial load and for each filter change.
        for register in (demo.load, name_filter_in.change, param_bins_in.change):
            register(
                fn=build_view,
                inputs=[json_path_state, name_filter_in, param_bins_in],
                outputs=[leaderboard_out],
            )

        gr.Markdown("""
        ### Methodology
        - **`src_clf`**: Source classification of a fragment.
        - **`sum_rag`**: RAG-style QA strictly from provided passages. Answers are graded by a judge gpt-4o model on a 0-2 scale; we report F1 score.
        - **`sum_rag_v2`**: Like `sum_rag` but harder - with longer, augmented contexts and strict deranged negatives built. Same generation and 0-2 judging; we report F1 score.
        """)
        gr.Markdown("""
        ### Notes
        - GPT-5-nano sometimes fails to answer, responding with an empty string.
        - GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall.
        - Llama-3-8B-Instruct family has limited context length (3 - 8k, 3.1 - 16k), so if the passages are too long, the model will not be able to answer (and will thus be given score 0).
        - Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0.
        """)
        gr.Markdown("""
        ### Language and RAG prompt
        - All tasks, passages and questions are in Polish. The models are instructed to answer in Polish.
        ```text
        Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości.
        Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko.
        <relevant_info>
        {passages}
        </relevant_info>
        Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści.
        Jeżeli odpowiedź nie jest zawarta w <relevant_info>, odpowiedz że nie ma odpowiedzi w źródłach.
        To jest kluczowe, że odpowiedź musi być oparta wyłącznie na <relevant_info>.
        ```
        """)
    return demo
if __name__ == "__main__":
    # Script entry point: build the UI, enable the request queue, and serve
    # on all network interfaces at port 7860 with the API docs page hidden.
    app = ui()
    app.queue()
    app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)