import json
import re
from pathlib import Path

import gradio as gr
import numpy as np
import pandas as pd

DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve())
DEFAULT_FAILURE_CASES_JSON = str((Path(__file__).parent / "failure_cases.json").resolve())

# Predefined parameter bins for filtering (in billions)
PARAM_BIN_CHOICES: list[str] = [
    "<10B",
    "10B-25B",
    "25B-50B",
    "50B-100B",
    "100B+",
]


def load_leaderboard_json(json_path: str) -> pd.DataFrame:
    path = Path(json_path)
    if not path.exists() or not path.is_file():
        return pd.DataFrame()
    try:
        with open(path, "r", encoding="utf-8") as f:
            records = json.load(f)
        # records should be a list of dicts; fallback if dict
        if isinstance(records, dict):
            # If wrapped, try to unwrap common keys
            for key in ["data", "records", "items", "leaderboard"]:
                if key in records and isinstance(records[key], list):
                    records = records[key]
                    break
        if not isinstance(records, list):
            return pd.DataFrame()
        return pd.DataFrame.from_records(records)
    except Exception:
        return pd.DataFrame()
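
# A hedged sketch of the leaderboard.json shape this loader expects (the field
# values below are hypothetical; columns such as Quantization and *_time are
# dropped later in _prepare_dataframe):
#
#   [
#     {"Model": "example-model-7b", "Provider": "example", "Parameters": 7.0,
#      "src_clf": 0.81, "sum_rag": 0.74, "sum_rag_v2": 0.69},
#     ...
#   ]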


def _hex_from_rgb(r: float, g: float, b: float) -> str:
    r = max(0, min(255, int(round(r))))
    g = max(0, min(255, int(round(g))))
    b = max(0, min(255, int(round(b))))
    return f"#{r:02x}{g:02x}{b:02x}"


def _bg_color_from_t(t: float) -> str:
    t = max(0.0, min(1.0, float(t)))
    # Green (small) -> Red (big)
    g_start = (34, 197, 94)   # #22c55e
    r_end = (239, 68, 68)     # #ef4444
    r = g_start[0] + t * (r_end[0] - g_start[0])
    g = g_start[1] + t * (r_end[1] - g_start[1])
    b = g_start[2] + t * (r_end[2] - g_start[2])
    return f"background-color: {_hex_from_rgb(r, g, b)}"


def _style_parameters(series: pd.Series) -> list[str]:
    s = pd.to_numeric(series, errors="coerce")
    s_pos = s[s > 0]
    if s_pos.empty:
        return [""] * len(series)
    logs = np.log10(s_pos)
    lmin = float(np.nanmin(logs))
    lmax = float(np.nanmax(logs))
    if not np.isfinite(lmin) or not np.isfinite(lmax):
        return [""] * len(series)

    colors: list[str] = []
    for v in s:
        if pd.isna(v) or v <= 0:
            colors.append("")
        else:
            lv = np.log10(v)
            if lmax == lmin:
                t = 0.0
            else:
                t = (lv - lmin) / (lmax - lmin)
            colors.append(_bg_color_from_t(float(t)))
    return colors
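
# Illustrative: a single positive value degenerates to t = 0.0, i.e. green:
#
#   >>> _style_parameters(pd.Series([7.0]))
#   ['background-color: #22c55e']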


def _format_value_minimal(v) -> str:
    if pd.isna(v):
        return ""
    if isinstance(v, str):
        return v
    if isinstance(v, (int, np.integer)):
        return str(int(v))
    if isinstance(v, (float, np.floating)):
        if abs(v - round(v)) < 1e-9:
            return str(int(round(v)))
    s = f"{float(v):.6f}".rstrip("0").rstrip(".")
    return s
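
# A few illustrative inputs/outputs:
#
#   >>> _format_value_minimal(3.0)
#   '3'
#   >>> _format_value_minimal(0.125)
#   '0.125'
#   >>> _format_value_minimal(float("nan"))
#   ''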



def _prepare_dataframe(json_path: str) -> pd.DataFrame:
    df = load_leaderboard_json(json_path)
    if df.empty:
        return df

    # Remove columns not to be displayed per schema (Quantization, any *_time or time)
    columns_to_exclude = [
        c for c in df.columns
        if c.lower() == "quantization" or c.lower().endswith("_time") or c.lower() == "time"
    ]
    df = df.drop(columns=columns_to_exclude, errors="ignore")

    # Normalize types
    if "Parameters" in df.columns:
        df["Parameters"] = pd.to_numeric(df["Parameters"], errors="coerce")
    if "src_clf" in df.columns:
        df["src_clf"] = pd.to_numeric(df["src_clf"], errors="coerce")

    # Compute avg_score across numeric metric columns (exclude meta)
    meta_cols = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
    metric_candidates = [c for c in df.columns if c not in meta_cols]
    if metric_candidates:
        numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in metric_candidates})
        df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)

    # Sort by avg_score descending by default if present
    if "avg_score" in df.columns:
        df = df.sort_values(by="avg_score", ascending=False, na_position="last")

    # Preferred column order
    preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
    remaining_cols = [c for c in df.columns if c not in preferred_order]
    # Ensure avg_score is first among metric columns
    if "avg_score" in remaining_cols:
        remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
    if preferred_order:
        df = df[preferred_order + remaining_cols]

    # Insert a visual separator column after Parameters to split meta from scores
    if "Parameters" in df.columns:
        sep_col_name = "—"
        insert_at = df.columns.get_loc("Parameters") + 1
        df.insert(insert_at, sep_col_name, "")

    return df


def _param_bins_mask(param_series: pd.Series, selected_bins: list[str] | None) -> pd.Series:
    """Build a boolean mask for selected parameter bins.

    Bins are in billions: <10B, 10B-25B, 25B-50B, 50B-100B, 100B+.
    Automatically converts raw counts to billions if values look large.
    """
    if not selected_bins:
        return pd.Series(True, index=param_series.index)

    # Ensure numeric
    s = pd.to_numeric(param_series, errors="coerce")

    # Heuristic: if median is large, assume raw parameter counts and convert to billions
    median_val = s.dropna().median()
    if pd.notna(median_val) and median_val > 1e6:
        s_b = s / 1e9
    else:
        s_b = s

    bin_map: dict[str, tuple[float, float | None]] = {
        "<10B": (0.0, 10.0),
        "10B-25B": (10.0, 25.0),
        "25B-50B": (25.0, 50.0),
        "50B-100B": (50.0, 100.0),
        "100B+": (100.0, None),
    }

    mask = pd.Series(False, index=s_b.index)
    for label in selected_bins:
        if label not in bin_map:
            continue
        low, high = bin_map[label]
        if high is None:
            mask |= s_b >= low
        else:
            mask |= (s_b >= low) & (s_b < high)
    # Drop NaNs from consideration
    mask &= s_b.notna()
    return mask
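
# A minimal sketch of the bin mask (hypothetical values; the series median is
# small here, so values are treated as billions and not rescaled):
#
#   >>> _param_bins_mask(pd.Series([8.0, 30.0, 120.0]), ["<10B", "100B+"]).tolist()
#   [True, False, True]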


def _apply_filters(df: pd.DataFrame, name_filter: str | None, param_bins: list[str] | None) -> pd.DataFrame:
    if df.empty:
        return df

    mask = pd.Series(True, index=df.index)

    # Name filter (case-insensitive substring match on Model)
    if name_filter:
        col = "Model" if "Model" in df.columns else None
        if col is not None:
            name_mask = df[col].astype(str).str.contains(name_filter, case=False, na=False)
            mask &= name_mask

    # Parameter bins filter
    if param_bins and "Parameters" in df.columns:
        bins_mask = _param_bins_mask(df["Parameters"], param_bins)
        mask &= bins_mask

    return df[mask]


def build_view(json_path: str, name_filter: str = "", param_bins: list[str] | None = None) -> object:
    df = _prepare_dataframe(json_path)

    # Drop rows missing any of the required task scores; guard against absent columns
    required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in df.columns]
    if required_cols:
        df = df.dropna(subset=required_cols)

    # Apply filters if provided
    df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)

    # Produce a styled DataFrame (log-scale colors on Parameters, minimal decimals formatting)
    if isinstance(df, pd.DataFrame) and not df.empty:
        styler = df.style
        if "Parameters" in df.columns:
            styler = styler.apply(_style_parameters, subset=["Parameters"])  # type: ignore
        styler = styler.format(_format_value_minimal)
        table_value: object = styler
    else:
        # Empty DataFrame fallback
        table_value = pd.DataFrame()

    return table_value


def build_view_only(
    json_path: str,
    name_filter: str = "",
    param_bins: list[str] | None = None,
    excluded_tasks: list[str] | None = None,
):
    """Return only the table without updating the exclude-tasks control.
    
    This prevents infinite loops when called from change handlers.
    """
    df = _prepare_dataframe(json_path)

    # Determine all task-like columns (before exclusion)
    meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
    tasks_all = [c for c in df.columns if c not in meta_cols_base]

    excluded_set = set(excluded_tasks or [])
    # Keep only excluded tasks that actually exist in the data
    excluded_valid = [t for t in excluded_set if t in tasks_all]
    included_tasks = [c for c in tasks_all if c not in excluded_set]

    # Drop rows that are missing values for required tasks (only those that are included)
    required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in included_tasks]
    if required_cols:
        df = df.dropna(subset=required_cols, axis=0)

    # Apply filters
    df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)

    # Remove excluded task columns from view
    if excluded_valid:
        df = df.drop(columns=[c for c in excluded_valid if c in df.columns], errors="ignore")

    # Recompute avg_score from only included tasks
    # Determine tasks present in df after exclusion
    meta_cols_after = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
    current_metric_cols = [c for c in df.columns if c not in meta_cols_after]

    # Drop existing avg_score before recomputation
    if "avg_score" in df.columns:
        df = df.drop(columns=["avg_score"])  # will be re-added below

    if current_metric_cols:
        numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in current_metric_cols})
        df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)
    else:
        # No metrics left; fill avg_score with NaN to keep schema consistent
        df["avg_score"] = np.nan

    # Sort and reorder columns similar to _prepare_dataframe
    if "avg_score" in df.columns:
        df = df.sort_values(by="avg_score", ascending=False, na_position="last")

    preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
    remaining_cols = [c for c in df.columns if c not in preferred_order]
    if "avg_score" in remaining_cols:
        remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
    if preferred_order:
        df = df[preferred_order + remaining_cols]

    # Ensure separator column exists right after Parameters
    if "Parameters" in df.columns and "—" not in df.columns:
        insert_at = df.columns.get_loc("Parameters") + 1
        df.insert(insert_at, "—", "")

    # Style for display
    if isinstance(df, pd.DataFrame) and not df.empty:
        styler = df.style
        if "Parameters" in df.columns:
            styler = styler.apply(_style_parameters, subset=["Parameters"])  # type: ignore
        styler = styler.format(_format_value_minimal)
        table_value: object = styler
    else:
        table_value = pd.DataFrame()

    return table_value


def initialize_tasks_choices(json_path: str):
    """Initialize the task choices for the exclude tasks checkbox.
    
    This is separate from the table building to avoid infinite loops.
    """
    df = _prepare_dataframe(json_path)
    
    # Determine all task-like columns
    meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
    tasks_all = [c for c in df.columns if c not in meta_cols_base]
    
    # Return update for the exclude tasks checkbox with just the choices, no value change
    tasks_update = gr.update(choices=tasks_all)
    
    return tasks_update


def build_view_and_tasks(
    json_path: str,
    name_filter: str = "",
    param_bins: list[str] | None = None,
    excluded_tasks: list[str] | None = None,
):
    """Return the table and an update object for the exclude-tasks control.
    
    Used only for initial loading to set up the choices.
    """
    table_value = build_view_only(json_path, name_filter, param_bins, excluded_tasks)
    tasks_update = initialize_tasks_choices(json_path)
    
    return table_value, tasks_update


# ---------------------- Failure cases handling ----------------------

def load_failure_cases_json(json_path: str) -> dict[str, list[dict[str, str]]]:
    """Load failure cases from JSON file.
    
    Returns dict mapping model_id -> list of failure cases.
    """
    path = Path(json_path)
    if not path.exists() or not path.is_file():
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, dict):
            return data
        return {}
    except Exception:
        return {}
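
# A hedged sketch of the failure_cases.json shape (keys other than "reasoning"
# are hypothetical; render_failure_cases below only parses "reasoning"):
#
#   {
#     "example-model-7b": [
#       {"question": "...", "answer": "...",
#        "reasoning": "Judge score 0.50: answer not grounded in the sources."}
#     ]
#   }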


def get_available_models(failure_cases_data: dict[str, list[dict[str, str]]]) -> list[str]:
    """Get list of available models from failure cases data."""
    return sorted(failure_cases_data.keys()) if failure_cases_data else []


def render_failure_cases(json_path: str, selected_model: str) -> str:
    """Render failure cases for selected model as JSON string."""
    if not selected_model:
        return "{}"
    
    failure_cases_data = load_failure_cases_json(json_path)
    
    if selected_model not in failure_cases_data:
        return "{}"
    
    cases = failure_cases_data[selected_model]
    if not cases:
        return "[]"
    
    # Extract a numeric score from the judge's reasoning text, if present
    for case in cases:
        score = re.search(r"(\d+\.\d+)", case.get("reasoning", ""))
        if score:
            case["score"] = float(score.group(1))
    
    # Return formatted JSON string
    return json.dumps(cases, ensure_ascii=False, indent=2)


def initialize_failure_cases_dropdown(json_path: str):
    """Initialize the model dropdown for failure cases."""
    failure_cases_data = load_failure_cases_json(json_path)
    models = get_available_models(failure_cases_data)
    
    if models:
        return gr.update(choices=models, value=models[0] if models else None)
    else:
        return gr.update(choices=[], value=None)


def ui() -> gr.Blocks:
    with gr.Blocks(title="Model Leaderboard") as demo:
        gr.Markdown("""
        ### Polish Legal RAG Leaderboard

        Explore and compare model performance on Polish legal QA tasks.
        """)

        # Fixed internal state for the JSON paths
        json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)
        failure_cases_path_state = gr.State(value=DEFAULT_FAILURE_CASES_JSON)

        with gr.Tabs():
            with gr.Tab("Leaderboard"):
                gr.Markdown("""
                - Use filters to narrow by name and parameter bins.
                - Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly.
                - Click column headers to sort; data updates automatically as filters change.
                """)

                # Filters
                with gr.Row():
                    name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1)
                    param_bins_in = gr.CheckboxGroup(
                        label="Parameter bins",
                        choices=PARAM_BIN_CHOICES,
                        value=[],
                        info="Select one or more bins"
                    )
                    excluded_tasks_in = gr.CheckboxGroup(
                        label="Exclude tasks",
                        choices=[],
                        value=[],
                        info="Select tasks to hide; all are shown by default",
                    )

                # Non-interactive so Pandas Styler is respected; header sorting remains available
                leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False)

                demo.load(
                    fn=build_view_and_tasks,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out, excluded_tasks_in],
                )

                # Recompute table on filter changes
                name_filter_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
                param_bins_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
                excluded_tasks_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )

                gr.Markdown("""
                ### Methodology
                - **`src_clf`**: Source classification of a fragment.
                - **`sum_rag`**: RAG-style QA strictly from provided passages. Answers are graded by a gpt-4o judge model on a 0-2 scale; we report the F1 score.
                - **`sum_rag_v2`**: Advanced legal reasoning dataset with multiple question types:
                  - **Contradiction resolution**: Questions about resolving contradictions or ambiguities within legal texts, requiring analysis of conflicting rules or statements
                  - **Legal inference**: Questions testing whether hypothetical situations meet specific legal criteria, requiring identification of legal prerequisites and exceptions
                """)
                gr.Markdown("""
                ### Notes
                - GPT-5-nano sometimes fails to answer, responding with an empty string.
                - GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall.
                - The Llama-3-8B-Instruct family has limited context length (Llama 3: 8k, Llama 3.1: 16k), so if the passages are too long, the model cannot answer and is given a score of 0.
                - The Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0. It was not trained for the `src_clf` task.
                """)
                gr.Markdown("""
                ### Language and RAG prompt
                - All tasks, passages and questions are in Polish. The models are instructed to answer in Polish.

                ```text
                Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości.
                Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko.
                <relevant_info>
                {passages}
                </relevant_info>

                Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści.
                Jeżeli odpowiedź nie jest zawarta w <relevant_info>, odpowiedz że nie ma odpowiedzi w źródłach.
                To jest kluczowe, że odpowiedź musi być oparta wyłącznie na <relevant_info>.
                ```

                (In English, the prompt instructs the model to answer only in Polish, strictly on the basis of the sources in `<relevant_info>`, to give all relevant information with supporting arguments and quotations, never to rely on its own knowledge, and to state briefly that the sources contain no answer when they do not.)
                """)

            with gr.Tab("Failure Cases"):
                gr.Markdown("""
                ### Failure Cases Analysis
                
                Explore failure cases by model to understand where models struggle.
                """)

                with gr.Row():
                    model_dropdown = gr.Dropdown(
                        label="Select Model",
                        choices=[],
                        value=None,
                        info="Choose a model to view its failure cases"
                    )

                failure_cases_out = gr.Code(
                    label="Failure Cases", 
                    language="json",
                    interactive=False,
                    lines=15
                )

                # Initialize dropdown and load data
                demo.load(
                    fn=initialize_failure_cases_dropdown,
                    inputs=[failure_cases_path_state],
                    outputs=[model_dropdown],
                )

                # Update failure cases when model selection changes
                model_dropdown.change(
                    fn=render_failure_cases,
                    inputs=[failure_cases_path_state, model_dropdown],
                    outputs=[failure_cases_out],
                )

    return demo


if __name__ == "__main__":
    app = ui()
    app.queue().launch(server_name="0.0.0.0", server_port=7860, show_api=False)