wwydmanski committed on
Commit d42c7ce · verified · 1 Parent(s): cf4603d

Create app.py

Files changed (1)
  1. app.py +310 -0
app.py ADDED
@@ -0,0 +1,310 @@
+ import json
+ from pathlib import Path
+
+ import pandas as pd
+ import numpy as np
+ import gradio as gr
+
+ DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve())
+
+ # Predefined parameter bins for filtering (in billions)
+ PARAM_BIN_CHOICES: list[str] = [
+     "<10B",
+     "10B-25B",
+     "25B-50B",
+     "50B-100B",
+     "100B+",
+ ]
+
+
+ def load_leaderboard_json(json_path: str) -> pd.DataFrame:
+     path = Path(json_path)
+     if not path.exists() or not path.is_file():
+         return pd.DataFrame()
+     try:
+         with open(path, "r", encoding="utf-8") as f:
+             records = json.load(f)
+         # records should be a list of dicts; fallback if dict
+         if isinstance(records, dict):
+             # If wrapped, try to unwrap common keys
+             for key in ["data", "records", "items", "leaderboard"]:
+                 if key in records and isinstance(records[key], list):
+                     records = records[key]
+                     break
+         if not isinstance(records, list):
+             return pd.DataFrame()
+         return pd.DataFrame.from_records(records)
+     except Exception:
+         return pd.DataFrame()
+
+
+ def _hex_from_rgb(r: float, g: float, b: float) -> str:
+     r = max(0, min(255, int(round(r))))
+     g = max(0, min(255, int(round(g))))
+     b = max(0, min(255, int(round(b))))
+     return f"#{r:02x}{g:02x}{b:02x}"
+
+
+ def _bg_color_from_t(t: float) -> str:
+     t = max(0.0, min(1.0, float(t)))
+     # Green (small) -> Red (big)
+     g_start = (34, 197, 94)  # #22c55e
+     r_end = (239, 68, 68)  # #ef4444
+     r = g_start[0] + t * (r_end[0] - g_start[0])
+     g = g_start[1] + t * (r_end[1] - g_start[1])
+     b = g_start[2] + t * (r_end[2] - g_start[2])
+     return f"background-color: {_hex_from_rgb(r, g, b)}"
+
+
+ def _style_parameters(series: pd.Series) -> list[str]:
+     s = pd.to_numeric(series, errors="coerce")
+     s_pos = s[s > 0]
+     if s_pos.empty:
+         return [""] * len(series)
+     logs = np.log10(s_pos)
+     lmin = float(np.nanmin(logs))
+     lmax = float(np.nanmax(logs))
+     if not np.isfinite(lmin) or not np.isfinite(lmax):
+         return [""] * len(series)
+
+     colors: list[str] = []
+     for v in s:
+         if pd.isna(v) or v <= 0:
+             colors.append("")
+         else:
+             lv = np.log10(v)
+             if lmax == lmin:
+                 t = 0.0
+             else:
+                 t = (lv - lmin) / (lmax - lmin)
+             colors.append(_bg_color_from_t(float(t)))
+     return colors
+
+
+ def _format_value_minimal(v) -> str:
+     if pd.isna(v):
+         return ""
+     if isinstance(v, str):
+         return v
+     if isinstance(v, (int, np.integer)):
+         return str(int(v))
+     if isinstance(v, (float, np.floating)):
+         if abs(v - round(v)) < 1e-9:
+             return str(int(round(v)))
+         s = f"{float(v):.6f}".rstrip("0").rstrip(".")
+         return s
+     try:
+         return str(v)
+     except Exception:
+         return ""
+
+
+ def _prepare_dataframe(json_path: str) -> pd.DataFrame:
+     df = load_leaderboard_json(json_path)
+     if df.empty:
+         return df
+
+     # Remove columns not to be displayed per schema (Quantization, any *_time or time)
+     columns_to_exclude = [
+         c for c in df.columns
+         if c.lower() == "quantization" or c.lower().endswith("_time") or c.lower() == "time"
+     ]
+     df = df.drop(columns=columns_to_exclude, errors="ignore")
+
+     # Normalize types
+     if "Parameters" in df.columns:
+         df["Parameters"] = pd.to_numeric(df["Parameters"], errors="coerce")
+     if "src_clf" in df.columns:
+         df["src_clf"] = pd.to_numeric(df["src_clf"], errors="coerce")
+
+     # Compute avg_score across numeric metric columns (exclude meta)
+     meta_cols = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
+     metric_candidates = [c for c in df.columns if c not in meta_cols]
+     if metric_candidates:
+         numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in metric_candidates})
+         df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)
+
+     # Sort by avg_score descending by default if present
+     if "avg_score" in df.columns:
+         df = df.sort_values(by="avg_score", ascending=False, na_position="last")
+
+     # Preferred column order
+     preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
+     remaining_cols = [c for c in df.columns if c not in preferred_order]
+     # Ensure avg_score is first among metric columns
+     if "avg_score" in remaining_cols:
+         remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
+     if preferred_order:
+         df = df[preferred_order + remaining_cols]
+
+     # Insert a visual separator column after Parameters to split meta from scores
+     if "Parameters" in df.columns:
+         sep_col_name = "—"
+         insert_at = df.columns.get_loc("Parameters") + 1
+         df.insert(insert_at, sep_col_name, "")
+
+     return df
+
+
+ def _param_bins_mask(param_series: pd.Series, selected_bins: list[str] | None) -> pd.Series:
+     """Build a boolean mask for selected parameter bins.
+
+     Bins are in billions: <10B, 10B-25B, 25B-50B, 50B-100B, 100B+.
+     Automatically converts raw counts to billions if values look large.
+     """
+     if not selected_bins:
+         return pd.Series(True, index=param_series.index)
+
+     # Ensure numeric
+     s = pd.to_numeric(param_series, errors="coerce")
+
+     # Heuristic: if the median is large, assume raw parameter counts and convert to billions
+     median_val = s.dropna().median()
+     if pd.notna(median_val) and median_val > 1e6:
+         s_b = s / 1e9
+     else:
+         s_b = s
+
+     bin_map: dict[str, tuple[float, float | None]] = {
+         "<10B": (0.0, 10.0),
+         "10B-25B": (10.0, 25.0),
+         "25B-50B": (25.0, 50.0),
+         "50B-100B": (50.0, 100.0),
+         "100B+": (100.0, None),
+     }
+
+     mask = pd.Series(False, index=s_b.index)
+     for label in selected_bins:
+         if label not in bin_map:
+             continue
+         low, high = bin_map[label]
+         if high is None:
+             mask |= s_b >= low
+         else:
+             mask |= (s_b >= low) & (s_b < high)
+     # Drop NaNs from consideration
+     mask &= s_b.notna()
+     return mask
+
+
+ def _apply_filters(df: pd.DataFrame, name_filter: str | None, param_bins: list[str] | None) -> pd.DataFrame:
+     if df.empty:
+         return df
+
+     mask = pd.Series(True, index=df.index)
+
+     # Name filter (case-insensitive, plain substring match on Model)
+     if name_filter:
+         col = "Model" if "Model" in df.columns else None
+         if col is not None:
+             name_mask = df[col].astype(str).str.contains(name_filter, case=False, na=False, regex=False)
+             mask &= name_mask
+
+     # Parameter bins filter
+     if param_bins and "Parameters" in df.columns:
+         bins_mask = _param_bins_mask(df["Parameters"], param_bins)
+         mask &= bins_mask
+
+     return df[mask]
+
+
+ def build_view(json_path: str, name_filter: str = "", param_bins: list[str] | None = None) -> object:
+     df = _prepare_dataframe(json_path)
+     # Keep only rows that have all three core metrics (columns guarded in case any is absent)
+     df = df.dropna(subset=[c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in df.columns], axis=0)
+
+     # Apply filters if provided
+     df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)
+
+     # Produce a styled DataFrame (log-scale colors on Parameters, minimal decimal formatting)
+     if isinstance(df, pd.DataFrame) and not df.empty:
+         styler = df.style
+         if "Parameters" in df.columns:
+             styler = styler.apply(_style_parameters, subset=["Parameters"])  # type: ignore
+         styler = styler.format(_format_value_minimal)
+         table_value: object = styler
+     else:
+         # Empty DataFrame fallback
+         table_value = pd.DataFrame()
+
+
+     return table_value
+
+
+ def ui() -> gr.Blocks:
+     with gr.Blocks(title="Model Leaderboard") as demo:
+         gr.Markdown("""
+ ### Leaderboard
+ Displays scores from a prepared JSON leaderboard file. Columns are read dynamically from the JSON.
+ """)
+
+         # Fixed internal state for the JSON path; users cannot change this
+         json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)
+
+         # Filters
+         with gr.Row():
+             name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1)
+             param_bins_in = gr.CheckboxGroup(
+                 label="Parameter bins",
+                 choices=PARAM_BIN_CHOICES,
+                 value=[],
+                 info="Select one or more bins"
+             )
+
+         # Non-interactive so Pandas Styler is respected; header sorting remains available
+         leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False)
+
+         demo.load(
+             fn=build_view,
+             inputs=[json_path_state, name_filter_in, param_bins_in],
+             outputs=[leaderboard_out],
+         )
+
+         # Recompute table on filter changes
+         name_filter_in.change(
+             fn=build_view,
+             inputs=[json_path_state, name_filter_in, param_bins_in],
+             outputs=[leaderboard_out],
+         )
+         param_bins_in.change(
+             fn=build_view,
+             inputs=[json_path_state, name_filter_in, param_bins_in],
+             outputs=[leaderboard_out],
+         )
+
+         gr.Markdown("""
+ ### Methodology
+ - **`src_clf`**: Source classification of a fragment.
+ - **`sum_rag`**: RAG-style QA strictly from the provided passages. Answers are graded by a GPT-4o judge model on a 0-2 scale; we report the F1 score.
+ - **`sum_rag_v2`**: Like `sum_rag` but harder, with longer, augmented contexts and strict deranged negatives. Same generation setup and 0-2 judging; we report the F1 score.
+ """)
+         gr.Markdown("""
+ ### Notes
+ - GPT-5-nano sometimes fails to answer, responding with an empty string.
+ - GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall.
+ - The Llama-3-8B-Instruct family has a limited context length (Llama 3: 8k, Llama 3.1: 16k), so if the passages are too long the model cannot answer (and is thus given a score of 0).
+ - The Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0.
+ """)
+         gr.Markdown("""
+ ### Language and RAG prompt
+ - All tasks, passages and questions are in Polish. The models are instructed to answer in Polish.
+
+ ```text
+ Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości.
+ Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko.
+ <relevant_info>
+ {passages}
+ </relevant_info>
+
+ Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści.
+ Jeżeli odpowiedź nie jest zawarta w <relevant_info>, odpowiedz że nie ma odpowiedzi w źródłach.
+ To jest kluczowe, że odpowiedź musi być oparta wyłącznie na <relevant_info>.
+ ```
+ """)
+
+     return demo
+
+
+ if __name__ == "__main__":
+     app = ui()
+     app.queue().launch(server_name="0.0.0.0", server_port=7860, show_api=False)
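For context, here is a minimal sketch of the `leaderboard.json` file this app appears to expect: a flat list of records whose keys become columns. The field names are inferred from the columns the code references (`Model`, `Provider`, `Parameters`, `src_clf`, `sum_rag`, `sum_rag_v2`); the model names and scores below are placeholders, not real benchmark results.

```python
# Hypothetical helper: write a minimal leaderboard.json next to app.py.
# Keys mirror the columns referenced in app.py; all values are made up.
import json
from pathlib import Path

records = [
    {
        "Model": "example-model-8b",    # placeholder model name
        "Provider": "ExampleOrg",       # placeholder provider
        "Parameters": 8_000_000_000,    # raw count; the app's heuristic converts it to billions
        "src_clf": 0.71,
        "sum_rag": 0.64,
        "sum_rag_v2": 0.58,
    },
    {
        "Model": "example-model-70b",
        "Provider": "ExampleOrg",
        "Parameters": 70_000_000_000,
        "src_clf": 0.83,
        "sum_rag": 0.79,
        "sum_rag_v2": 0.72,
    },
]

Path(__file__).parent.joinpath("leaderboard.json").write_text(
    json.dumps(records, indent=2), encoding="utf-8"
)
```

With such a file in place, `python app.py` should serve the leaderboard on port 7860, per the `launch()` call above.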