rzanoli committed
Commit 5e09303 · 1 Parent(s): 0a7c918

Refactor and optimize all interface chart code

Files changed (2):
  1. app.py +453 -669
  2. app_18_09_2025.py +823 -0

app.py CHANGED
@@ -3,189 +3,232 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
6
- from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
 
 
 
 
7
  from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
8
  from src.display.css_html_js import custom_css
9
- from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision
 
10
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
11
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
12
  from src.submission.submit import add_new_eval
13
- import random
14
  import matplotlib.pyplot as plt
15
  import re
16
  import plotly.express as px
17
  import plotly.graph_objects as go
18
  import numpy as np
19
 
 
 
 
20
 
21
- def mean_of_max_per_field(df):
22
- """
23
- Calcola il massimo per ciascun campo e poi la media dei massimi.
 
 
24
 
25
- Args:
26
- df (pd.DataFrame): DataFrame con colonne TE, SA, HS, AT, WIC, FAQ, LS, SU, NER, REL
 
 
27
 
28
- Returns:
29
- float: media dei valori massimi dei campi
30
- """
31
- fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
 
32
 
33
- #print(df.columns)
34
 
35
- # Controlla che tutte le colonne esistano nel DataFrame
36
- missing = [f for f in fields if f not in df.columns]
37
- if missing:
38
- raise ValueError(f"Le seguenti colonne mancano nel DataFrame: {missing}")
 
 
 
 
39
 
40
- # Calcola il massimo per ciascun campo
41
- max_values = df[fields].max()
42
 
43
- # Calcola la media dei massimi
44
- mean_max = max_values.mean()
 
45
 
46
- return mean_max
47
 
 
 
 
 
48
 
49
- def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
50
- if tasks is None:
51
- tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
52
 
53
- task_means = {}
 
54
 
55
- for task in tasks:
56
- if task not in dataframe.columns:
57
- continue
 
 
 
 
58
 
59
- # Separa few-shot e zero-shot
60
- few_shot = dataframe[dataframe['IS_FS'] == True][["Model", task]]
61
- zero_shot = dataframe[dataframe['IS_FS'] == False][["Model", task]]
62
 
63
- # Allinea i modelli
64
- merged = pd.merge(few_shot, zero_shot, on="Model", suffixes=("_few", "_zero"))
 
65
 
66
- # Rimuovi righe con valori mancanti
67
- merged = merged.dropna(subset=[f"{task}_few", f"{task}_zero"])
 
 
 
 
68
 
69
- if merged.empty:
70
- continue
 
71
 
72
- # Calcola differenza few - zero
73
- diff = merged[f"{task}_few"] - merged[f"{task}_zero"]
74
-
75
- # Calcola la media
76
- task_means[task] = diff.mean()
77
-
78
- # Crea barplot
79
- fig = go.Figure([go.Bar(
80
- x=list(task_means.keys()),
81
- y=list(task_means.values()),
82
- marker_color="#ff7f0e",
83
- text=[f"{v:.2f}" for v in task_means.values()],
84
- textposition="outside",
85
- hovertemplate="<b>%{x}</b><br>Mean Delta Accuracy: %{y:.2f}%<extra></extra>"
86
- )])
87
-
88
- # Linea di riferimento a 0
89
- '''
90
- fig.add_shape(
91
- type="line",
92
- x0=-0.5, x1=len(task_means) - 0.5,
93
- y0=0, y1=0,
94
- line=dict(color="black", width=2, dash="dash"),
95
- xref="x", yref="y"
96
- )
97
- '''
98
 
 
99
  fig.update_layout(
100
- title="Mean Accuracy Difference (Few-shot βˆ’ Zero-shot) per Task",
101
- xaxis_title="",
102
- yaxis_title="Mean Delta Combined Performance",
103
- template="plotly_white",
104
- font=dict(family="Arial", size=13),
105
- #margin=dict(b=100)
106
  )
107
 
 
108
  fig.add_annotation(
109
- text="5-shot learning generally outperforms zero-shot, especially in tasks like NER and REL.<br>"
110
- "Only in Summarization (SU) does it provide no accuracy gain.",
111
- xref="paper", yref="paper",
112
- x=0, y=-0.2,
113
- showarrow=False,
114
- font=dict(size=11, color="gray"),
115
- align="left"
116
  )
117
 
118
- return fig
 
119
 
 
120
 
121
- def boxplot_per_task(dataframe=None, baselines=None, references=None):
122
 
123
- #print(dataframe.columns)
 
 
124
 
125
  tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
126
 
 
127
  if dataframe is None:
128
  np.random.seed(42)
129
- dataframe = pd.DataFrame({
130
- task: np.random.uniform(0.4, 0.9, 20) * 100
131
- for task in tasks
132
- })
133
 
134
  if baselines is None:
135
  baselines = {task: np.random.randint(50, 70) for task in tasks}
136
 
 
 
 
137
  colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
138
  "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
139
 
140
  fig = go.Figure()
141
 
142
  for i, task in enumerate(tasks):
143
- if task in dataframe.columns:
144
- y_data = dataframe[task].dropna().tolist()
145
-
146
- # boxplot
147
- fig.add_trace(go.Box(
148
- y=y_data,
149
- name=task,
150
- marker=dict(color=colors[i]),
151
- line=dict(color="black", width=2),
152
- fillcolor=colors[i],
153
- opacity=0.7,
154
- hovertemplate="<b>"+task+"</b><br>Accuracy: %{y:.2f}%<extra></extra>",
155
- width=0.6,
156
- whiskerwidth=0.2,
157
- quartilemethod="linear"
158
- ))
159
-
160
- # baseline
161
- if task in baselines and baselines[task] is not None:
162
- fig.add_shape(
163
- type="line",
164
- x0=i - 0.3, x1=i + 0.3,
165
- y0=baselines[task], y1=baselines[task],
166
- line=dict(color="black", width=2, dash="dot"), # piΓΉ visibile
167
- xref="x", yref="y"
168
- )
169
- '''
170
- fig.add_annotation(
171
- x=i, y=baselines[task],
172
- text=f"{baselines[task]}%",
173
- showarrow=False,
174
- yshift=10,
175
- font=dict(size=10, color="black")
176
- )
177
- '''
178
-
179
- # reference GPT-4o
180
- if task in references and references[task] is not None:
181
- fig.add_shape(
182
- type="line",
183
- x0=i - 0.3, x1=i + 0.3,
184
- y0=references[task], y1=references[task],
185
- line=dict(color="red", width=2, dash="dashdot"),
186
- xref="x", yref="y"
187
- )
188
189
  fig.update_layout(
190
  title="Distribution of Model Accuracy by Task",
191
  xaxis_title="Task",
@@ -194,9 +237,10 @@ def boxplot_per_task(dataframe=None, baselines=None, references=None):
194
  boxmode="group",
195
  dragmode=False,
196
  font=dict(family="Arial", size=10),
197
- margin=dict(b=80),
198
  )
199
 
 
200
  fig.add_annotation(
201
  text=(
202
  "In tasks like TE and SA, models approach the accuracy of supervised <br>"
@@ -207,416 +251,108 @@ def boxplot_per_task(dataframe=None, baselines=None, references=None):
207
  x=0.5, y=-0.30,
208
  showarrow=False,
209
  font=dict(size=11, color="gray"),
210
- align="left"
211
  )
212
 
213
  fig.update_yaxes(range=[0, 100], fixedrange=True)
 
214
 
215
  return fig
216
 
217
- # EVALITA results
218
- BASELINES = {
219
- "TE":71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00,
220
- "LS": 38.82, "SU": 38.91, "NER":88.00, "REL": 62.99
221
- }
222
-
223
- # GPT-4o
224
- REFERENCES = {
225
- "NER": 79.11,
226
- "REL": 63.32,
227
- "LS": 59.25,
228
- "SU": 33.04
229
-
230
- }
231
-
232
-
233
- def boxplot_prompts_per_task(dataframe, tasks=None):
234
- if tasks is None:
235
- tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
236
-
237
- # Lista delle colonne da aggiornare
238
- cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"]
239
- # Applichiamo la trasformazione
240
- for col in cols_to_update:
241
- dataframe[col] = dataframe[col].replace({1: 7, 2: 8})
242
-
243
- fig = go.Figure()
244
-
245
- # Liste per creare una sola voce in legenda per Average e Best
246
- avg_x, avg_y = [], []
247
- best_x, best_y, best_text = [], [], []
248
-
249
- for task in tasks:
250
- avg_col = f"{task} Prompt Average"
251
- best_col = f"{task} Best Prompt"
252
- best_id_col = f"{task} Best Prompt Id"
253
-
254
- if all(col in dataframe.columns for col in [avg_col, best_col, best_id_col]):
255
- avg_value = dataframe[avg_col].mean()
256
- avg_x.append(task)
257
- avg_y.append(avg_value)
258
-
259
- best_value = dataframe[best_col].mean()
260
- best_x.append(task)
261
- best_y.append(best_value)
262
- best_id = dataframe[best_id_col].mode()[0] # Most frequent best prompt id
263
- best_text.append(f"P:{best_id}")
264
-
265
- # Barre Average Accuracy (azzurro)
266
- fig.add_trace(go.Bar(
267
- x=avg_x,
268
- y=avg_y,
269
- name="Avg. Accuracy",
270
- marker_color="#1f77b4",
271
- #hovertemplate="%{y:.2f}%<extra></extra>"
272
- #hovertemplate="<b>" + task + "</b><br>Accuracy: %{y:.2f}%<extra></extra>",
273
- ))
274
-
275
- # Barre Best Prompt (rosso)
276
- fig.add_trace(go.Bar(
277
- x=best_x,
278
- y=best_y,
279
- name="Best Prompt",
280
- marker_color="#d62728",
281
- #hovertemplate="%{y:.2f}%<extra></extra>"
282
- #hovertemplate = "<b>" + task + "</b><br>Accuracy: %{y:.2f}%<extra></extra>",
283
- ))
284
-
285
- # Testo sopra barre Best Prompt con ID
286
- for x, y, text in zip(best_x, best_y, best_text):
287
- fig.add_annotation(
288
- x=x,
289
- y=y + 3, # leggermente sopra la barra
290
- text=text,
291
- showarrow=False,
292
- font=dict(size=12, color="black")
293
- )
294
-
295
- fig.update_layout(
296
- title= "Prompt Accuracy: Avg vs Best",
297
- xaxis_title="Task",
298
- yaxis_title="Combined Performance",
299
- barmode='group',
300
- template="plotly_white",
301
- font=dict(family="Arial", size=10),
302
- yaxis=dict(range=[0, 100], fixedrange=True)
303
- )
304
-
305
- # caption come annotazione separata
306
- fig.add_annotation(
307
- text="There is no single prompt that performs best across all tasks.<br>"
308
- "Different prompts achieve the highest accuracy on different tasks.",
309
- xref="paper", yref="paper",
310
- x=0.5, y=-0.3,
311
- showarrow=False,
312
- font=dict(size=11, color="gray"),
313
- align="center",
314
- xanchor="center"
315
- )
316
-
317
- return fig
318
-
319
-
320
- def line_chart(dataframe):
321
-
322
- # Normalizza le dimensioni per avere marker non troppo piccoli nΓ© enormi
323
- def scale_sizes(values, min_size=8, max_size=30):
324
- vmin, vmax = min(values), max(values)
325
- return [
326
- min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size) if vmax > vmin else (min_size + max_size) / 2
327
- for val in values
328
- ]
329
-
330
- # dati in base a IS_FS
331
- df_true = dataframe[dataframe['IS_FS'] == True]
332
- df_false = dataframe[dataframe['IS_FS'] == False]
333
-
334
- # Estrai valori x, y e labels
335
- x_true = df_true['#Params (B)'].tolist()
336
- y_true = df_true['Avg. Comb. Perf. ⬆️'].tolist()
337
- labels_true = [re.search(r'>([^<]+)<', m).group(1) for m in df_true['Model'].tolist()]
338
-
339
- x_false = df_false['#Params (B)'].tolist()
340
- y_false = df_false['Avg. Comb. Perf. ⬆️'].tolist()
341
- labels_false = [re.search(r'>([^<]+)<', m).group(1) for m in df_false['Model'].tolist()]
342
-
343
- fig = go.Figure()
344
-
345
- # Punti IS_FS=True
346
- fig.add_trace(go.Scatter(
347
- x=x_true,
348
- y=y_true,
349
- mode='markers',
350
- name='5-Shot',
351
- marker=dict(
352
- color='blue',
353
- size=scale_sizes(x_true)
354
- ),
355
- hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
356
- customdata=labels_true
357
- ))
358
-
359
- # Punti IS_FS=False
360
- fig.add_trace(go.Scatter(
361
- x=x_false,
362
- y=y_false,
363
- mode='markers',
364
- name='0-Shot',
365
- marker=dict(
366
- color='red',
367
- size=scale_sizes(x_false)
368
- ),
369
- hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
370
- customdata=labels_false
371
- ))
372
-
373
- # Trova il massimo tra tutti i modelli
374
- all_y = y_true + y_false
375
- all_x = x_true + x_false
376
- all_labels = labels_true + labels_false
377
- max_idx = all_y.index(max(all_y))
378
- max_x = all_x[max_idx]
379
- max_y = all_y[max_idx]
380
- max_label = all_labels[max_idx]
381
-
382
- # Aggiungi annotazione visibile per il modello migliore
383
- fig.add_annotation(
384
- x=max_x,
385
- y=max_y,
386
- #text=f"Top: {max_label} ({max_y:.1f}%)",
387
- text=f"{max_label}",
388
- showarrow=True,
389
- arrowhead=2,
390
- arrowsize=1,
391
- arrowwidth=2,
392
- arrowcolor="black",
393
- font=dict(size=11, color="black"),
394
- xshift=10,
395
- yshift=10,
396
- ax = -30, ay = -20, # sposta la label a sinistra e sopra il punto
397
- xanchor = "right" # allinea la label a destra rispetto al punto
398
- )
399
-
400
- fig.update_layout(
401
- title="Avg. Combined Performance vs #Params",
402
- xaxis_title="#Params (B)",
403
- yaxis_title="Avg. Combined Performance",
404
- template="plotly_white",
405
- hovermode="closest",
406
- font=dict(family="Arial", size=10),
407
- dragmode=False,
408
- xaxis=dict(
409
- tickvals=[0, 25, 50, 75, 100, 125],
410
- ticktext=["0", "25", "50", "75", "100"]
411
- ),
412
- yaxis=dict(
413
- tickvals=[0, 20, 40, 60, 80, 100], # πŸ‘ˆ tick fissi
414
- range=[0, 100] # πŸ‘ˆ range bloccato
415
- )
416
- )
417
-
418
- # Caption
419
- fig.add_annotation(
420
- text="Accuracy generally rises with #Params, but smaller models <br>"
421
- "with 5-shot can outperform larger zero-shot models.",
422
- xref="paper", yref="paper",
423
- x=0.5, y=-0.3, # πŸ‘ˆ centrata
424
- showarrow=False,
425
- font=dict(size=11, color="gray"),
426
- align="center",
427
- xanchor="center" # πŸ‘ˆ ancora centrata rispetto al testo
428
- )
429
-
430
- fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
431
- fig.update_yaxes(fixedrange=True)
432
-
433
- return fig
434
-
435
-
436
- # Define task metadata (icons, names, descriptions)
437
- TASK_METADATA_MULTIPLECHOICE = {
438
- "TE": {"icon": "πŸ“Š", "name": "Textual Entailment", "tooltip": ""},
439
- "SA": {"icon": "πŸ˜ƒ", "name": "Sentiment Analysis", "tooltip": ""},
440
- "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
441
- "AT": {"icon": "πŸ₯", "name": "Admission Test", "tooltip": ""},
442
- "WIC": {"icon": "πŸ”€", "name": "Word in Context", "tooltip": ""},
443
- "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""}
444
- }
445
-
446
- # Define task metadata (icons, names, descriptions)
447
- TASK_METADATA_GENERATIVE = {
448
- "LS": {"icon": "πŸ”„", "name": "Lexical Substitution", "tooltip": ""},
449
- "SU": {"icon": "πŸ“", "name": "Summarization", "tooltip": ""},
450
- "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
451
- "REL": {"icon": "πŸ”—", "name": "Relation Extraction", "tooltip": ""},
452
- }
453
 
454
- def restart_space():
455
- """Restart the Hugging Face space."""
456
- API.restart_space(repo_id=REPO_ID)
457
-
458
-
459
- def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
460
- """
461
- Initialize and return the leaderboard when it is first loaded or when 'benchmark' is selected.
462
- The table is sorted based on the "Avg. Combined Performance" field.
463
- """
464
- if dataframe is None or dataframe.empty:
465
- raise ValueError("Leaderboard DataFrame is empty or None.")
466
-
467
- #print("????????????????????????????????", mean_of_max_per_field(dataframe))
468
-
469
- sorted_dataframe = dataframe.sort_values(by="Avg. Comb. Perf. ⬆️", ascending=False)
470
-
471
- sorted_dataframe = sorted_dataframe.reset_index(drop=True)
472
- sorted_dataframe["Rank"] = sorted_dataframe.index + 1
473
 
474
- # Flag per sapere se la medaglia Γ¨ giΓ  stata assegnata per categoria e tipo
475
- large_medal_fs_assigned = False
476
- medium_medal_fs_assigned = False
477
- small_medal_fs_assigned = False
478
-
479
- large_medal_0shot_assigned = False
480
- medium_medal_0shot_assigned = False
481
- small_medal_0shot_assigned = False
482
-
483
- # Lista temporanea per salvare i nuovi valori della colonna Model
484
  new_model_column = []
485
 
486
- for _, row in sorted_dataframe.iterrows():
487
- if row['IS_FS']: # 5-Few-Shot
488
- if row["Size"] == "πŸ”΅πŸ”΅πŸ”΅" and not large_medal_fs_assigned:
489
- new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸ”΅πŸ†")
490
- large_medal_fs_assigned = True
491
- elif row["Size"] == "πŸ”΅πŸ”΅" and not medium_medal_fs_assigned:
492
- new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸ†")
493
- medium_medal_fs_assigned = True
494
- elif row["Size"] == "πŸ”΅" and not small_medal_fs_assigned:
495
- new_model_column.append(f"{row['Model']} πŸ”΅πŸ†")
496
- small_medal_fs_assigned = True
497
- else:
498
- new_model_column.append(row["Model"])
 
 
499
  else: # 0-Shot
500
- if row["Size"] == "πŸ”΅πŸ”΅πŸ”΅" and not large_medal_0shot_assigned:
501
- new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸ”΅πŸŽ–οΈ")
502
- large_medal_0shot_assigned = True
503
- elif row["Size"] == "πŸ”΅πŸ”΅" and not medium_medal_0shot_assigned:
504
- new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸŽ–οΈ")
505
- medium_medal_0shot_assigned = True
506
- elif row["Size"] == "πŸ”΅" and not small_medal_0shot_assigned:
507
- new_model_column.append(f"{row['Model']} πŸ”΅πŸŽ–οΈ")
508
- small_medal_0shot_assigned = True
509
- else:
510
- new_model_column.append(row["Model"])
511
-
512
- # Lista delle colonne da aggiornare
513
- #cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"]
514
- # Applichiamo la trasformazione
515
- #for col in cols_to_update:
516
- # dataframe[col] = dataframe[col].replace({1: 7, 2: 8})
517
-
518
- # Aggiorna la colonna Model
519
- sorted_dataframe["Model"] = new_model_column
520
 
521
- field_list = fields(AutoEvalColumn)
 
 
522
 
 
 
 
523
  return Leaderboard(
524
  value=sorted_dataframe,
525
  datatype=[c.type for c in field_list],
526
- #select_columns=SelectColumns(
527
- # default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
528
- # cant_deselect=[c.name for c in field_list if c.never_hidden],
529
- # label="Select Columns to Display:",
530
- #),
531
  search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
532
- hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
533
  filter_columns=[
534
  ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
535
- #ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)",
536
- # default=[["0️⃣", "0️⃣"]]),
537
- ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max = 100, default = [0,100], label="Select the number of parameters (B)"),
538
  ],
539
- #filter_columns=[
540
- # ColumnFilter("IS_FS", type="checkbox", default=False, label="5-Few-Shot")
541
- # #ColumnFilter("FS", type="dropdown", label="5-Few-Shot")
542
- #],
543
  bool_checkboxgroup_label="Evaluation Mode",
544
  interactive=False,
545
  )
546
 
547
- def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
548
- """
549
- Update and return the leaderboard when a specific task is selected.
550
- The table is sorted based on the "Combined Performance" field.
551
- """
552
  if dataframe is None or dataframe.empty:
553
  raise ValueError("Leaderboard DataFrame is empty or None.")
554
 
555
- sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False)
556
-
557
- # aggiungo la colonna rank in base alla posizione
558
- sorted_dataframe = sorted_dataframe.reset_index(drop=True)
559
  sorted_dataframe["Rank"] = sorted_dataframe.index + 1
560
 
561
- # Flag per sapere se la medaglia Γ¨ giΓ  stata assegnata per categoria e tipo
562
- large_medal_fs_assigned = False
563
- medium_medal_fs_assigned = False
564
- small_medal_fs_assigned = False
565
 
566
- large_medal_0shot_assigned = False
567
- medium_medal_0shot_assigned = False
568
- small_medal_0shot_assigned = False
569
 
570
- # Lista temporanea per salvare i nuovi valori della colonna Model
571
- new_model_column = []
572
 
573
- for _, row in sorted_dataframe.iterrows():
574
- if row['IS_FS']: # 5-Few-Shot
575
- if row["Size"] == "πŸ”΅πŸ”΅πŸ”΅" and not large_medal_fs_assigned:
576
- new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸ”΅πŸ†")
577
- large_medal_fs_assigned = True
578
- elif row["Size"] == "πŸ”΅πŸ”΅" and not medium_medal_fs_assigned:
579
- new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸ†")
580
- medium_medal_fs_assigned = True
581
- elif row["Size"] == "πŸ”΅" and not small_medal_fs_assigned:
582
- new_model_column.append(f"{row['Model']} πŸ”΅πŸ†")
583
- small_medal_fs_assigned = True
584
- else:
585
- new_model_column.append(row["Model"])
586
- else: # 0-Shot
587
- if row["Size"] == "πŸ”΅πŸ”΅πŸ”΅" and not large_medal_0shot_assigned:
588
- new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸ”΅πŸŽ–οΈ")
589
- large_medal_0shot_assigned = True
590
- elif row["Size"] == "πŸ”΅πŸ”΅" and not medium_medal_0shot_assigned:
591
- new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸŽ–οΈ")
592
- medium_medal_0shot_assigned = True
593
- elif row["Size"] == "πŸ”΅" and not small_medal_0shot_assigned:
594
- new_model_column.append(f"{row['Model']} πŸ”΅πŸŽ–οΈ")
595
- small_medal_0shot_assigned = True
596
- else:
597
- new_model_column.append(row["Model"])
598
-
599
- # Aggiorna la colonna Model
600
- sorted_dataframe["Model"] = new_model_column
601
-
602
- pd.set_option('display.max_colwidth', None)
603
- #print("========================", dataframe['Model'])
604
-
605
- #print(sorted_dataframe['Combined Performance'])
606
 
607
  field_list = fields(AutoEvalColumn)
608
 
609
  return Leaderboard(
610
  value=sorted_dataframe,
611
- #datatype=[c.type for c in field_list],
612
  datatype=[c.type for c in field_list] + [int],
613
- #select_columns=SelectColumns(
614
- # default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
615
- # cant_deselect=[c.name for c in field_list if c.never_hidden],
616
- # label="Select Columns to Display:",
617
- #),
618
  search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
619
- hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
620
  filter_columns=[
621
  ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
622
  ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
@@ -626,106 +362,148 @@ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=No
626
  interactive=False
627
  )
628
 
629
- '''
630
- # Helper function for leaderboard initialization
631
- def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
632
- """Initialize and return a leaderboard."""
633
- if dataframe is None or dataframe.empty:
634
- raise ValueError("Leaderboard DataFrame is empty or None.")
635
 
636
- return Leaderboard(
637
- value=dataframe,
638
- datatype=[c.type for c in fields(AutoEvalColumn)],
639
- select_columns=SelectColumns(
640
- default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
641
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
642
- label="Select Columns to Display:",
643
- ),
644
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
645
- hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
646
- filter_columns=[
647
- ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
648
- ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
649
- ],
650
- bool_checkboxgroup_label="Hide models",
651
- interactive=False,
652
- )
653
- '''
 
 
654
 
655
- def download_snapshot(repo, local_dir):
656
- """Try to download a snapshot from Hugging Face Hub."""
 
657
  try:
658
- print(f"Downloading from {repo} to {local_dir}...")
659
- snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN)
660
  except Exception as e:
661
- print(f"Error downloading {repo}: {e}")
662
- restart_space()
663
-
664
-
665
- # Initialize the app by downloading snapshots
666
- download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
667
- download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
668
-
669
- # Load leaderboard data
670
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
671
- finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
672
- #print(LEADERBOARD_DF.columns.tolist())
673
-
674
- theoretical_max_combined_perf = mean_of_max_per_field(LEADERBOARD_DF)
675
-
676
- # Prepare the main interface
677
- demo = gr.Blocks(css=custom_css)
678
- with demo:
679
- #gr.HTML(TITLE)
680
- gr.HTML(
681
- """
682
- <div style="display: flex; align-items: center; position: relative; width: 100%; height: 60px; padding: 10px 0;">
683
- <h1 style="
684
- margin: 0 auto;
685
- font-weight: 900;
686
- font-size: 2.5em;
687
- letter-spacing: 2px;
688
- text-transform: uppercase;
689
- background: linear-gradient(90deg, #1f77b4, #00c6ff);
690
- -webkit-background-clip: text;
691
- -webkit-text-fill-color: transparent;
692
- text-shadow: 2px 2px 8px rgba(0,0,0,0.2);
693
- ">
694
- EVALITA-LLM Leaderboard
695
- </h1>
696
- <a href="https://huggingface.co/spaces/mii-llm/open_ita_llm_leaderboard" target="_blank"
697
- style="position: absolute; right: 0; display: inline-flex; align-items: center; gap: 6px; text-decoration: none; color: #1f77b4; font-weight: 600;">
698
- <!-- Icona stilizzata -->
699
- <svg xmlns="http://www.w3.org/2000/svg" width="22" height="22" fill="#1f77b4" viewBox="0 0 24 24">
700
- <path d="M3.9 12a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42a3 3 0 1 0 4.24 4.24l3.54-3.54a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42z"/>
701
- <path d="M20.1 12a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42a3 3 0 1 0-4.24-4.24l-3.54 3.54a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42z"/>
702
- </svg>
703
- Open Italian LLM Leaderboard
704
- </a>
705
- </div>
706
- """
707
- )
708
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
709
 
710
- # ⬇️ QUI aggiungiamo i grafici subito sotto la barra del titolo e sopra le tabs
711
- with gr.Row():
712
- gr.Plot(value=line_chart(LEADERBOARD_DF), elem_id="line-chart")
713
- gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task")
714
- #gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF), elem_id="boxplot-prompt-task")
715
 
716
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
 
 
 
717
 
718
- # Main leaderboard tab
719
- with gr.TabItem("πŸ… Benchmark"):
720
 
721
- leaderboard = init_leaderboard(
722
- LEADERBOARD_DF,
723
- default_selection=['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
724
- hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
725
- )
726
 
727
- gr.HTML(
728
- f"""
 
729
  <div style="
730
  border: 2px solid #1f77b4;
731
  border-radius: 10px;
@@ -735,89 +513,95 @@ with demo:
735
  font-size: 14px;
736
  display: inline-block;
737
  ">
738
- Theoretical performance of a model that scores the highest on every individual task: <span style="color:#d62728; font-size:18px;">{theoretical_max_combined_perf:.2f}</span>
 
739
  </div>
740
  """
 
741
  )
742
 
743
- '''
744
- with gr.TabItem("πŸ“ˆ Charts"):
745
- #gr.Plot(value=line_chart(LEADERBOARD_DF), label="Andamento di esempio")
746
- #gr.Plot(value=line_chart_interactive_test(), label="Andamento interattivo")
747
- gr.Plot(value=line_chart(LEADERBOARD_DF))
748
- gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES))
749
- gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))
750
- gr.Plot(value=barplot_mean_few_minus_zero_shot(LEADERBOARD_DF))
751
- '''
752
-
753
- # About tab
754
- with gr.TabItem("πŸ“ About"):
755
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
756
-
757
- # About tab
758
- with gr.TabItem("β•‘", interactive=False):
759
- gr.Markdown("", elem_classes="markdown-text")
760
-
761
-
762
- # Task-specific leaderboards
763
- for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
764
-
765
- with gr.TabItem(f"{metadata['icon']}{task}"):
766
-
767
- task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
768
- gr.Markdown(task_description, elem_classes="markdown-text")
769
-
770
- leaderboard = update_task_leaderboard(
771
- LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Prompt Std": "Prompt Std", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}),
772
- default_selection=['Rank', 'Size', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
773
- hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id']]
774
- )
775
-
776
- # About tab
777
- with gr.TabItem("β”‚", interactive=False):
778
- gr.Markdown("", elem_classes="markdown-text")
779
-
780
- # Task-specific leaderboards
781
- for task, metadata in TASK_METADATA_GENERATIVE.items():
782
- with gr.TabItem(f"{metadata['icon']}{task}"):
783
- task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
784
- gr.Markdown(task_description, elem_classes="markdown-text")
785
-
786
- leaderboard = update_task_leaderboard(
787
- LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average",
788
- f"{task} Prompt Std": "Prompt Std",
789
- f"{task} Best Prompt": "Best Prompt",
790
- f"{task} Best Prompt Id": "Best Prompt Id",
791
- task: "Combined Performance"}),
792
- default_selection=['Rank', 'Size', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt',
793
- 'Best Prompt Id'],
794
- hidden_columns=[col for col in LEADERBOARD_DF.columns if
795
- col not in ['Rank', 'Size', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std',
796
- 'Best Prompt', 'Best Prompt Id']]
797
- )
798
-
799
- # Citation section
800
- with gr.Accordion("πŸ“™ Citation", open=False):
801
- gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)
802
-
803
- with gr.Accordion("πŸ“™ Credits", open=False):
804
- gr.Markdown(
805
- """
806
- **This project has benefited from the following support:**
807
-
808
- - 🧠 **Codebase**: Based on and extended from the Open Italian LLM Leaderboard, developed by **Alessandro Ercolani** and **Samuele Colombo**. We warmly thank them for their invaluable support and guidance in implementing this leaderboard.
809
-
810
- - πŸ’Ά **Funding**: Partially supported by the PNRR project **FAIR - Future AI Research (PE00000013)**, under the NRRP MUR program funded by **NextGenerationEU**.
811
-
812
- - πŸ–₯️ **Computation**: We gratefully acknowledge **CINECA** for granting access to the **LEONARDO** supercomputer.
813
- """
814
- )
815
 
816
- # Background job to restart space
817
  scheduler = BackgroundScheduler()
818
  scheduler.add_job(restart_space, "interval", seconds=1800)
819
  scheduler.start()
820
 
821
- # Launch the app with concurrent queueing
822
- demo.queue(default_concurrency_limit=40).launch(debug=True, # Enable Gradio debug mode
823
- show_error=True)
 
 
 
 
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
6
+ from functools import lru_cache
7
+ import logging
8
+
9
+ from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, \
10
+ LLM_BENCHMARKS_TEXT, TITLE
11
  from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
12
  from src.display.css_html_js import custom_css
13
+ from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, \
14
+ WeightType, Precision
15
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
16
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
17
  from src.submission.submit import add_new_eval
 
18
  import matplotlib.pyplot as plt
19
  import re
20
  import plotly.express as px
21
  import plotly.graph_objects as go
22
  import numpy as np
23
 
24
+ # Configure logging
25
+ logging.basicConfig(level=logging.INFO)
26
+ logger = logging.getLogger(__name__)
27
 
28
+ # EVALITA results
29
+ BASELINES = {
30
+ "TE": 71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00,
31
+ "LS": 38.82, "SU": 38.91, "NER": 88.00, "REL": 62.99
32
+ }
33
 
34
+ # GPT-4o results
35
+ REFERENCES = {
36
+ "NER": 79.11, "REL": 63.32, "LS": 59.25, "SU": 33.04
37
+ }
38
 
39
+ TASK_METADATA_MULTIPLECHOICE = {
40
+ "TE": {"icon": "πŸ“Š", "name": "Textual Entailment", "tooltip": ""},
41
+ "SA": {"icon": "πŸ˜ƒ", "name": "Sentiment Analysis", "tooltip": ""},
42
+ "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
43
+ "AT": {"icon": "πŸ₯", "name": "Admission Test", "tooltip": ""},
44
+ "WIC": {"icon": "πŸ”€", "name": "Word in Context", "tooltip": ""},
45
+ "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""}
46
+ }
47
+
48
+ TASK_METADATA_GENERATIVE = {
49
+ "LS": {"icon": "πŸ”„", "name": "Lexical Substitution", "tooltip": ""},
50
+ "SU": {"icon": "πŸ“", "name": "Summarization", "tooltip": ""},
51
+ "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
52
+ "REL": {"icon": "πŸ”—", "name": "Relation Extraction", "tooltip": ""},
53
+ }
54
 
 
55
 
56
+ def theoretical_performance(df_hash):
57
+ """
58
+ Theoretical performance of a model that scores the highest on every individual task
59
+ """
60
+ # This is a placeholder - you'd need to pass the actual dataframe
61
+ # In practice, you'd compute this once and store it
62
+ #fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
63
+ return 75.0 # Placeholder value
64
 
 
 
65
 
66
+ def scale_sizes(values, min_size=8, max_size=30):
67
+ """Normalize sizes for scatter plot markers """
68
+ if not values:
69
+ return []
70
+ vmin, vmax = min(values), max(values)
71
+ if vmax == vmin:
72
+ return [(min_size + max_size) / 2] * len(values)
73
+ return [
74
+ min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size)
75
+ for val in values
76
+ ]
77
 
 
78
 
79
+ def extract_model_name(model_string):
80
+ """Extract model name from HTML string."""
81
+ match = re.search(r'>([^<]+)<', model_string)
82
+ return match.group(1) if match else model_string
83
 
 
 
 
84
 
85
+ def create_line_chart(dataframe):
86
+ """Create left chart."""
87
 
88
+ def scale_sizes(values, min_size=8, max_size=30):
89
+ vmin, vmax = min(values), max(values)
90
+ return [
91
+ min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size) if vmax > vmin
92
+ else (min_size + max_size) / 2
93
+ for val in values
94
+ ]
95
 
96
+ fig = go.Figure()
 
 
97
 
98
+ # Loop over the 5-shot and 0-shot subsets
99
+ for shot, color in [(True, "blue"), (False, "red")]:
100
+ df = dataframe[dataframe["IS_FS"] == shot]
101
 
102
+ x = df["#Params (B)"].tolist()
103
+ y = df["Avg. Comb. Perf. ⬆️"].tolist()
104
+ labels = [
105
+ re.search(r'>([^<]+)<', m).group(1) if isinstance(m, str) and re.search(r'>([^<]+)<', m) else str(m)
106
+ for m in df["Model"].tolist()
107
+ ]
108
 
109
+ fig.add_trace(go.Scatter(
110
+ x=x,
111
+ y=y,
112
+ mode="markers",
113
+ name="5-Shot" if shot else "0-Shot",
114
+ marker=dict(color=color, size=scale_sizes(x)),
115
+ hovertemplate="<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>",
116
+ customdata=labels,
117
+ ))
118
+
119
+ # Show the best model
120
+ all_y = dataframe["Avg. Comb. Perf. ⬆️"].tolist()
121
+ if all_y:
122
+ max_idx = all_y.index(max(all_y))
123
+ max_x = dataframe["#Params (B)"].iloc[max_idx]
124
+ max_y = all_y[max_idx]
125
+ max_label = re.search(r'>([^<]+)<', dataframe["Model"].iloc[max_idx]).group(1)
126
 
127
+ fig.add_annotation(
128
+ x=max_x,
129
+ y=max_y,
130
+ text=max_label,
131
+ showarrow=True,
132
+ arrowhead=2,
133
+ arrowsize=1,
134
+ arrowwidth=2,
135
+ arrowcolor="black",
136
+ font=dict(size=11, color="black"),
137
+ xshift=10, yshift=10,
138
+ ax=-30, ay=-20,
139
+ xanchor="right"
140
+ )
 
141
 
142
+ # Layout
143
  fig.update_layout(
144
+ title="Avg. Combined Performance vs #Params",
145
+ xaxis_title="#Params (B)", yaxis_title="Avg. Combined Performance",
146
+ template="plotly_white", hovermode="closest",
147
+ font=dict(family="Arial", size=10), dragmode=False,
148
+ xaxis=dict(tickvals=[0, 25, 50, 75, 100, 125], ticktext=["0", "25", "50", "75", "100"]),
149
+ yaxis=dict(tickvals=[0, 20, 40, 60, 80, 100], range=[0, 100])
150
  )
151
 
152
+ # Caption
153
  fig.add_annotation(
154
+ text="Accuracy generally rises with #Params, but smaller models <br>"
155
+ "with 5-shot can outperform larger zero-shot models.",
156
+ xref="paper", yref="paper", x=0.5, y=-0.3,
157
+ showarrow=False, font=dict(size=11, color="gray"),
158
+ align="center", xanchor="center"
 
 
159
  )
160
 
161
+ fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
162
+ fig.update_yaxes(fixedrange=True)
163
 
164
+ return fig
165
 
 
166
 
167
+ # Right chart: per-task accuracy distributions
168
+ def create_boxplot_task(dataframe=None, baselines=None, references=None):
169
+ """Create the per-task accuracy distribution boxplot (shown on the right)."""
170
 
171
  tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
172
 
173
+ # Default data when none is provided
174
  if dataframe is None:
175
  np.random.seed(42)
176
+ dataframe = pd.DataFrame({task: np.random.uniform(0.4, 0.9, 20) * 100 for task in tasks})
 
 
 
177
 
178
  if baselines is None:
179
  baselines = {task: np.random.randint(50, 70) for task in tasks}
180
 
181
+ if references is None:
182
+ references = {}
183
+
184
  colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
185
  "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
186
 
187
  fig = go.Figure()
188
 
189
  for i, task in enumerate(tasks):
190
+ if task not in dataframe.columns:
191
+ continue
 
192
 
193
+ y_data = dataframe[task].dropna().tolist()
194
+
195
+ # Boxplot
196
+ fig.add_trace(go.Box(
197
+ y=y_data,
198
+ name=task,
199
+ marker=dict(color=colors[i]),
200
+ line=dict(color="black", width=2),
201
+ fillcolor=colors[i],
202
+ opacity=0.7,
203
+ hovertemplate="<b>"+task+"</b><br>Accuracy: %{y:.2f}%<extra></extra>",
204
+ width=0.6,
205
+ whiskerwidth=0.2,
206
+ quartilemethod="linear"
207
+ ))
208
+
209
+ # Baseline line
210
+ baseline_value = baselines.get(task)
211
+ if baseline_value is not None:
212
+ fig.add_shape(
213
+ type="line",
214
+ x0=i - 0.3, x1=i + 0.3,
215
+ y0=baseline_value, y1=baseline_value,
216
+ line=dict(color="black", width=2, dash="dot"),
217
+ xref="x", yref="y"
218
+ )
219
+
220
+ # GPT-4o reference line
221
+ reference_value = references.get(task)
222
+ if reference_value is not None:
223
+ fig.add_shape(
224
+ type="line",
225
+ x0=i - 0.3, x1=i + 0.3,
226
+ y0=reference_value, y1=reference_value,
227
+ line=dict(color="red", width=2, dash="dashdot"),
228
+ xref="x", yref="y"
229
+ )
230
+
231
+ # Layout
232
  fig.update_layout(
233
  title="Distribution of Model Accuracy by Task",
234
  xaxis_title="Task",
 
237
  boxmode="group",
238
  dragmode=False,
239
  font=dict(family="Arial", size=10),
240
+ margin=dict(b=80)
241
  )
242
 
243
+ # Caption
244
  fig.add_annotation(
245
  text=(
246
  "In tasks like TE and SA, models approach the accuracy of supervised <br>"
 
251
  x=0.5, y=-0.30,
252
  showarrow=False,
253
  font=dict(size=11, color="gray"),
254
+ align="center"
255
  )
256
 
257
  fig.update_yaxes(range=[0, 100], fixedrange=True)
258
+ fig.update_xaxes(fixedrange=True)
259
 
260
  return fig
261
 
 
 
262
 
263
+ def create_medal_assignments(sorted_df):
264
+ """Function for medal assignment logic"""
265
+ medals = {
266
+ 'large_fs': False, 'medium_fs': False, 'small_fs': False,
267
+ 'large_0shot': False, 'medium_0shot': False, 'small_0shot': False
268
+ }
 
269
270
  new_model_column = []
271
 
272
+ for _, row in sorted_df.iterrows():
273
+ model_name = row['Model']
274
+ size = row["Size"]
275
+ is_fs = row['IS_FS']
276
+
277
+ if is_fs: # 5-Few-Shot
278
+ if size == "🔵🔵🔵" and not medals['large_fs']:
279
+ model_name = f"{model_name} 🔵🔵🔵🏆"
280
+ medals['large_fs'] = True
281
+ elif size == "🔵🔵" and not medals['medium_fs']:
282
+ model_name = f"{model_name} 🔵🔵🏆"
283
+ medals['medium_fs'] = True
284
+ elif size == "🔵" and not medals['small_fs']:
285
+ model_name = f"{model_name} 🔵🏆"
286
+ medals['small_fs'] = True
287
  else: # 0-Shot
288
+ if size == "🔵🔵🔵" and not medals['large_0shot']:
289
+ model_name = f"{model_name} 🔵🔵🔵🎖️"
290
+ medals['large_0shot'] = True
291
+ elif size == "🔵🔵" and not medals['medium_0shot']:
292
+ model_name = f"{model_name} 🔵🔵🎖️"
293
+ medals['medium_0shot'] = True
294
+ elif size == "🔵" and not medals['small_0shot']:
295
+ model_name = f"{model_name} 🔵🎖️"
296
+ medals['small_0shot'] = True
 
297
 
298
+ new_model_column.append(model_name)
299
+
300
+ return new_model_column
301
 
302
+
303
+ def create_leaderboard_base(sorted_dataframe, field_list, hidden_columns):
304
+ """Base leaderboard creation with common parameters. """
305
  return Leaderboard(
306
  value=sorted_dataframe,
307
  datatype=[c.type for c in field_list],
 
 
 
 
 
308
  search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
309
+ hide_columns=hidden_columns,
310
  filter_columns=[
311
  ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
312
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
313
+ label="Select the number of parameters (B)"),
 
314
  ],
 
 
 
 
315
  bool_checkboxgroup_label="Evaluation Mode",
316
  interactive=False,
317
  )
318
 
319
+
320
+ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
321
+ """Leaderboard initialization. """
 
 
322
  if dataframe is None or dataframe.empty:
323
  raise ValueError("Leaderboard DataFrame is empty or None.")
324
 
325
+ # Sort and reset index
326
+ sorted_dataframe = dataframe.sort_values(by="Avg. Comb. Perf. ⬆️", ascending=False).reset_index(drop=True)
 
 
327
  sorted_dataframe["Rank"] = sorted_dataframe.index + 1
328
 
329
+ # Apply medal assignments
330
+ sorted_dataframe["Model"] = create_medal_assignments(sorted_dataframe)
 
 
331
 
332
+ field_list = fields(AutoEvalColumn)
 
 
333
 
334
+ return create_leaderboard_base(sorted_dataframe, field_list, hidden_columns)
 
335
 
336
+
337
+ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
338
+ """ Task-specific leaderboard update."""
339
+ if dataframe is None or dataframe.empty:
340
+ raise ValueError("Leaderboard DataFrame is empty or None.")
341
+
342
+ # Sort and reset index
343
+ sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False).reset_index(drop=True)
344
+ sorted_dataframe["Rank"] = sorted_dataframe.index + 1
345
+
346
+ # Apply medal assignments
347
+ sorted_dataframe["Model"] = create_medal_assignments(sorted_dataframe)
 
 
348
 
349
  field_list = fields(AutoEvalColumn)
350
 
351
  return Leaderboard(
352
  value=sorted_dataframe,
 
353
  datatype=[c.type for c in field_list] + [int],
 
 
 
 
 
354
  search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
355
+ hide_columns=hidden_columns,
356
  filter_columns=[
357
  ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
358
  ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
 
362
  interactive=False
363
  )
364
 
 
 
 
 
 
 
365
 
366
+ def download_snapshot(repo, local_dir, max_retries=3):
367
+ """Snapshot download with retry logic."""
368
+ for attempt in range(max_retries):
369
+ try:
370
+ logger.info(f"Downloading from {repo} to {local_dir} (attempt {attempt + 1}/{max_retries})")
371
+ snapshot_download(
372
+ repo_id=repo,
373
+ local_dir=local_dir,
374
+ repo_type="dataset",
375
+ tqdm_class=None,
376
+ etag_timeout=30,
377
+ token=TOKEN
378
+ )
379
+ return True
380
+ except Exception as e:
381
+ logger.error(f"Error downloading {repo} (attempt {attempt + 1}): {e}")
382
+ if attempt == max_retries - 1:
383
+ logger.error(f"Failed to download {repo} after {max_retries} attempts")
384
+ return False
385
+ return False
386
 
387
+
388
+ def restart_space():
389
+ """Restart the Hugging Face space."""
390
  try:
391
+ logger.info("Restarting space...")
392
+ API.restart_space(repo_id=REPO_ID)
393
  except Exception as e:
394
+ logger.error(f"Error restarting space: {e}")
395
+
396
+
397
+ def create_title_html():
398
+ """Function for title HTML."""
399
+ return """
400
+ <div style="display: flex; align-items: center; position: relative; width: 100%; height: 60px; padding: 10px 0;">
401
+ <h1 style="
402
+ margin: 0 auto;
403
+ font-weight: 900;
404
+ font-size: 2.5em;
405
+ letter-spacing: 2px;
406
+ text-transform: uppercase;
407
+ background: linear-gradient(90deg, #1f77b4, #00c6ff);
408
+ -webkit-background-clip: text;
409
+ -webkit-text-fill-color: transparent;
410
+ text-shadow: 2px 2px 8px rgba(0,0,0,0.2);
411
+ ">
412
+ EVALITA-LLM Leaderboard
413
+ </h1>
414
+ <a href="https://huggingface.co/spaces/mii-llm/open_ita_llm_leaderboard" target="_blank"
415
+ style="position: absolute; right: 0; display: inline-flex; align-items: center; gap: 6px; text-decoration: none; color: #1f77b4; font-weight: 600;">
416
+ <svg xmlns="http://www.w3.org/2000/svg" width="22" height="22" fill="#1f77b4" viewBox="0 0 24 24">
417
+ <path d="M3.9 12a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42a3 3 0 1 0 4.24 4.24l3.54-3.54a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42z"/>
418
+ <path d="M20.1 12a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42a3 3 0 1 0-4.24-4.24l-3.54 3.54a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42z"/>
419
+ </svg>
420
+ Open Italian LLM Leaderboard
421
+ </a>
422
+ </div>
423
+ """
 
424
 
 
 
 
 
 
425
 
426
+ def create_credits_markdown():
427
+ """Credits section."""
428
+ return """
429
+ **This project has benefited from the following support:**
430
 
431
+ - 🧠 **Codebase**: Based on and extended from the Open Italian LLM Leaderboard, developed by **Alessandro Ercolani** and **Samuele Colombo**. We warmly thank them for their invaluable support and guidance in implementing this leaderboard.
 
432
 
433
+ - 💶 **Funding**: Partially supported by the PNRR project **FAIR - Future AI Research (PE00000013)**, under the NRRP MUR program funded by **NextGenerationEU**.
 
 
 
 
434
 
435
+ - 🖥️ **Computation**: We gratefully acknowledge **CINECA** for granting access to the **LEONARDO** supercomputer.
436
+ """
437
+
438
+
439
+ # Main initialization
440
+ def initialize_app():
441
+ """Initialize the application."""
442
+ try:
443
+ # Download snapshots
444
+ queue_success = download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
445
+ results_success = download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
446
+
447
+ if not (queue_success and results_success):
448
+ logger.error("Failed to download required data")
449
+ return None, None, None, None, None
450
+
451
+ # Load leaderboard data
452
+ leaderboard_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
453
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
454
+ EVAL_REQUESTS_PATH, EVAL_COLS)
455
+
456
+ # Calculate theoretical max performance
457
+ theoretical_max = theoretical_performance(hash(str(leaderboard_df.values.tobytes())))
458
+
459
+ return leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, theoretical_max
460
+
461
+ except Exception as e:
462
+ logger.error(f"Error initializing app: {e}")
463
+ return None, None, None, None, None
464
+
465
+
466
+ # Initialize data
467
+ LEADERBOARD_DF, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, theoretical_max_combined_perf = initialize_app()
468
+
469
+ if LEADERBOARD_DF is None:
470
+ # Fallback behavior
471
+ logger.error("Failed to initialize app data")
472
+ theoretical_max_combined_perf = 0.0
473
+
474
+
475
+ def create_gradio_interface():
476
+ """The main Gradio interface."""
477
+ demo = gr.Blocks(css=custom_css)
478
+
479
+ with demo:
480
+ # Title
481
+ gr.HTML(create_title_html())
482
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
483
+
484
+ # Charts section
485
+ with gr.Row():
486
+ if LEADERBOARD_DF is not None:
487
+ # Note: You'd need to implement these chart functions properly
488
+ gr.Plot(value=create_line_chart(LEADERBOARD_DF), elem_id="line-chart")
489
+ gr.Plot(value=create_boxplot_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task")
490
+
491
+ # Tabs
492
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
493
+ # Main leaderboard tab
494
+ with gr.TabItem("πŸ… Benchmark"):
495
+ if LEADERBOARD_DF is not None:
496
+ leaderboard = init_leaderboard(
497
+ LEADERBOARD_DF,
498
+ default_selection=['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT",
499
+ "WIC", "FAQ", "LS", "SU", "NER", "REL"],
500
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
501
+ col not in ['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA",
502
+ "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
503
+ )
504
+
505
+ gr.HTML(
506
+ f"""
507
  <div style="
508
  border: 2px solid #1f77b4;
509
  border-radius: 10px;
 
513
  font-size: 14px;
514
  display: inline-block;
515
  ">
516
+ Theoretical performance of a model that scores the highest on every individual task:
517
+ <span style="color:#d62728; font-size:18px;">{theoretical_max_combined_perf:.2f}</span>
518
  </div>
519
  """
520
+ )
521
+
522
+ # About tab
523
+ with gr.TabItem("πŸ“ About"):
524
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
525
+
526
+ with gr.TabItem("β•‘", interactive=False):
527
+ gr.Markdown("", elem_classes="markdown-text")
528
+
529
+ # Task-specific tabs
530
+ if LEADERBOARD_DF is not None:
531
+ # Multiple choice tasks
532
+ for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
533
+ with gr.TabItem(f"{metadata['icon']}{task}"):
534
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
535
+ gr.Markdown(task_description, elem_classes="markdown-text")
536
+
537
+ leaderboard = update_task_leaderboard(
538
+ LEADERBOARD_DF.rename(columns={
539
+ f"{task} Prompt Average": "Prompt Average",
540
+ f"{task} Prompt Std": "Prompt Std",
541
+ f"{task} Best Prompt": "Best Prompt",
542
+ f"{task} Best Prompt Id": "Best Prompt Id",
543
+ task: "Combined Performance"
544
+ }),
545
+ default_selection=['Rank', 'Size', 'FS', 'Model', 'Combined Performance', 'Prompt Average',
546
+ 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
547
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
548
+ col not in ['Rank', 'Size', 'FS', 'Model', 'Combined Performance',
549
+ 'Prompt Average', 'Prompt Std', 'Best Prompt',
550
+ 'Best Prompt Id']]
551
+ )
552
+
553
+ with gr.TabItem("β”‚", interactive=False):
554
+ gr.Markdown("", elem_classes="markdown-text")
555
+
556
+ # Generative tasks
557
+ for task, metadata in TASK_METADATA_GENERATIVE.items():
558
+ with gr.TabItem(f"{metadata['icon']}{task}"):
559
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
560
+ gr.Markdown(task_description, elem_classes="markdown-text")
561
+
562
+ leaderboard = update_task_leaderboard(
563
+ LEADERBOARD_DF.rename(columns={
564
+ f"{task} Prompt Average": "Prompt Average",
565
+ f"{task} Prompt Std": "Prompt Std",
566
+ f"{task} Best Prompt": "Best Prompt",
567
+ f"{task} Best Prompt Id": "Best Prompt Id",
568
+ task: "Combined Performance"
569
+ }),
570
+ default_selection=['Rank', 'Size', 'FS', 'Model', 'Combined Performance', 'Prompt Average',
571
+ 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
572
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
573
+ col not in ['Rank', 'Size', 'FS', 'Model', 'Combined Performance',
574
+ 'Prompt Average', 'Prompt Std', 'Best Prompt',
575
+ 'Best Prompt Id']]
576
+ )
577
+
578
+ # Citation and Credits sections
579
+ with gr.Accordion("πŸ“™ Citation", open=False):
580
+ gr.Textbox(
581
+ value=CITATION_BUTTON_TEXT,
582
+ label=CITATION_BUTTON_LABEL,
583
+ lines=20,
584
+ elem_id="citation-button",
585
+ show_copy_button=True
586
  )
587
 
588
+ with gr.Accordion("πŸ“™ Credits", open=False):
589
+ gr.Markdown(create_credits_markdown())
590
+
591
+ return demo
592
+
593
+
594
+ # Create and configure the demo
595
+ demo = create_gradio_interface()
 
 
596
 
597
+ # Background scheduler for space restart
598
  scheduler = BackgroundScheduler()
599
  scheduler.add_job(restart_space, "interval", seconds=1800)
600
  scheduler.start()
601
 
602
+ # Launch configuration
603
+ if __name__ == "__main__":
604
+ demo.queue(default_concurrency_limit=40).launch(
605
+ debug=True,
606
+ show_error=True
607
+ )
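
Note on the refactored app.py above: `theoretical_performance(df_hash)` is an acknowledged placeholder that returns a hard-coded 75.0, whereas the removed `mean_of_max_per_field(df)` actually computed the banner value as the mean of the per-task maxima. Below is a minimal sketch of how the placeholder could be filled in, assuming the leaderboard DataFrame still exposes the task columns used throughout this diff (TE, SA, HS, AT, WIC, FAQ, LS, SU, NER, REL); the helper name and the direct DataFrame argument are illustrative and are not part of this commit.

import pandas as pd

TASK_COLS = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]

def theoretical_performance_from_df(df: pd.DataFrame) -> float:
    """Mean of the per-task maxima, mirroring the removed mean_of_max_per_field."""
    present = [c for c in TASK_COLS if c in df.columns]
    if not present:
        return 0.0  # no task columns available in the DataFrame
    # Best score reached on each task by any model, averaged across tasks.
    return float(df[present].max().mean())

Calling this from initialize_app (e.g. theoretical_max = theoretical_performance_from_df(leaderboard_df)) would keep the "Theoretical performance of a model that scores the highest on every individual task" banner consistent with the loaded data instead of the fixed 75.0.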
app_18_09_2025.py ADDED
@@ -0,0 +1,823 @@
 
 
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+ from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
7
+ from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
8
+ from src.display.css_html_js import custom_css
9
+ from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision
10
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
11
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
12
+ from src.submission.submit import add_new_eval
13
+ import random
14
+ import matplotlib.pyplot as plt
15
+ import re
16
+ import plotly.express as px
17
+ import plotly.graph_objects as go
18
+ import numpy as np
19
+
20
+
21
+ def mean_of_max_per_field(df):
22
+ """
23
+ Compute the maximum for each field and then the mean of those maxima.
24
+
25
+ Args:
26
+ df (pd.DataFrame): DataFrame with columns TE, SA, HS, AT, WIC, FAQ, LS, SU, NER, REL
27
+
28
+ Returns:
29
+ float: mean of the per-field maximum values
30
+ """
31
+ fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
32
+
33
+ #print(df.columns)
34
+
35
+ # Check that all required columns exist in the DataFrame
36
+ missing = [f for f in fields if f not in df.columns]
37
+ if missing:
38
+ raise ValueError(f"The following columns are missing from the DataFrame: {missing}")
39
+
40
+ # Compute the maximum for each field
41
+ max_values = df[fields].max()
42
+
43
+ # Compute the mean of the maxima
44
+ mean_max = max_values.mean()
45
+
46
+ return mean_max
47
+
48
+
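# Illustrative sketch with invented numbers: what mean_of_max_per_field computes,
# i.e. the per-task column maxima followed by the mean of those maxima.
# Hypothetical helper, not called anywhere in the app.
def _demo_mean_of_max_per_field():
    toy = pd.DataFrame({
        "TE": [60.0, 70.0], "SA": [50.0, 80.0], "HS": [40.0, 90.0], "AT": [55.0, 65.0],
        "WIC": [45.0, 75.0], "FAQ": [35.0, 85.0], "LS": [30.0, 60.0], "SU": [25.0, 55.0],
        "NER": [20.0, 50.0], "REL": [15.0, 45.0],
    })
    # Column maxima: 70, 80, 90, 65, 75, 85, 60, 55, 50, 45 -> mean 67.5
    return mean_of_max_per_field(toy)  # 67.5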
49
+ def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
50
+ if tasks is None:
51
+ tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
52
+
53
+ task_means = {}
54
+
55
+ for task in tasks:
56
+ if task not in dataframe.columns:
57
+ continue
58
+
59
+ # Split into few-shot and zero-shot
60
+ few_shot = dataframe[dataframe['IS_FS'] == True][["Model", task]]
61
+ zero_shot = dataframe[dataframe['IS_FS'] == False][["Model", task]]
62
+
63
+ # Align the models
64
+ merged = pd.merge(few_shot, zero_shot, on="Model", suffixes=("_few", "_zero"))
65
+
66
+ # Drop rows with missing values
67
+ merged = merged.dropna(subset=[f"{task}_few", f"{task}_zero"])
68
+
69
+ if merged.empty:
70
+ continue
71
+
72
+ # Compute the few-shot minus zero-shot difference
73
+ diff = merged[f"{task}_few"] - merged[f"{task}_zero"]
74
+
75
+ # Compute the mean
76
+ task_means[task] = diff.mean()
77
+
78
+ # Build the bar plot
79
+ fig = go.Figure([go.Bar(
80
+ x=list(task_means.keys()),
81
+ y=list(task_means.values()),
82
+ marker_color="#ff7f0e",
83
+ text=[f"{v:.2f}" for v in task_means.values()],
84
+ textposition="outside",
85
+ hovertemplate="<b>%{x}</b><br>Mean Delta Accuracy: %{y:.2f}%<extra></extra>"
86
+ )])
87
+
88
+ # Reference line at 0 (currently disabled)
89
+ '''
90
+ fig.add_shape(
91
+ type="line",
92
+ x0=-0.5, x1=len(task_means) - 0.5,
93
+ y0=0, y1=0,
94
+ line=dict(color="black", width=2, dash="dash"),
95
+ xref="x", yref="y"
96
+ )
97
+ '''
98
+
99
+ fig.update_layout(
100
+ title="Mean Accuracy Difference (Few-shot βˆ’ Zero-shot) per Task",
101
+ xaxis_title="",
102
+ yaxis_title="Mean Delta Combined Performance",
103
+ template="plotly_white",
104
+ font=dict(family="Arial", size=13),
105
+ #margin=dict(b=100)
106
+ )
107
+
108
+ fig.add_annotation(
109
+ text="5-shot learning generally outperforms zero-shot, especially in tasks like NER and REL.<br>"
110
+ "Only in Summarization (SU) does it provide no accuracy gain.",
111
+ xref="paper", yref="paper",
112
+ x=0, y=-0.2,
113
+ showarrow=False,
114
+ font=dict(size=11, color="gray"),
115
+ align="left"
116
+ )
117
+
118
+ return fig
119
+
120
+
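# Illustrative sketch with invented numbers: the pairing used above for a single
# task. Rows of the same model are matched across IS_FS values and the mean
# few-shot minus zero-shot gap is taken. Hypothetical helper, not called anywhere.
def _demo_few_minus_zero_delta():
    toy = pd.DataFrame({
        "Model": ["A", "A", "B", "B"],
        "IS_FS": [True, False, True, False],
        "TE":    [72.0, 65.0, 58.0, 60.0],
    })
    few = toy[toy["IS_FS"] == True][["Model", "TE"]]
    zero = toy[toy["IS_FS"] == False][["Model", "TE"]]
    merged = pd.merge(few, zero, on="Model", suffixes=("_few", "_zero"))
    return (merged["TE_few"] - merged["TE_zero"]).mean()  # (7 + (-2)) / 2 = 2.5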
121
+ def boxplot_per_task(dataframe=None, baselines=None, references=None):
122
+
123
+ #print(dataframe.columns)
124
+
125
+ tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
126
+
127
+ if dataframe is None:
128
+ np.random.seed(42)
129
+ dataframe = pd.DataFrame({
130
+ task: np.random.uniform(0.4, 0.9, 20) * 100
131
+ for task in tasks
132
+ })
133
+
134
+ if baselines is None:
135
+ baselines = {task: np.random.randint(50, 70) for task in tasks}
136
+
137
+ colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
138
+ "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
139
+
140
+ fig = go.Figure()
141
+
142
+ for i, task in enumerate(tasks):
143
+ if task in dataframe.columns:
144
+ y_data = dataframe[task].dropna().tolist()
145
+
146
+ # boxplot
147
+ fig.add_trace(go.Box(
148
+ y=y_data,
149
+ name=task,
150
+ marker=dict(color=colors[i]),
151
+ line=dict(color="black", width=2),
152
+ fillcolor=colors[i],
153
+ opacity=0.7,
154
+ hovertemplate="<b>"+task+"</b><br>Accuracy: %{y:.2f}%<extra></extra>",
155
+ width=0.6,
156
+ whiskerwidth=0.2,
157
+ quartilemethod="linear"
158
+ ))
159
+
160
+ # baseline
161
+ if task in baselines and baselines[task] is not None:
162
+ fig.add_shape(
163
+ type="line",
164
+ x0=i - 0.3, x1=i + 0.3,
165
+ y0=baselines[task], y1=baselines[task],
166
+ line=dict(color="black", width=2, dash="dot"),  # dotted, more visible
167
+ xref="x", yref="y"
168
+ )
169
+ '''
170
+ fig.add_annotation(
171
+ x=i, y=baselines[task],
172
+ text=f"{baselines[task]}%",
173
+ showarrow=False,
174
+ yshift=10,
175
+ font=dict(size=10, color="black")
176
+ )
177
+ '''
178
+
179
+ # reference GPT-4o
180
+ if references and task in references and references[task] is not None:
181
+ fig.add_shape(
182
+ type="line",
183
+ x0=i - 0.3, x1=i + 0.3,
184
+ y0=references[task], y1=references[task],
185
+ line=dict(color="red", width=2, dash="dashdot"),
186
+ xref="x", yref="y"
187
+ )
188
+
189
+ fig.update_layout(
190
+ title="Distribution of Model Accuracy by Task",
191
+ xaxis_title="Task",
192
+ yaxis_title="Combined Performance",
193
+ template="plotly_white",
194
+ boxmode="group",
195
+ dragmode=False,
196
+ font=dict(family="Arial", size=10),
197
+ margin=dict(b=80),
198
+ )
199
+
200
+ fig.add_annotation(
201
+ text=(
202
+ "In tasks like TE and SA, models approach the accuracy of supervised <br>"
203
+ "models at EVALITA (dashed black line); in NER and REL they remain lower. <br>"
204
+ "Dashed red lines show GPT-4o reference results for generative tasks."
205
+ ),
206
+ xref="paper", yref="paper",
207
+ x=0.5, y=-0.30,
208
+ showarrow=False,
209
+ font=dict(size=11, color="gray"),
210
+ align="left"
211
+ )
212
+
213
+ fig.update_yaxes(range=[0, 100], fixedrange=True)
214
+
215
+ return fig
216
+
217
+ # EVALITA results
218
+ BASELINES = {
219
+ "TE":71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00,
220
+ "LS": 38.82, "SU": 38.91, "NER":88.00, "REL": 62.99
221
+ }
222
+
223
+ # GPT-4o
224
+ REFERENCES = {
225
+ "NER": 79.11,
226
+ "REL": 63.32,
227
+ "LS": 59.25,
228
+ "SU": 33.04
229
+
230
+ }
231
+
232
+
233
+ def boxplot_prompts_per_task(dataframe, tasks=None):
234
+ if tasks is None:
235
+ tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
236
+
237
+ # Columns to update
238
+ cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"]
239
+ # Apply the transformation
240
+ for col in cols_to_update:
241
+ dataframe[col] = dataframe[col].replace({1: 7, 2: 8})
242
+
243
+ fig = go.Figure()
244
+
245
+ # Lists used to create a single legend entry each for Average and Best
246
+ avg_x, avg_y = [], []
247
+ best_x, best_y, best_text = [], [], []
248
+
249
+ for task in tasks:
250
+ avg_col = f"{task} Prompt Average"
251
+ best_col = f"{task} Best Prompt"
252
+ best_id_col = f"{task} Best Prompt Id"
253
+
254
+ if all(col in dataframe.columns for col in [avg_col, best_col, best_id_col]):
255
+ avg_value = dataframe[avg_col].mean()
256
+ avg_x.append(task)
257
+ avg_y.append(avg_value)
258
+
259
+ best_value = dataframe[best_col].mean()
260
+ best_x.append(task)
261
+ best_y.append(best_value)
262
+ best_id = dataframe[best_id_col].mode()[0] # Most frequent best prompt id
263
+ best_text.append(f"P:{best_id}")
264
+
265
+ # Average Accuracy bars (blue)
266
+ fig.add_trace(go.Bar(
267
+ x=avg_x,
268
+ y=avg_y,
269
+ name="Avg. Accuracy",
270
+ marker_color="#1f77b4",
271
+ #hovertemplate="%{y:.2f}%<extra></extra>"
272
+ #hovertemplate="<b>" + task + "</b><br>Accuracy: %{y:.2f}%<extra></extra>",
273
+ ))
274
+
275
+ # Best Prompt bars (red)
276
+ fig.add_trace(go.Bar(
277
+ x=best_x,
278
+ y=best_y,
279
+ name="Best Prompt",
280
+ marker_color="#d62728",
281
+ #hovertemplate="%{y:.2f}%<extra></extra>"
282
+ #hovertemplate = "<b>" + task + "</b><br>Accuracy: %{y:.2f}%<extra></extra>",
283
+ ))
284
+
285
+ # Text above the Best Prompt bars showing the prompt ID
286
+ for x, y, text in zip(best_x, best_y, best_text):
287
+ fig.add_annotation(
288
+ x=x,
289
+ y=y + 3,  # slightly above the bar
290
+ text=text,
291
+ showarrow=False,
292
+ font=dict(size=12, color="black")
293
+ )
294
+
295
+ fig.update_layout(
296
+ title= "Prompt Accuracy: Avg vs Best",
297
+ xaxis_title="Task",
298
+ yaxis_title="Combined Performance",
299
+ barmode='group',
300
+ template="plotly_white",
301
+ font=dict(family="Arial", size=10),
302
+ yaxis=dict(range=[0, 100], fixedrange=True)
303
+ )
304
+
305
+ # Caption as a separate annotation
306
+ fig.add_annotation(
307
+ text="There is no single prompt that performs best across all tasks.<br>"
308
+ "Different prompts achieve the highest accuracy on different tasks.",
309
+ xref="paper", yref="paper",
310
+ x=0.5, y=-0.3,
311
+ showarrow=False,
312
+ font=dict(size=11, color="gray"),
313
+ align="center",
314
+ xanchor="center"
315
+ )
316
+
317
+ return fig
318
+
319
+
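# Illustrative sketch with invented numbers: how each pair of bars above is
# derived from the leaderboard columns of one task. Hypothetical helper, not
# called anywhere.
def _demo_prompt_bars_for_one_task():
    toy = pd.DataFrame({
        "TE Prompt Average": [60.0, 70.0],
        "TE Best Prompt": [75.0, 85.0],
        "TE Best Prompt Id": [3, 3],
    })
    avg_bar = toy["TE Prompt Average"].mean()      # 65.0 -> blue bar
    best_bar = toy["TE Best Prompt"].mean()        # 80.0 -> red bar
    best_id = toy["TE Best Prompt Id"].mode()[0]   # 3    -> "P:3" annotation
    return avg_bar, best_bar, best_id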
320
+ def line_chart(dataframe):
321
+
322
+ # Normalize sizes so markers are neither too small nor huge
323
+ def scale_sizes(values, min_size=8, max_size=30):
324
+ vmin, vmax = min(values), max(values)
325
+ return [
326
+ min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size) if vmax > vmin else (min_size + max_size) / 2
327
+ for val in values
328
+ ]
329
+
330
+ # Split the data by IS_FS
331
+ df_true = dataframe[dataframe['IS_FS'] == True]
332
+ df_false = dataframe[dataframe['IS_FS'] == False]
333
+
334
+ # Extract x, y values and labels
335
+ x_true = df_true['#Params (B)'].tolist()
336
+ y_true = df_true['Avg. Comb. Perf. ⬆️'].tolist()
337
+ labels_true = [re.search(r'>([^<]+)<', m).group(1) for m in df_true['Model'].tolist()]
338
+
339
+ x_false = df_false['#Params (B)'].tolist()
340
+ y_false = df_false['Avg. Comb. Perf. ⬆️'].tolist()
341
+ labels_false = [re.search(r'>([^<]+)<', m).group(1) for m in df_false['Model'].tolist()]
342
+
343
+ fig = go.Figure()
344
+
345
+ # Points with IS_FS=True
346
+ fig.add_trace(go.Scatter(
347
+ x=x_true,
348
+ y=y_true,
349
+ mode='markers',
350
+ name='5-Shot',
351
+ marker=dict(
352
+ color='blue',
353
+ size=scale_sizes(x_true)
354
+ ),
355
+ hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
356
+ customdata=labels_true
357
+ ))
358
+
359
+ # Points with IS_FS=False
360
+ fig.add_trace(go.Scatter(
361
+ x=x_false,
362
+ y=y_false,
363
+ mode='markers',
364
+ name='0-Shot',
365
+ marker=dict(
366
+ color='red',
367
+ size=scale_sizes(x_false)
368
+ ),
369
+ hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
370
+ customdata=labels_false
371
+ ))
372
+
373
+ # Find the best-performing model overall
374
+ all_y = y_true + y_false
375
+ all_x = x_true + x_false
376
+ all_labels = labels_true + labels_false
377
+ max_idx = all_y.index(max(all_y))
378
+ max_x = all_x[max_idx]
379
+ max_y = all_y[max_idx]
380
+ max_label = all_labels[max_idx]
381
+
382
+ # Add a visible annotation for the best model
383
+ fig.add_annotation(
384
+ x=max_x,
385
+ y=max_y,
386
+ #text=f"Top: {max_label} ({max_y:.1f}%)",
387
+ text=f"{max_label}",
388
+ showarrow=True,
389
+ arrowhead=2,
390
+ arrowsize=1,
391
+ arrowwidth=2,
392
+ arrowcolor="black",
393
+ font=dict(size=11, color="black"),
394
+ xshift=10,
395
+ yshift=10,
396
+ ax=-30, ay=-20,  # shift the label to the left of and above the point
397
+ xanchor="right"  # right-align the label relative to the point
398
+ )
399
+
400
+ fig.update_layout(
401
+ title="Avg. Combined Performance vs #Params",
402
+ xaxis_title="#Params (B)",
403
+ yaxis_title="Avg. Combined Performance",
404
+ template="plotly_white",
405
+ hovermode="closest",
406
+ font=dict(family="Arial", size=10),
407
+ dragmode=False,
408
+ xaxis=dict(
409
+ tickvals=[0, 25, 50, 75, 100, 125],
410
+ ticktext=["0", "25", "50", "75", "100"]
411
+ ),
412
+ yaxis=dict(
413
+ tickvals=[0, 20, 40, 60, 80, 100],  # fixed ticks
414
+ range=[0, 100]  # locked range
415
+ )
416
+ )
417
+
418
+ # Caption
419
+ fig.add_annotation(
420
+ text="Accuracy generally rises with #Params, but smaller models <br>"
421
+ "with 5-shot can outperform larger zero-shot models.",
422
+ xref="paper", yref="paper",
423
+ x=0.5, y=-0.3,  # centered
424
+ showarrow=False,
425
+ font=dict(size=11, color="gray"),
426
+ align="center",
427
+ xanchor="center"  # keep the annotation centered on its text
428
+ )
429
+
430
+ fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
431
+ fig.update_yaxes(fixedrange=True)
432
+
433
+ return fig
434
+
435
+
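# Illustrative sketch: the "Model" column stores an HTML anchor tag, so the chart
# code above recovers the display name with re.search(r'>([^<]+)<', ...). The cell
# value below is invented; the helper is hypothetical and not called anywhere.
def _demo_model_name_extraction():
    cell = '<a href="https://huggingface.co/org/model">org/model</a>'
    return re.search(r'>([^<]+)<', cell).group(1)  # -> "org/model"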
436
+ # Define task metadata (icons, names, descriptions)
437
+ TASK_METADATA_MULTIPLECHOICE = {
438
+ "TE": {"icon": "πŸ“Š", "name": "Textual Entailment", "tooltip": ""},
439
+ "SA": {"icon": "πŸ˜ƒ", "name": "Sentiment Analysis", "tooltip": ""},
440
+ "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
441
+ "AT": {"icon": "πŸ₯", "name": "Admission Test", "tooltip": ""},
442
+ "WIC": {"icon": "πŸ”€", "name": "Word in Context", "tooltip": ""},
443
+ "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""}
444
+ }
445
+
446
+ # Define task metadata (icons, names, descriptions)
447
+ TASK_METADATA_GENERATIVE = {
448
+ "LS": {"icon": "πŸ”„", "name": "Lexical Substitution", "tooltip": ""},
449
+ "SU": {"icon": "πŸ“", "name": "Summarization", "tooltip": ""},
450
+ "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
451
+ "REL": {"icon": "πŸ”—", "name": "Relation Extraction", "tooltip": ""},
452
+ }
453
+
454
+ def restart_space():
455
+ """Restart the Hugging Face space."""
456
+ API.restart_space(repo_id=REPO_ID)
457
+
458
+
459
+ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
460
+ """
461
+ Initialize and return the leaderboard when it is first loaded or when 'benchmark' is selected.
462
+ The table is sorted based on the "Avg. Combined Performance" field.
463
+ """
464
+ if dataframe is None or dataframe.empty:
465
+ raise ValueError("Leaderboard DataFrame is empty or None.")
466
+
467
+ #print("????????????????????????????????", mean_of_max_per_field(dataframe))
468
+
469
+ sorted_dataframe = dataframe.sort_values(by="Avg. Comb. Perf. ⬆️", ascending=False)
470
+
471
+ sorted_dataframe = sorted_dataframe.reset_index(drop=True)
472
+ sorted_dataframe["Rank"] = sorted_dataframe.index + 1
473
+
474
+ # Flags tracking whether the medal has already been assigned per size category and evaluation type
475
+ large_medal_fs_assigned = False
476
+ medium_medal_fs_assigned = False
477
+ small_medal_fs_assigned = False
478
+
479
+ large_medal_0shot_assigned = False
480
+ medium_medal_0shot_assigned = False
481
+ small_medal_0shot_assigned = False
482
+
483
+ # Temporary list holding the new values of the Model column
484
+ new_model_column = []
485
+
486
+ for _, row in sorted_dataframe.iterrows():
487
+ if row['IS_FS']: # 5-Few-Shot
488
+ if row["Size"] == "πŸ”΅πŸ”΅πŸ”΅" and not large_medal_fs_assigned:
489
+ new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸ”΅πŸ†")
490
+ large_medal_fs_assigned = True
491
+ elif row["Size"] == "πŸ”΅πŸ”΅" and not medium_medal_fs_assigned:
492
+ new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸ†")
493
+ medium_medal_fs_assigned = True
494
+ elif row["Size"] == "πŸ”΅" and not small_medal_fs_assigned:
495
+ new_model_column.append(f"{row['Model']} πŸ”΅πŸ†")
496
+ small_medal_fs_assigned = True
497
+ else:
498
+ new_model_column.append(row["Model"])
499
+ else: # 0-Shot
500
+ if row["Size"] == "πŸ”΅πŸ”΅πŸ”΅" and not large_medal_0shot_assigned:
501
+ new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸ”΅πŸŽ–οΈ")
502
+ large_medal_0shot_assigned = True
503
+ elif row["Size"] == "πŸ”΅πŸ”΅" and not medium_medal_0shot_assigned:
504
+ new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸŽ–οΈ")
505
+ medium_medal_0shot_assigned = True
506
+ elif row["Size"] == "πŸ”΅" and not small_medal_0shot_assigned:
507
+ new_model_column.append(f"{row['Model']} πŸ”΅πŸŽ–οΈ")
508
+ small_medal_0shot_assigned = True
509
+ else:
510
+ new_model_column.append(row["Model"])
511
+
512
+ # Columns to update
513
+ #cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"]
514
+ # Apply the transformation
515
+ #for col in cols_to_update:
516
+ # dataframe[col] = dataframe[col].replace({1: 7, 2: 8})
517
+
518
+ # Update the Model column
519
+ sorted_dataframe["Model"] = new_model_column
520
+
521
+ field_list = fields(AutoEvalColumn)
522
+
523
+ return Leaderboard(
524
+ value=sorted_dataframe,
525
+ datatype=[c.type for c in field_list],
526
+ #select_columns=SelectColumns(
527
+ # default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
528
+ # cant_deselect=[c.name for c in field_list if c.never_hidden],
529
+ # label="Select Columns to Display:",
530
+ #),
531
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
532
+ hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
533
+ filter_columns=[
534
+ ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
535
+ #ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)",
536
+ # default=[["0️⃣", "0️⃣"]]),
537
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max = 100, default = [0,100], label="Select the number of parameters (B)"),
538
+ ],
539
+ #filter_columns=[
540
+ # ColumnFilter("IS_FS", type="checkbox", default=False, label="5-Few-Shot")
541
+ # #ColumnFilter("FS", type="dropdown", label="5-Few-Shot")
542
+ #],
543
+ bool_checkboxgroup_label="Evaluation Mode",
544
+ interactive=False,
545
+ )
546
+
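# Illustrative sketch with invented rows: the medal logic shared by
# init_leaderboard and update_task_leaderboard, reduced to its core idea. On a
# frame already sorted best-first, only the first row of each (Size, IS_FS)
# group gets a badge (πŸ† for few-shot, πŸŽ–οΈ for zero-shot); the real code also
# keeps the size emoji in the badge. Hypothetical helper, not called anywhere.
def _demo_medal_assignment():
    toy = pd.DataFrame({
        "Model": ["m1", "m2", "m3"],
        "Size": ["πŸ”΅πŸ”΅πŸ”΅", "πŸ”΅πŸ”΅πŸ”΅", "πŸ”΅"],
        "IS_FS": [True, True, False],
    })
    seen = set()
    decorated = []
    for _, row in toy.iterrows():
        key = (row["Size"], row["IS_FS"])
        badge = ("πŸ†" if row["IS_FS"] else "πŸŽ–οΈ") if key not in seen else ""
        seen.add(key)
        decorated.append(row["Model"] + badge)
    return decorated  # ["m1πŸ†", "m2", "m3πŸŽ–οΈ"]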
547
+ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
548
+ """
549
+ Update and return the leaderboard when a specific task is selected.
550
+ The table is sorted based on the "Combined Performance" field.
551
+ """
552
+ if dataframe is None or dataframe.empty:
553
+ raise ValueError("Leaderboard DataFrame is empty or None.")
554
+
555
+ sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False)
556
+
557
+ # Add the Rank column based on position
558
+ sorted_dataframe = sorted_dataframe.reset_index(drop=True)
559
+ sorted_dataframe["Rank"] = sorted_dataframe.index + 1
560
+
561
+ # Flags tracking whether the medal has already been assigned per size category and evaluation type
562
+ large_medal_fs_assigned = False
563
+ medium_medal_fs_assigned = False
564
+ small_medal_fs_assigned = False
565
+
566
+ large_medal_0shot_assigned = False
567
+ medium_medal_0shot_assigned = False
568
+ small_medal_0shot_assigned = False
569
+
570
+ # Temporary list holding the new values of the Model column
571
+ new_model_column = []
572
+
573
+ for _, row in sorted_dataframe.iterrows():
574
+ if row['IS_FS']: # 5-Few-Shot
575
+ if row["Size"] == "πŸ”΅πŸ”΅πŸ”΅" and not large_medal_fs_assigned:
576
+ new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸ”΅πŸ†")
577
+ large_medal_fs_assigned = True
578
+ elif row["Size"] == "πŸ”΅πŸ”΅" and not medium_medal_fs_assigned:
579
+ new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸ†")
580
+ medium_medal_fs_assigned = True
581
+ elif row["Size"] == "πŸ”΅" and not small_medal_fs_assigned:
582
+ new_model_column.append(f"{row['Model']} πŸ”΅πŸ†")
583
+ small_medal_fs_assigned = True
584
+ else:
585
+ new_model_column.append(row["Model"])
586
+ else: # 0-Shot
587
+ if row["Size"] == "πŸ”΅πŸ”΅πŸ”΅" and not large_medal_0shot_assigned:
588
+ new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸ”΅πŸŽ–οΈ")
589
+ large_medal_0shot_assigned = True
590
+ elif row["Size"] == "πŸ”΅πŸ”΅" and not medium_medal_0shot_assigned:
591
+ new_model_column.append(f"{row['Model']} πŸ”΅πŸ”΅πŸŽ–οΈ")
592
+ medium_medal_0shot_assigned = True
593
+ elif row["Size"] == "πŸ”΅" and not small_medal_0shot_assigned:
594
+ new_model_column.append(f"{row['Model']} πŸ”΅πŸŽ–οΈ")
595
+ small_medal_0shot_assigned = True
596
+ else:
597
+ new_model_column.append(row["Model"])
598
+
599
+ # Update the Model column
600
+ sorted_dataframe["Model"] = new_model_column
601
+
602
+ pd.set_option('display.max_colwidth', None)
603
+ #print("========================", dataframe['Model'])
604
+
605
+ #print(sorted_dataframe['Combined Performance'])
606
+
607
+ field_list = fields(AutoEvalColumn)
608
+
609
+ return Leaderboard(
610
+ value=sorted_dataframe,
611
+ #datatype=[c.type for c in field_list],
612
+ datatype=[c.type for c in field_list] + [int],
613
+ #select_columns=SelectColumns(
614
+ # default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
615
+ # cant_deselect=[c.name for c in field_list if c.never_hidden],
616
+ # label="Select Columns to Display:",
617
+ #),
618
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
619
+ hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
620
+ filter_columns=[
621
+ ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
622
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
623
+ label="Select the number of parameters (B)"),
624
+ ],
625
+ bool_checkboxgroup_label="Evaluation Mode",
626
+ interactive=False
627
+ )
628
+
629
+ '''
630
+ # Helper function for leaderboard initialization
631
+ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
632
+ """Initialize and return a leaderboard."""
633
+ if dataframe is None or dataframe.empty:
634
+ raise ValueError("Leaderboard DataFrame is empty or None.")
635
+
636
+ return Leaderboard(
637
+ value=dataframe,
638
+ datatype=[c.type for c in fields(AutoEvalColumn)],
639
+ select_columns=SelectColumns(
640
+ default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
641
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
642
+ label="Select Columns to Display:",
643
+ ),
644
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
645
+ hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
646
+ filter_columns=[
647
+ ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
648
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
649
+ ],
650
+ bool_checkboxgroup_label="Hide models",
651
+ interactive=False,
652
+ )
653
+ '''
654
+
655
+ def download_snapshot(repo, local_dir):
656
+ """Try to download a snapshot from Hugging Face Hub."""
657
+ try:
658
+ print(f"Downloading from {repo} to {local_dir}...")
659
+ snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN)
660
+ except Exception as e:
661
+ print(f"Error downloading {repo}: {e}")
662
+ restart_space()
663
+
664
+
665
+ # Initialize the app by downloading snapshots
666
+ download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
667
+ download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
668
+
669
+ # Load leaderboard data
670
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
671
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
672
+ #print(LEADERBOARD_DF.columns.tolist())
673
+
674
+ theoretical_max_combined_perf = mean_of_max_per_field(LEADERBOARD_DF)
675
+
676
+ # Prepare the main interface
677
+ demo = gr.Blocks(css=custom_css)
678
+ with demo:
679
+ #gr.HTML(TITLE)
680
+ gr.HTML(
681
+ """
682
+ <div style="display: flex; align-items: center; position: relative; width: 100%; height: 60px; padding: 10px 0;">
683
+ <h1 style="
684
+ margin: 0 auto;
685
+ font-weight: 900;
686
+ font-size: 2.5em;
687
+ letter-spacing: 2px;
688
+ text-transform: uppercase;
689
+ background: linear-gradient(90deg, #1f77b4, #00c6ff);
690
+ -webkit-background-clip: text;
691
+ -webkit-text-fill-color: transparent;
692
+ text-shadow: 2px 2px 8px rgba(0,0,0,0.2);
693
+ ">
694
+ EVALITA-LLM Leaderboard
695
+ </h1>
696
+ <a href="https://huggingface.co/spaces/mii-llm/open_ita_llm_leaderboard" target="_blank"
697
+ style="position: absolute; right: 0; display: inline-flex; align-items: center; gap: 6px; text-decoration: none; color: #1f77b4; font-weight: 600;">
698
+ <!-- Stylized icon -->
699
+ <svg xmlns="http://www.w3.org/2000/svg" width="22" height="22" fill="#1f77b4" viewBox="0 0 24 24">
700
+ <path d="M3.9 12a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42a3 3 0 1 0 4.24 4.24l3.54-3.54a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42z"/>
701
+ <path d="M20.1 12a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42a3 3 0 1 0-4.24-4.24l-3.54 3.54a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42z"/>
702
+ </svg>
703
+ Open Italian LLM Leaderboard
704
+ </a>
705
+ </div>
706
+ """
707
+ )
708
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
709
+
710
+ # ⬇️ Add the charts here, right below the title bar and above the tabs
711
+ with gr.Row():
712
+ gr.Plot(value=line_chart(LEADERBOARD_DF), elem_id="line-chart")
713
+ gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task")
714
+ #gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF), elem_id="boxplot-prompt-task")
715
+
716
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
717
+
718
+ # Main leaderboard tab
719
+ with gr.TabItem("πŸ… Benchmark"):
720
+
721
+ leaderboard = init_leaderboard(
722
+ LEADERBOARD_DF,
723
+ default_selection=['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
724
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
725
+ )
726
+
727
+ gr.HTML(
728
+ f"""
729
+ <div style="
730
+ border: 2px solid #1f77b4;
731
+ border-radius: 10px;
732
+ padding: 10px;
733
+ background-color: #f0f8ff;
734
+ font-weight: bold;
735
+ font-size: 14px;
736
+ display: inline-block;
737
+ ">
738
+ Theoretical performance of a model that scores the highest on every individual task: <span style="color:#d62728; font-size:18px;">{theoretical_max_combined_perf:.2f}</span>
739
+ </div>
740
+ """
741
+ )
742
+
743
+ '''
744
+ with gr.TabItem("πŸ“ˆ Charts"):
745
+ #gr.Plot(value=line_chart(LEADERBOARD_DF), label="Sample trend")
746
+ #gr.Plot(value=line_chart_interactive_test(), label="Interactive trend")
747
+ gr.Plot(value=line_chart(LEADERBOARD_DF))
748
+ gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES))
749
+ gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))
750
+ gr.Plot(value=barplot_mean_few_minus_zero_shot(LEADERBOARD_DF))
751
+ '''
752
+
753
+ # About tab
754
+ with gr.TabItem("πŸ“ About"):
755
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
756
+
757
+ # About tab
758
+ with gr.TabItem("β•‘", interactive=False):
759
+ gr.Markdown("", elem_classes="markdown-text")
760
+
761
+
762
+ # Task-specific leaderboards
763
+ for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
764
+
765
+ with gr.TabItem(f"{metadata['icon']}{task}"):
766
+
767
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
768
+ gr.Markdown(task_description, elem_classes="markdown-text")
769
+
770
+ leaderboard = update_task_leaderboard(
771
+ LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Prompt Std": "Prompt Std", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}),
772
+ default_selection=['Rank', 'Size', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
773
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id']]
774
+ )
775
+
776
+ # About tab
777
+ with gr.TabItem("β”‚", interactive=False):
778
+ gr.Markdown("", elem_classes="markdown-text")
779
+
780
+ # Task-specific leaderboards
781
+ for task, metadata in TASK_METADATA_GENERATIVE.items():
782
+ with gr.TabItem(f"{metadata['icon']}{task}"):
783
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
784
+ gr.Markdown(task_description, elem_classes="markdown-text")
785
+
786
+ leaderboard = update_task_leaderboard(
787
+ LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average",
788
+ f"{task} Prompt Std": "Prompt Std",
789
+ f"{task} Best Prompt": "Best Prompt",
790
+ f"{task} Best Prompt Id": "Best Prompt Id",
791
+ task: "Combined Performance"}),
792
+ default_selection=['Rank', 'Size', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt',
793
+ 'Best Prompt Id'],
794
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
795
+ col not in ['Rank', 'Size', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std',
796
+ 'Best Prompt', 'Best Prompt Id']]
797
+ )
798
+
799
+ # Citation section
800
+ with gr.Accordion("πŸ“™ Citation", open=False):
801
+ gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)
802
+
803
+ with gr.Accordion("πŸ“™ Credits", open=False):
804
+ gr.Markdown(
805
+ """
806
+ **This project has benefited from the following support:**
807
+
808
+ - 🧠 **Codebase**: Based on and extended from the Open Italian LLM Leaderboard, developed by **Alessandro Ercolani** and **Samuele Colombo**. We warmly thank them for their invaluable support and guidance in implementing this leaderboard.
809
+
810
+ - πŸ’Ά **Funding**: Partially supported by the PNRR project **FAIR - Future AI Research (PE00000013)**, under the NRRP MUR program funded by **NextGenerationEU**.
811
+
812
+ - πŸ–₯️ **Computation**: We gratefully acknowledge **CINECA** for granting access to the **LEONARDO** supercomputer.
813
+ """
814
+ )
815
+
816
+ # Background job to restart space
817
+ scheduler = BackgroundScheduler()
818
+ scheduler.add_job(restart_space, "interval", seconds=1800)
819
+ scheduler.start()
820
+
821
+ # Launch the app with concurrent queueing
822
+ demo.queue(default_concurrency_limit=40).launch(debug=True, # Enable Gradio debug mode
823
+ show_error=True)