rzanoli committed on
Commit 7b2afd4 · 1 Parent(s): e69ebae

Add heatmap and model comparison table

Files changed (2)
  1. app.py +230 -4
  2. app_30_09_2025.py +758 -0
app.py CHANGED
@@ -111,6 +111,227 @@ def validate_and_submit_request(model_name, user_email, user_affiliation):
111
  # Return the Slack response (success or failure message)
112
  return slack_response
113
 
114
  def highlight_best_per_task(df):
115
  """Add 🟡 symbol next to the maximum value in each task column"""
116
 
@@ -315,9 +536,8 @@ def create_boxplot_task(dataframe=None, baselines=None, references=None):
315
  # Caption
316
  fig.add_annotation(
317
  text=(
318
- "In tasks like TE and SA, models approach the accuracy of supervised <br>"
319
- "models at EVALITA (dashed black line); in NER and REL they remain lower. <br>"
320
- "Dashed red lines show GPT-4o reference results for generative tasks."
321
  ),
322
  xref="paper", yref="paper",
323
  x=0.5, y=-0.30,
@@ -531,6 +751,8 @@ def initialize_app():
531
 
532
  # Initialize data
533
  LEADERBOARD_DF, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, theoretical_max_combined_perf = initialize_app()
 
 
534
 
535
  if LEADERBOARD_DF is None:
536
  # Fallback behavior
@@ -582,7 +804,11 @@ def create_gradio_interface():
582
  # Side-by-side charts
583
  with gr.Row():
584
  gr.Plot(value=create_line_chart(LEADERBOARD_DF), elem_id="line-chart")
585
- gr.Plot(value=create_boxplot_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task")
 
 
 
 
586
 
587
  # Leaderboard
588
  leaderboard = init_leaderboard(
 
111
  # Return the Slack response (success or failure message)
112
  return slack_response
113
 
114
+
115
+ def map_prompt_ids_for_generation(dataframe):
116
+ """
117
+ Map original prompt IDs (1 or 2) to their corresponding generative prompt IDs.
118
+
119
+ - For task 'SU': 1 -> 7, 2 -> 8
120
+ - For tasks 'NER', 'REL', 'LS': 1 -> 9, 2 -> 10
121
+ """
122
+
123
+ # Mapping for SU task
124
+ task = "SU"
125
+ best_prompt_col = f"{task} Best Prompt Id"
126
+ if best_prompt_col in dataframe.columns:
127
+ dataframe[best_prompt_col] = dataframe[best_prompt_col].apply(
128
+ lambda x: 7 if x == 1 else 8
129
+ )
130
+
131
+ # Mapping for other tasks
132
+ for task in ["NER", "REL", "LS"]:
133
+ best_prompt_col = f"{task} Best Prompt Id"
134
+ if best_prompt_col in dataframe.columns:
135
+ dataframe[best_prompt_col] = dataframe[best_prompt_col].apply(
136
+ lambda x: 9 if x == 1 else 10
137
+ )
138
+
139
+ return dataframe
140
+
141
+
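A minimal usage sketch of the remapping above (toy DataFrame, not part of the commit; assumes `map_prompt_ids_for_generation` is in scope). Note that any value other than 1, including NaN, falls into the second branch of each lambda.

```python
import pandas as pd

# Toy frame with two of the "Best Prompt Id" columns the function rewrites.
toy = pd.DataFrame({
    "SU Best Prompt Id": [1, 2],
    "NER Best Prompt Id": [2, 1],
})
toy = map_prompt_ids_for_generation(toy)
print(toy["SU Best Prompt Id"].tolist())   # [7, 8]
print(toy["NER Best Prompt Id"].tolist())  # [10, 9]
```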
142
+ def create_best_model_comparison_table(dataframe):
143
+ """
144
+ Interactive table with details of the best-performing model for each task.
145
+ """
146
+
147
+ tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
148
+
149
+ table_data = {
150
+ 'Task': [], 'Model': [], 'Perf.': [], 'FS': [], 'Params (B)': []
151
+ }
152
+
153
+ for task in tasks:
154
+ if task in dataframe.columns:
155
+ max_idx = dataframe[task].idxmax()
156
+ model_raw = dataframe.loc[max_idx, 'Model']
157
+ if isinstance(model_raw, str) and '<' in model_raw:
158
+ match = re.search(r'>([^<]+)<', model_raw)
159
+ model_name = match.group(1) if match else model_raw
160
+ else:
161
+ model_name = str(model_raw)
162
+
163
+ table_data['Task'].append(task)
164
+ table_data['Model'].append(model_name)
165
+ table_data['Perf.'].append(f"{dataframe.loc[max_idx, task]:.2f}%")
166
+ table_data['FS'].append("5-Shot" if dataframe.loc[max_idx, 'IS_FS'] else "0-Shot")
167
+ table_data['Params (B)'].append(f"{dataframe.loc[max_idx, '#Params (B)']:.1f}")
168
+
169
+ fig = go.Figure(data=[go.Table(
170
+ columnwidth=[80, 200, 80, 80, 100], # proportional column widths
171
+ header=dict(
172
+ values=[f'<b>{col}</b>' for col in table_data.keys()],
173
+ fill_color='#005f87',
174
+ font=dict(color='white', size=12, family='Arial'),
175
+ align='center',
176
+ height=35
177
+ ),
178
+ cells=dict(
179
+ values=list(table_data.values()),
180
+ fill_color=[['#f0f0f0' if i % 2 == 0 else 'white' for i in range(len(table_data['Task']))]],
181
+ font=dict(color='#2c3e50', size=11, family='Arial'),
182
+ align=['center', 'left', 'center', 'center', 'center'],
183
+ height=30
184
+ )
185
+ )])
186
+
187
+ fig.update_layout(
188
+ title={'text': "Best Model Performance Summary by Task",
189
+ 'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
190
+ font=dict(family="Arial", size=11), # align font
191
+ height=500,
192
+ margin=dict(l=20, r=20, t=60, b=100)
193
+ )
194
+
195
+ # Caption
196
+ fig.add_annotation(
197
+ text="No single model achieves the highest performance across all tasks.",
198
+ xref="paper", yref="paper",
199
+ x=0.5, y=-0.15,
200
+ showarrow=False,
201
+ font=dict(size=12, color="gray", family="Arial"),
202
+ align="center",
203
+ xanchor="center"
204
+ )
205
+
206
+ return fig
207
+
208
+
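For reference, a small sketch (toy data, not part of the commit; column layout assumed to match the leaderboard frame) of the per-task lookup the table is built from: `idxmax` selects the best row and the display name is stripped out of the HTML anchor in the `Model` column.

```python
import re
import pandas as pd

df = pd.DataFrame({
    "Model": ['<a href="#">model-a</a>', '<a href="#">model-b</a>'],
    "TE": [71.2, 68.0],
    "SA": [60.1, 66.4],
})
for task in ["TE", "SA"]:
    best = df.loc[df[task].idxmax()]                        # row of the best model for this task
    name = re.search(r'>([^<]+)<', best["Model"]).group(1)  # anchor text -> model name
    print(f"{task}: {name} {best[task]:.2f}%")
# TE: model-a 71.20%
# SA: model-b 66.40%
```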
209
+ def create_prompt_heatmap(dataframe):
210
+ """
211
+ Heatmap showing the percentage of models that achieved their best performance with each prompt on each task,
212
+ displaying only the relevant cells:
213
+ - Prompts 1-6: multiple-choice tasks only
214
+ - Prompts 7-8: SU only
215
+ - Prompts 9-10: LS, NER, REL only
216
+ """
217
+
218
+ tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
219
+ generative_tasks = ["LS", "SU", "NER", "REL"]
220
+ mc_tasks = [t for t in tasks if t not in generative_tasks]
221
+
222
+ all_prompt_ids = set()
223
+ for task in tasks:
224
+ prompt_col = f"{task} Best Prompt Id"
225
+ if prompt_col in dataframe.columns:
226
+ all_prompt_ids.update(dataframe[prompt_col].dropna().unique())
227
+
228
+ prompt_ids = sorted(all_prompt_ids, key=int)
229
+
230
+ matrix = []
231
+ hover_texts = []
232
+
233
+ for pid in prompt_ids:
234
+ row = []
235
+ hover_row = []
236
+ for task in tasks:
237
+ prompt_col = f"{task} Best Prompt Id"
238
+
239
+ pid_int = int(pid)
240
+ # Custom filters
241
+ if pid_int <= 6 and task in generative_tasks:
242
+ row.append(None)
243
+ hover_row.append("")
244
+ elif pid_int in [7, 8] and task != "SU":
245
+ row.append(None)
246
+ hover_row.append("")
247
+ elif pid_int in [9, 10] and task not in ["LS", "NER", "REL"]:
248
+ row.append(None)
249
+ hover_row.append("")
250
+ elif prompt_col in dataframe.columns:
251
+ total = len(dataframe[prompt_col].dropna())
252
+ count = (dataframe[prompt_col] == pid).sum()
253
+ percentage = (count / total * 100) if total > 0 else 0
254
+ row.append(percentage)
255
+ hover_row.append(
256
+ f"<b>Prompt {pid} - {task}</b><br>"
257
+ f"Models: {count}/{total}<br>"
258
+ f"Percentage: {percentage:.1f}%"
259
+ )
260
+ else:
261
+ row.append(0)
262
+ hover_row.append(f"<b>Prompt {pid} - {task}</b><br>No data")
263
+ matrix.append(row)
264
+ hover_texts.append(hover_row)
265
+
266
+ # Tick labels: currently all rendered in blue; the orange variant for prompts 7-10 is commented out
267
+ ticktext = []
268
+ for pid in prompt_ids:
269
+ pid_int = int(pid)
270
+ #if pid_int <= 6:
271
+ ticktext.append(f'<span style="color:#1f77b4;">P{pid} </span>') # blue
272
+ #else:
273
+ #ticktext.append(f'<span style="color:#ff7f0e;">P{pid}</span>') # orange
274
+
275
+ fig = go.Figure(data=go.Heatmap(
276
+ z=matrix,
277
+ x=tasks,
278
+ y=prompt_ids,
279
+ colorscale=[
280
+ [0, '#f7fbff'],
281
+ [0.2, '#deebf7'],
282
+ [0.4, '#9ecae1'],
283
+ [0.6, '#4292c6'],
284
+ [0.8, '#2171b5'],
285
+ [1, '#08519c']
286
+ ],
287
+ text=[[f"{val:.0f}%" if val is not None else "" for val in row] for row in matrix],
288
+ texttemplate="%{text}",
289
+ textfont={"size": 11, "family": "Arial"},
290
+ hovertemplate='%{customdata}<extra></extra>',
291
+ customdata=hover_texts,
292
+ colorbar=dict(title="% Models", ticksuffix="%"),
293
+ zmin=0,
294
+ zmax=100
295
+ ))
296
+
297
+ fig.update_yaxes(
298
+ tickmode='array',
299
+ tickvals=prompt_ids,
300
+ ticktext=ticktext,
301
+ tickfont={"size": 11, "family": "Arial"}
302
+ )
303
+
304
+ fig.update_layout(
305
+ title={'text': "Most Effective Prompts per Task Across Models",
306
+ 'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
307
+ xaxis_title="Task",
308
+ yaxis_title="Prompt Variant",
309
+ font=dict(family="Arial", size=11), # align font with line_chart
310
+ margin=dict(b=150),
311
+ template="plotly_white",
312
+ dragmode=False,
313
+ height=500
314
+ )
315
+
316
+ fig.add_annotation(
317
+ text=(
318
+ "Prompts 1–6 are for multiple-choice tasks, and prompts 7–10 are for generative tasks. Darker cells indicate<br>"
319
+ "the percentage of models for which a prompt achieved the top performance, with no prompt being best for all tasks.<br>"
320
+ ),
321
+ xref="paper", yref="paper",
322
+ x=0.5, y=-0.25,
323
+ showarrow=False,
324
+ font=dict(size=11, color="gray", family="Arial"),
325
+ align="center",
326
+ xanchor="center"
327
+ )
328
+
329
+ fig.update_xaxes(fixedrange=True)
330
+ fig.update_yaxes(fixedrange=True)
331
+
332
+ return fig
333
+
334
+
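A short sketch (toy values, not part of the commit) of the per-cell statistic used above: the share of models whose best prompt id for a task equals a given prompt.

```python
import pandas as pd

best_ids = pd.Series([1, 1, 2, 2, 2, None])  # e.g. a "TE Best Prompt Id" column
total = len(best_ids.dropna())
for pid in (1, 2):
    count = (best_ids == pid).sum()
    print(f"Prompt {pid}: {count}/{total} models ({count / total * 100:.1f}%)")
# Prompt 1: 2/5 models (40.0%)
# Prompt 2: 3/5 models (60.0%)
```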
335
  def highlight_best_per_task(df):
336
  """Add 🟡 symbol next to the maximum value in each task column"""
337
 
 
536
  # Caption
537
  fig.add_annotation(
538
  text=(
539
+ "In tasks like TE and SA, models approach the accuracy of supervised models at EVALITA (dashed black line).<br>"
540
+ "In NER and REL they remain lower. Dashed red lines show GPT-4o reference results for generative tasks."
 
541
  ),
542
  xref="paper", yref="paper",
543
  x=0.5, y=-0.30,
 
751
 
752
  # Initialize data
753
  LEADERBOARD_DF, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, theoretical_max_combined_perf = initialize_app()
754
+ LEADERBOARD_DF = map_prompt_ids_for_generation(LEADERBOARD_DF)
755
+
756
 
757
  if LEADERBOARD_DF is None:
758
  # Fallback behavior
 
804
  # Side-by-side charts
805
  with gr.Row():
806
  gr.Plot(value=create_line_chart(LEADERBOARD_DF), elem_id="line-chart")
807
+ gr.Plot(value=create_boxplot_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="line-chart")
808
+
809
+ with gr.Row():
810
+ gr.Plot(value=create_prompt_heatmap(LEADERBOARD_DF), elem_id="line-chart")
811
+ gr.Plot(value=create_best_model_comparison_table(LEADERBOARD_DF), elem_id="line-chart")
812
 
813
  # Leaderboard
814
  leaderboard = init_leaderboard(
app_30_09_2025.py ADDED
@@ -0,0 +1,758 @@
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+ from functools import lru_cache
7
+ import logging
8
+ import os
9
+
10
+ from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, \
11
+ LLM_BENCHMARKS_TEXT, TITLE
12
+ from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
13
+ from src.display.css_html_js import custom_css
14
+ from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, \
15
+ WeightType, Precision
16
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
17
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
18
+ from src.submission.submit import add_new_eval
19
+ import matplotlib.pyplot as plt
20
+ import re
21
+ import plotly.express as px
22
+ import plotly.graph_objects as go
23
+ import numpy as np
24
+
25
+ import requests
26
+
27
+ # Configure logging
28
+ logging.basicConfig(level=logging.INFO)
29
+ logger = logging.getLogger(__name__)
30
+
31
+ # EVALITA results
32
+ BASELINES = {
33
+ "TE": 71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00,
34
+ "LS": 38.82, "SU": 38.91, "NER": 88.00, "REL": 62.99
35
+ }
36
+
37
+ # GPT-4o results
38
+ REFERENCES = {
39
+ "NER": 79.11, "REL": 63.32, "LS": 59.25, "SU": 33.04
40
+ }
41
+
42
+ TASK_METADATA_MULTIPLECHOICE = {
43
+ "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
44
+ "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""},
45
+ "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
46
+ "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": ""},
47
+ "WIC": {"icon": "🔤", "name": "Word in Context", "tooltip": ""},
48
+ "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""}
49
+ }
50
+
51
+ TASK_METADATA_GENERATIVE = {
52
+ "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""},
53
+ "SU": {"icon": "📝", "name": "Summarization", "tooltip": ""},
54
+ "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
55
+ "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
56
+ }
57
+
58
+ # Function to send a Slack notification for a new model submission for evaluation
59
+ def send_slack_notification(model_name, user_name, user_affiliation):
60
+ # Insert your Slack webhook URL here
61
+ webhook_url = os.getenv("WEBHOOK_URL")
62
+
63
+ # Create the message to be sent to Slack
64
+ message = {
65
+ "text": f"New model submission for EVALITA-LLM leaderboard:\n\n"
66
+ f"**Model Name**: {model_name}\n"
67
+ f"**User**: {user_name}\n"
68
+ f"**Affiliation**: {user_affiliation}\n"
69
+ f"Check out the model on HuggingFace: https://huggingface.co/{model_name}"
70
+ }
71
+
72
+ # Send the message to Slack
73
+ response = requests.post(webhook_url, json=message)
74
+
75
+ # Check if the request was successful and return the appropriate message
76
+ if response.status_code == 200:
77
+ return "✅ **Notification sent successfully!**"
78
+ else:
79
+ return f"❌ **Failed to send notification**: {response.text}"
80
+
81
+
82
+ # Function to validate the model submission and send the request for processing
83
+ def validate_and_submit_request(model_name, user_email, user_affiliation):
84
+ # Check if model name is provided and not empt
85
+ if not model_name or not model_name.strip():
86
+ return "❌ **Error:** Model name is required."
87
+
88
+ # Check if user email is provided and not empty
89
+ if not user_email or not user_email.strip():
90
+ return "❌ **Error:** Email address is required."
91
+
92
+ # Validate email format using regex
93
+ email_regex = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
94
+ if not re.match(email_regex, user_email.strip()):
95
+ return "❌ **Error:** Invalid email format. Please enter a valid email address."
96
+
97
+ # Check if user affiliation is provided and not empty
98
+ if not user_affiliation or not user_affiliation.strip():
99
+ return "❌ **Error:** Affiliation is required."
100
+
101
+ # Check if model name follows the correct format (organization/model-name)
102
+ if "/" not in model_name:
103
+ return "❌ **Error:** Model name must be in format 'organization/model-name' (e.g., 'microsoft/DialoGPT-medium')."
104
+
105
+ # Check if the model name contains only valid characters
106
+ if not re.match(r'^[a-zA-Z0-9._/-]+$', model_name):
107
+ return "❌ **Error:** Model name contains invalid characters."
108
+
109
+ slack_response = send_slack_notification(model_name.strip(), user_email.strip(), user_affiliation.strip())
110
+
111
+ # Return the Slack response (success or failure message)
112
+ return slack_response
113
+
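An illustration (toy strings, not part of the file) of what the email check above accepts and rejects.

```python
import re

email_regex = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
print(bool(re.match(email_regex, "[email protected]")))  # True
print(bool(re.match(email_regex, "name@host")))             # False: no top-level domain
```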
114
+ def highlight_best_per_task(df):
115
+ """Add 🟡 symbol next to the maximum value in each task column"""
116
+
117
+ task_columns = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
118
+
119
+ df = df.copy()
120
+ for col in task_columns:
121
+ if col in df.columns:
122
+ max_val = df[col].max()
123
+ df[col] = df[col].apply(
124
+ lambda x: f"{x:.1f}🔺" if x == max_val else f"{x:.1f}"
125
+ )
126
+ return df
127
+
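Toy example (not part of the file; assumes `highlight_best_per_task` is in scope) of the per-task maximum marker:

```python
import pandas as pd

toy = pd.DataFrame({"TE": [70.0, 65.5], "SA": [61.0, 66.4]})
marked = highlight_best_per_task(toy)
print(marked["TE"].tolist())  # ['70.0🔺', '65.5']
print(marked["SA"].tolist())  # ['61.0', '66.4🔺']
```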
128
+ def theoretical_performance(df_hash):
129
+ """
130
+ Theoretical performance of a model that scores the highest on every individual task
131
+ """
132
+ # This is a placeholder - you'd need to pass the actual dataframe
133
+ # In practice, you'd compute this once and store it
134
+ #fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
135
+ return 75.0 # Placeholder value
136
+
137
+
138
+ def scale_sizes(values, min_size=8, max_size=30):
139
+ """Normalize sizes for scatter plot markers """
140
+ if not values:
141
+ return []
142
+ vmin, vmax = min(values), max(values)
143
+ if vmax == vmin:
144
+ return [(min_size + max_size) / 2] * len(values)
145
+ return [
146
+ min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size)
147
+ for val in values
148
+ ]
149
+
150
+
151
+ def extract_model_name(model_string):
152
+ """Extract model name from HTML string."""
153
+ match = re.search(r'>([^<]+)<', model_string)
154
+ return match.group(1) if match else model_string
155
+
156
+
157
+ def create_line_chart(dataframe):
158
+ """Create left chart."""
159
+
160
+ def scale_sizes(values, min_size=8, max_size=30):
161
+ vmin, vmax = min(values), max(values)
162
+ return [
163
+ min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size) if vmax > vmin
164
+ else (min_size + max_size) / 2
165
+ for val in values
166
+ ]
167
+
168
+ fig = go.Figure()
169
+
170
+ # Loop over 5-Shot and 0-Shot
171
+ for shot, color in [(True, "blue"), (False, "red")]:
172
+ df = dataframe[dataframe["IS_FS"] == shot]
173
+
174
+ x = df["#Params (B)"].tolist()
175
+ y = df["Avg. Comb. Perf. ⬆️"].tolist()
176
+ labels = [
177
+ re.search(r'>([^<]+)<', m).group(1) if isinstance(m, str) and re.search(r'>([^<]+)<', m) else str(m)
178
+ for m in df["Model"].tolist()
179
+ ]
180
+
181
+ fig.add_trace(go.Scatter(
182
+ x=x,
183
+ y=y,
184
+ mode="markers",
185
+ name="5-Shot" if shot else "0-Shot",
186
+ marker=dict(color=color, size=scale_sizes(x)),
187
+ hovertemplate="<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>",
188
+ customdata=labels,
189
+ ))
190
+
191
+ # Show the best model
192
+ all_y = dataframe["Avg. Comb. Perf. ⬆️"].tolist()
193
+ if all_y:
194
+ max_idx = all_y.index(max(all_y))
195
+ max_x = dataframe["#Params (B)"].iloc[max_idx]
196
+ max_y = all_y[max_idx]
197
+ max_label = re.search(r'>([^<]+)<', dataframe["Model"].iloc[max_idx]).group(1)
198
+
199
+ fig.add_annotation(
200
+ x=max_x,
201
+ y=max_y,
202
+ text=max_label,
203
+ showarrow=True,
204
+ arrowhead=2,
205
+ arrowsize=1,
206
+ arrowwidth=2,
207
+ arrowcolor="black",
208
+ font=dict(size=11, color="black"),
209
+ xshift=10, yshift=10,
210
+ ax=-30, ay=-20,
211
+ xanchor="right"
212
+ )
213
+
214
+ # Layout
215
+ fig.update_layout(
216
+ title="Average Combined Performance vs #Params",
217
+ xaxis_title="#Params (B)", yaxis_title="Average Combined Performance",
218
+ template="plotly_white", hovermode="closest",
219
+ font=dict(family="Arial", size=10), dragmode=False,
220
+ xaxis=dict(tickvals=[0, 25, 50, 75, 100, 125], ticktext=["0", "25", "50", "75", "100"]),
221
+ yaxis=dict(tickvals=[0, 20, 40, 60, 80, 100], range=[0, 100])
222
+ )
223
+
224
+ # Caption
225
+ fig.add_annotation(
226
+ text="Accuracy generally rises with #Params, but smaller models <br>"
227
+ "with 5-shot can outperform larger zero-shot models.",
228
+ xref="paper", yref="paper", x=0.5, y=-0.3,
229
+ showarrow=False, font=dict(size=11, color="gray"),
230
+ align="center", xanchor="center"
231
+ )
232
+
233
+ fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
234
+ fig.update_yaxes(fixedrange=True)
235
+
236
+ return fig
237
+
238
+
239
+ def create_boxplot_task(dataframe=None, baselines=None, references=None):
240
+ """Create right chart"""
241
+ print(dataframe.columns)
242
+
243
+ tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
244
+
245
+ # Default data if none is provided
246
+ if dataframe is None:
247
+ np.random.seed(42)
248
+ dataframe = pd.DataFrame({task: np.random.uniform(0.4, 0.9, 20) * 100 for task in tasks})
249
+
250
+ if baselines is None:
251
+ baselines = {task: np.random.randint(50, 70) for task in tasks}
252
+
253
+ if references is None:
254
+ references = {}
255
+
256
+ colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
257
+ "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
258
+
259
+ fig = go.Figure()
260
+
261
+ for i, task in enumerate(tasks):
262
+ if task not in dataframe.columns:
263
+ continue
264
+
265
+ y_data = dataframe[task].dropna().tolist()
266
+
267
+ # Boxplot
268
+ fig.add_trace(go.Box(
269
+ y=y_data,
270
+ name=task,
271
+ marker=dict(color=colors[i]),
272
+ line=dict(color="black", width=2),
273
+ fillcolor=colors[i],
274
+ opacity=0.7,
275
+ hovertemplate="<b>"+task+"</b><br>Accuracy: %{y:.2f}%<extra></extra>",
276
+ hoverlabel=dict(bgcolor=colors[i], font_color="white"),
277
+ width=0.6,
278
+ whiskerwidth=0.2,
279
+ quartilemethod="linear"
280
+ ))
281
+
282
+ # EVALITA baseline line
283
+ baseline_value = baselines.get(task)
284
+ if baseline_value is not None:
285
+ fig.add_shape(
286
+ type="line",
287
+ x0=i - 0.3, x1=i + 0.3,
288
+ y0=baseline_value, y1=baseline_value,
289
+ line=dict(color="black", width=2, dash="dot"),
290
+ xref="x", yref="y"
291
+ )
292
+
293
+ # GPT-4o reference line
294
+ reference_value = references.get(task)
295
+ if reference_value is not None:
296
+ fig.add_shape(
297
+ type="line",
298
+ x0=i - 0.3, x1=i + 0.3,
299
+ y0=reference_value, y1=reference_value,
300
+ line=dict(color="red", width=2, dash="dashdot"),
301
+ xref="x", yref="y"
302
+ )
303
+
304
+ # Layout
305
+ fig.update_layout(
306
+ title="Distribution of Model Accuracy by Task",
307
+ xaxis_title="Task",
308
+ yaxis_title="Combined Performance",
309
+ template="plotly_white",
310
+ boxmode="group",
311
+ dragmode=False,
312
+ font=dict(family="Arial", size=10),
313
+ margin=dict(b=80),
314
+ )
315
+
316
+ # Caption
317
+ fig.add_annotation(
318
+ text=(
319
+ "In tasks like TE and SA, models approach the accuracy of supervised <br>"
320
+ "models at EVALITA (dashed black line); in NER and REL they remain lower. <br>"
321
+ "Dashed red lines show GPT-4o reference results for generative tasks."
322
+ ),
323
+ xref="paper", yref="paper",
324
+ x=0.5, y=-0.30,
325
+ showarrow=False,
326
+ font=dict(size=11, color="gray"),
327
+ align="center"
328
+ )
329
+
330
+ fig.update_yaxes(range=[0, 100], fixedrange=True)
331
+ fig.update_xaxes(fixedrange=True)
332
+
333
+ return fig
334
+
335
+
336
+ def create_medal_assignments(sorted_df):
337
+ """Function for medal assignment logic"""
338
+ medals = {
339
+ 'large_fs': False, 'medium_fs': False, 'small_fs': False,
340
+ 'large_0shot': False, 'medium_0shot': False, 'small_0shot': False
341
+ }
342
+
343
+ new_model_column = []
344
+
345
+ for _, row in sorted_df.iterrows():
346
+ model_name = row['Model']
347
+ size = row["Size"]
348
+ is_fs = row['IS_FS']
349
+
350
+ if is_fs: # 5-Few-Shot
351
+ if size == "🔵🔵🔵" and not medals['large_fs']:
352
+ model_name = f"{model_name} 🔵🔵🔵🏆"
353
+ medals['large_fs'] = True
354
+ elif size == "🔵🔵" and not medals['medium_fs']:
355
+ model_name = f"{model_name} 🔵🔵🏆"
356
+ medals['medium_fs'] = True
357
+ elif size == "🔵" and not medals['small_fs']:
358
+ model_name = f"{model_name} 🔵🏆"
359
+ medals['small_fs'] = True
360
+ else: # 0-Shot
361
+ if size == "🔵🔵🔵" and not medals['large_0shot']:
362
+ model_name = f"{model_name} 🔵🔵🔵🎖️"
363
+ medals['large_0shot'] = True
364
+ elif size == "🔵🔵" and not medals['medium_0shot']:
365
+ model_name = f"{model_name} 🔵🔵🎖️"
366
+ medals['medium_0shot'] = True
367
+ elif size == "🔵" and not medals['small_0shot']:
368
+ model_name = f"{model_name} 🔵🎖️"
369
+ medals['small_0shot'] = True
370
+
371
+ new_model_column.append(model_name)
372
+
373
+ return new_model_column
374
+
375
+
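A toy run of the medal logic (not part of the file; assumes a frame already sorted by average combined performance): the first 5-shot entry in each size class gets 🏆 and the first 0-shot entry gets 🎖️.

```python
import pandas as pd

toy = pd.DataFrame({
    "Model": ["model-a", "model-b", "model-c"],
    "Size": ["🔵🔵🔵", "🔵🔵🔵", "🔵"],
    "IS_FS": [True, False, True],
})
print(create_medal_assignments(toy))
# ['model-a 🔵🔵🔵🏆', 'model-b 🔵🔵🔵🎖️', 'model-c 🔵🏆']
```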
376
+ def create_leaderboard_base(sorted_dataframe, field_list, hidden_columns):
377
+ """Base leaderboard creation with common parameters. """
378
+
379
+ return Leaderboard(
380
+ value=sorted_dataframe,
381
+ datatype=[c.type for c in field_list],
382
+ search_columns=[AutoEvalColumn.model.name],
383
+ hide_columns=hidden_columns,
384
+ filter_columns=[
385
+ ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
386
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
387
+ label="Select the number of parameters (B)"),
388
+ ],
389
+ bool_checkboxgroup_label="Evaluation Mode",
390
+ interactive=False,
391
+ )
392
+
393
+
394
+ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
395
+ """Leaderboard initialization """
396
+ if dataframe is None or dataframe.empty:
397
+ raise ValueError("Leaderboard DataFrame is empty or None.")
398
+
399
+ # Sort and reset index
400
+ sorted_dataframe = dataframe.sort_values(by="Avg. Comb. Perf. ⬆️", ascending=False).reset_index(drop=True)
401
+ sorted_dataframe["Rank"] = sorted_dataframe.index + 1
402
+
403
+ # Apply medal assignments
404
+ sorted_dataframe["Model"] = create_medal_assignments(sorted_dataframe)
405
+
406
+ # Show the best values for tasks
407
+ sorted_dataframe = highlight_best_per_task(sorted_dataframe)
408
+
409
+ field_list = fields(AutoEvalColumn)
410
+
411
+ return create_leaderboard_base(sorted_dataframe, field_list, hidden_columns)
412
+
413
+
414
+ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
415
+
416
+ """ Task-specific leaderboard update."""
417
+ if dataframe is None or dataframe.empty:
418
+ raise ValueError("Leaderboard DataFrame is empty or None.")
419
+
420
+ # Sort and reset index
421
+ sorted_dataframe = dataframe.sort_values(by="Comb. Perf. ⬆️", ascending=False).reset_index(drop=True)
422
+ sorted_dataframe["Rank"] = sorted_dataframe.index + 1
423
+
424
+ # Apply medal assignments
425
+ sorted_dataframe["Model"] = create_medal_assignments(sorted_dataframe)
426
+
427
+ field_list = fields(AutoEvalColumn)
428
+
429
+ return Leaderboard(
430
+ value=sorted_dataframe,
431
+ datatype=[c.type for c in field_list] + [int],
432
+ search_columns=[AutoEvalColumn.model.name],
433
+ hide_columns=hidden_columns,
434
+ filter_columns=[
435
+ ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
436
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
437
+ label="Select the number of parameters (B)"),
438
+ ],
439
+ bool_checkboxgroup_label="Evaluation Mode",
440
+ interactive=False
441
+ )
442
+
443
+
444
+ def download_snapshot(repo, local_dir, max_retries=3):
445
+ """Snapshot download with retry logic."""
446
+ for attempt in range(max_retries):
447
+ try:
448
+ logger.info(f"Downloading from {repo} to {local_dir} (attempt {attempt + 1}/{max_retries})")
449
+ snapshot_download(
450
+ repo_id=repo,
451
+ local_dir=local_dir,
452
+ repo_type="dataset",
453
+ tqdm_class=None,
454
+ etag_timeout=30,
455
+ token=TOKEN
456
+ )
457
+ return True
458
+ except Exception as e:
459
+ logger.error(f"Error downloading {repo} (attempt {attempt + 1}): {e}")
460
+ if attempt == max_retries - 1:
461
+ logger.error(f"Failed to download {repo} after {max_retries} attempts")
462
+ return False
463
+ return False
464
+
465
+
466
+ def restart_space():
467
+ """Restart the Hugging Face space."""
468
+ try:
469
+ logger.info("Restarting space... ")
470
+ API.restart_space(repo_id=REPO_ID)
471
+ except Exception as e:
472
+ logger.error(f"Error restarting space: {e}")
473
+
474
+
475
+ def create_title_html():
476
+ """Function for title HTML."""
477
+ return """
478
+ <div class="title-header">
479
+ <h1 class="title-text">
480
+ EVALITA-LLM Leaderboard
481
+ </h1>
482
+ <a href="https://huggingface.co/spaces/mii-llm/open_ita_llm_leaderboard" target="_blank" class="title-link">
483
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
484
+ <path d="M3.9 12a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42a3 3 0 1 0 4.24 4.24l3.54-3.54a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42z"/>
485
+ <path d="M20.1 12a5 5 0 0 1-7.07 7.07l-1.41-1.41 1.41-1.41 1.42 1.42a3 3 0 1 0-4.24-4.24l-3.54 3.54a5 5 0 0 1 7.07-7.07l1.41 1.41-1.41 1.41-1.42-1.42z"/>
486
+ </svg>
487
+ Open Italian LLM Leaderboard
488
+ </a>
489
+ </div>
490
+ """
491
+
492
+
493
+ def create_credits_markdown():
494
+ """Credits section."""
495
+ return """
496
+ **This project has benefited from the following support:**
497
+
498
+ - 🧠 **Codebase**: Based on and extended from the Open Italian LLM Leaderboard, developed by **Alessandro Ercolani** and **Samuele Colombo**. We warmly thank them for their invaluable support and guidance in implementing this leaderboard.
499
+
500
+ - 💶 **Funding**: Partially supported by the PNRR project **FAIR - Future AI Research (PE00000013)**, under the NRRP MUR program funded by **NextGenerationEU**.
501
+
502
+ - 🖥️ **Computation**: We gratefully acknowledge **CINECA** for granting access to the **LEONARDO** supercomputer.
503
+ """
504
+
505
+
506
+ # Main initialization
507
+ def initialize_app():
508
+ """Initialize the application ."""
509
+ try:
510
+ # Download snapshots
511
+ queue_success = download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
512
+ results_success = download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
513
+
514
+ if not (queue_success and results_success):
515
+ logger.error("Failed to download required data")
516
+ return None, None, None, None, None
517
+
518
+ # Load leaderboard data
519
+ leaderboard_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
520
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
521
+ EVAL_REQUESTS_PATH, EVAL_COLS)
522
+
523
+ # Calculate theoretical max performance
524
+ theoretical_max = theoretical_performance(hash(str(leaderboard_df.values.tobytes())))
525
+
526
+ return leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, theoretical_max
527
+
528
+ except Exception as e:
529
+ logger.error(f"Error initializing app: {e}")
530
+ return None, None, None, None, None
531
+
532
+
533
+ # Initialize data
534
+ LEADERBOARD_DF, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, theoretical_max_combined_perf = initialize_app()
535
+
536
+ if LEADERBOARD_DF is None:
537
+ # Fallback behavior
538
+ logger.error("Failed to initialize app data")
539
+ theoretical_max_combined_perf = 0.0
540
+
541
+
542
+ # Main Gradio interface
543
+ def create_gradio_interface():
544
+ """The main Gradio interface."""
545
+ demo = gr.Blocks(css=custom_css)
546
+
547
+ with demo:
548
+ # Title
549
+ gr.HTML(create_title_html())
550
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
551
+
552
+ # Main tabs
553
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
554
+ # 🏅 Benchmark
555
+ with gr.TabItem("🏅 Benchmark"):
556
+ if LEADERBOARD_DF is not None:
557
+ # Field labels displayed side by side
558
+
559
+ with gr.Row():
560
+ gr.HTML(f"""
561
+ <div class="performance-metrics">
562
+ <div class="metric-label" title="Total number of configurations (zero-shot and 5-few-shot) of the models evaluated in the leaderboard.">
563
+ Models tested: {len(LEADERBOARD_DF)}
564
+ </div>
565
+ <div class="metric-label" title="Average accuracy of the evaluated models.">
566
+ Avg combined perf.: {LEADERBOARD_DF['Avg. Comb. Perf. ⬆️'].mean():.2f}
567
+ </div>
568
+ <div class="metric-label" title="Standard deviation of the evaluated models' performance.">
569
+ Std. Dev. {LEADERBOARD_DF['Avg. Comb. Perf. ⬆️'].std():.2f}
570
+ </div>
571
+ <div class="metric-label" title="Best evaluated model.">
572
+ Best model: {LEADERBOARD_DF.loc[LEADERBOARD_DF['Avg. Comb. Perf. ⬆️'].idxmax(), 'Model']}
573
+ </div>
574
+ <div class="metric-label" title="Accuracy of the best evaluated model.">
575
+ Best model accuracy: {LEADERBOARD_DF.loc[LEADERBOARD_DF['Avg. Comb. Perf. ⬆️'].idxmax(), 'Avg. Comb. Perf. ⬆️']:.2f}
576
+ </div>
577
+ <div class="metric-label" title="Maximum achievable accuracy based on the highest performance for each task by any model in the leaderboard.">
578
+ Ideal model: {theoretical_max_combined_perf:.2f}
579
+ </div>
580
+ </div>
581
+ """)
582
+
583
+ # Side-by-side charts
584
+ with gr.Row():
585
+ gr.Plot(value=create_line_chart(LEADERBOARD_DF), elem_id="line-chart")
586
+ gr.Plot(value=create_boxplot_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task")
587
+
588
+ # Leaderboard
589
+ leaderboard = init_leaderboard(
590
+ LEADERBOARD_DF,
591
+ default_selection=['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️",
592
+ "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
593
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
594
+ col not in ['Rank', 'Size', 'FS', 'Model', "Avg. Comb. Perf. ⬆️",
595
+ "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
596
+ )
597
+
598
+ # 📝 About
599
+ with gr.TabItem("📝 About"):
600
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
601
+
602
+
603
+ # 🚀 Submit a new model to evaluate
604
+ with gr.TabItem("🚀 Submit"):
605
+ gr.Markdown("# 📝 Model Evaluation Request", elem_classes="markdown-text")
606
+ gr.Markdown("""
607
+ **Fill out the form below to request evaluation of your model on EVALITA-LLM.**
608
+
609
+ Once submitted, our team will automatically receive a notification. We will evaluate the
610
+ submission’s relevance for both research and commercial purposes, as well as assess its feasibility.
611
+ """, elem_classes="markdown-text")
612
+
613
+ with gr.Row():
614
+ with gr.Column():
615
+ # HuggingFace model name field
616
+ model_name_input = gr.Textbox(
617
+ label="HuggingFace Model Name",
618
+ placeholder="e.g., microsoft/DialoGPT-medium",
619
+ info="Enter the complete model name as it appears on HuggingFace Hub (organization/model-name)",
620
+ elem_id="model-name-input"
621
+ )
622
+
623
+ # User email field
624
+ user_name_input = gr.Textbox(
625
+ label="Your email address",
626
+ placeholder="e.g., [email protected]",
627
+ info="Enter your email address for communication",
628
+ elem_id="user-email-input"
629
+ )
630
+
631
+ # Affiliation field
632
+ user_affiliation_input = gr.Textbox(
633
+ label="Affiliation",
634
+ placeholder="e.g., University of Milan, Google Research, Freelancer",
635
+ info="Enter your affiliation (university, company, organization)",
636
+ elem_id="user-affiliation-input"
637
+ )
638
+
639
+ # Submit button
640
+ submit_request_button = gr.Button(
641
+ "📤 Submit Request",
642
+ variant="primary",
643
+ elem_id="submit-request-button"
644
+ )
645
+
646
+ # Result area
647
+ submission_status = gr.Markdown(elem_id="submission-status")
648
+
649
+ # Connect button to function
650
+ submit_request_button.click(
651
+ validate_and_submit_request,
652
+ inputs=[model_name_input, user_name_input, user_affiliation_input],
653
+ outputs=submission_status
654
+ )
655
+
656
+ # Additional information
657
+ with gr.Accordion("ℹ️ Additional Information", open=False):
658
+ gr.Markdown("""
659
+ **What happens after submission:**
660
+ 1. Your request is automatically sent to the EVALITA-LLM team
661
+ 2. We verify that the model is accessible on HuggingFace
662
+ 3. We contact you to confirm inclusion in the evaluation
663
+ 4. The model is added to the evaluation queue
664
+
665
+ **Model requirements:**
666
+ - Model must be publicly accessible on HuggingFace Hub
667
+ - Must be compatible with the EleutherAI/lm-evaluation-harness framework
668
+ - Must have a license that allows evaluation
669
+
670
+ **Evaluation tasks:**
671
+ Your model will be evaluated on all tasks: TE, SA, HS, AT, WIC, FAQ, LS, SU, NER, REL.
672
+ """, elem_classes="markdown-text")
673
+
674
+
675
+ # Separators
676
+ with gr.TabItem("║", interactive=False):
677
+ gr.Markdown("", elem_classes="markdown-text")
678
+
679
+ # Task-specific tabs (Multiple Choice)
680
+ if LEADERBOARD_DF is not None:
681
+ for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
682
+ with gr.TabItem(f"{metadata['icon']}{task}"):
683
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
684
+ gr.Markdown(task_description, elem_classes="markdown-text")
685
+
686
+ leaderboard_task = update_task_leaderboard(
687
+ LEADERBOARD_DF.rename(columns={
688
+ f"{task} Prompt Average": "Prompt Average",
689
+ f"{task} Prompt Std": "Prompt Std",
690
+ f"{task} Best Prompt": "Best Prompt",
691
+ f"{task} Best Prompt Id": "Best Prompt Id",
692
+ task: "Comb. Perf. ⬆️"
693
+ }),
694
+ default_selection=['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️',
695
+ 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
696
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
697
+ col not in ['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️',
698
+ 'Prompt Average', 'Prompt Std', 'Best Prompt',
699
+ 'Best Prompt Id']]
700
+ )
701
+
702
+ # Separators
703
+ with gr.TabItem("│", interactive=False):
704
+ gr.Markdown("", elem_classes="markdown-text")
705
+
706
+ # Task-specific tabs (Generative)
707
+ if LEADERBOARD_DF is not None:
708
+ for task, metadata in TASK_METADATA_GENERATIVE.items():
709
+ with gr.TabItem(f"{metadata['icon']}{task}"):
710
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
711
+ gr.Markdown(task_description, elem_classes="markdown-text")
712
+
713
+ leaderboard_task = update_task_leaderboard(
714
+ LEADERBOARD_DF.rename(columns={
715
+ f"{task} Prompt Average": "Prompt Average",
716
+ f"{task} Prompt Std": "Prompt Std",
717
+ f"{task} Best Prompt": "Best Prompt",
718
+ f"{task} Best Prompt Id": "Best Prompt Id",
719
+ task: "Comb. Perf. ⬆️"
720
+ }),
721
+ default_selection=['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️',
722
+ 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
723
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
724
+ col not in ['Rank', 'Size', 'FS', 'Model', 'Comb. Perf. ⬆️',
725
+ 'Prompt Average', 'Prompt Std', 'Best Prompt',
726
+ 'Best Prompt Id']]
727
+ )
728
+
729
+ # Citation e Credits
730
+ with gr.Accordion("📙 Citation", open=False):
731
+ gr.Textbox(
732
+ value=CITATION_BUTTON_TEXT,
733
+ label=CITATION_BUTTON_LABEL,
734
+ lines=20,
735
+ elem_id="citation-button",
736
+ show_copy_button=True
737
+ )
738
+
739
+ with gr.Accordion("📙 Credits", open=False):
740
+ gr.Markdown(create_credits_markdown())
741
+
742
+ return demo
743
+
744
+
745
+ # Create and configure the demo
746
+ demo = create_gradio_interface()
747
+
748
+ # Background scheduler for space restart
749
+ scheduler = BackgroundScheduler()
750
+ scheduler.add_job(restart_space, "interval", seconds=1800)
751
+ scheduler.start()
752
+
753
+ # Launch configuration
754
+ if __name__ == "__main__":
755
+ demo.queue(default_concurrency_limit=40).launch(
756
+ debug=True,
757
+ show_error=True
758
+ )