rzanoli committed
Commit cae4d0f
1 Parent(s): d04734c

Small changes

README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🥇
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
- app_file: app.py
8
  pinned: true
9
  license: apache-2.0
10
  short_description: Duplicate this leaderboard to initialize your own!
 
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
+ app_file: example_app.py
8
  pinned: true
9
  license: apache-2.0
10
  short_description: Duplicate this leaderboard to initialize your own!
app.py CHANGED
@@ -1,324 +1,120 @@
1
- import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
- import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
-
16
- from src.tasks import (
17
- TE_DESCRIPTION,
18
- )
19
-
20
- from src.display.css_html_js import custom_css
21
- from src.display.utils import (
22
- BENCHMARK_COLS,
23
- COLS,
24
- EVAL_COLS,
25
- EVAL_TYPES,
26
- AutoEvalColumn,
27
- ModelType,
28
- fields,
29
- WeightType,
30
- Precision
31
- )
32
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
33
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
34
- from src.submission.submit import add_new_eval
35
-
36
-
37
- def restart_space():
38
- API.restart_space(repo_id=REPO_ID)
39
-
40
- ### Space initialisation
41
- try:
42
- print(EVAL_REQUESTS_PATH)
43
- snapshot_download(
44
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
45
- )
46
- except Exception:
47
- restart_space()
48
- try:
49
- print(EVAL_RESULTS_PATH)
50
- snapshot_download(
51
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
52
- )
53
- except Exception:
54
- restart_space()
55
-
56
-
57
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
58
-
59
- (
60
- finished_eval_queue_df,
61
- running_eval_queue_df,
62
- pending_eval_queue_df,
63
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
64
-
65
- def init_leaderboard(dataframe):
66
- print(dataframe)
67
- if dataframe is None or dataframe.empty:
68
- raise ValueError("Leaderboard DataFrame is empty or None.")
69
- return Leaderboard(
70
- value=dataframe,
71
- datatype=[c.type for c in fields(AutoEvalColumn)],
72
- select_columns=SelectColumns(
73
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
74
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
75
- label="Select Columns to Display:",
76
- ),
77
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
78
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
79
- filter_columns=[
80
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
81
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
82
- ColumnFilter(
83
- AutoEvalColumn.params.name,
84
- type="slider",
85
- min=0.01,
86
- max=150,
87
- label="Select the number of parameters (B)",
88
- ),
89
- ColumnFilter(
90
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
91
- ),
92
- ],
93
- bool_checkboxgroup_label="Hide models",
94
- interactive=False,
95
- )
96
-
97
-
98
- def init_leaderboard2(dataframe, default_selection=None, hidden_columns=None):
99
-
100
- print("entrato===============================================")
101
-
102
- if dataframe is None or dataframe.empty:
103
- raise ValueError("Leaderboard DataFrame is empty or None.")
104
- return Leaderboard(
105
- value=dataframe,
106
- datatype=[c.type for c in fields(AutoEvalColumn)],
107
- select_columns=SelectColumns(
108
- default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
109
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
110
- label="Select Columns to Display:",
111
- ),
112
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
113
- hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
114
- filter_columns=[
115
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
116
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
117
- ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
118
- ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
119
- ],
120
- bool_checkboxgroup_label="Hide models",
121
- interactive=False,
122
- )
123
-
124
-
125
- demo = gr.Blocks(css=custom_css)
126
- with demo:
127
- gr.HTML(TITLE)
128
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
129
-
130
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
131
- with gr.TabItem("🏅 EVALITA-LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
132
- #leaderboard = init_leaderboard(LEADERBOARD_DF)
133
-
134
- leaderboard = init_leaderboard2(
135
- LEADERBOARD_DF,
136
- default_selection=['T', 'Model', "Average ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
137
- hidden_columns=[col for col in LEADERBOARD_DF.columns if
138
- col not in ['T', 'Model', "Average ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL" ]]
139
- )
140
-
141
-
142
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
143
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
144
-
145
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
146
- with gr.Column():
147
- with gr.Row():
148
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
149
-
150
- with gr.Column():
151
- with gr.Accordion(
152
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
153
- open=False,
154
- ):
155
- with gr.Row():
156
- finished_eval_table = gr.components.Dataframe(
157
- value=finished_eval_queue_df,
158
- headers=EVAL_COLS,
159
- datatype=EVAL_TYPES,
160
- row_count=5,
161
- )
162
- with gr.Accordion(
163
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
164
- open=False,
165
- ):
166
- with gr.Row():
167
- running_eval_table = gr.components.Dataframe(
168
- value=running_eval_queue_df,
169
- headers=EVAL_COLS,
170
- datatype=EVAL_TYPES,
171
- row_count=5,
172
- )
173
-
174
- with gr.Accordion(
175
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
176
- open=False,
177
- ):
178
- with gr.Row():
179
- pending_eval_table = gr.components.Dataframe(
180
- value=pending_eval_queue_df,
181
- headers=EVAL_COLS,
182
- datatype=EVAL_TYPES,
183
- row_count=5,
184
- )
185
- with gr.Row():
186
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
187
-
188
- with gr.Row():
189
- with gr.Column():
190
- model_name_textbox = gr.Textbox(label="Model name")
191
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
192
- model_type = gr.Dropdown(
193
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
194
- label="Model type",
195
- multiselect=False,
196
- value=None,
197
- interactive=True,
198
- )
199
-
200
- with gr.Column():
201
- precision = gr.Dropdown(
202
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
203
- label="Precision",
204
- multiselect=False,
205
- value="float16",
206
- interactive=True,
207
- )
208
- weight_type = gr.Dropdown(
209
- choices=[i.value.name for i in WeightType],
210
- label="Weights type",
211
- multiselect=False,
212
- value="Original",
213
- interactive=True,
214
- )
215
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
216
-
217
- submit_button = gr.Button("Submit Eval")
218
- submission_result = gr.Markdown()
219
- submit_button.click(
220
- add_new_eval,
221
- [
222
- model_name_textbox,
223
- base_model_name_textbox,
224
- revision_name_textbox,
225
- precision,
226
- weight_type,
227
- model_type,
228
- ],
229
- submission_result,
230
- )
231
-
232
-
233
- with gr.TabItem("TE", elem_id="llm-benchmark-tab-table", id=4):
234
- gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
235
- #leaderboard = init_leaderboard(LEADERBOARD_DF)
236
-
237
- LEADERBOARD_DF_TE = LEADERBOARD_DF.rename(columns={"TE Prompt Average": "Prompt Average",
238
- "TE Best Prompt": "Best Prompt",
239
- "TE Best Prompt Id": "Best Prompt Id",
240
- "TE": "Combined Performance"})
241
-
242
- leaderboard = init_leaderboard2(
243
- LEADERBOARD_DF_TE,
244
- default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
245
- hidden_columns=[col for col in LEADERBOARD_DF.columns if
246
- col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
247
- )
248
-
249
-
250
- with gr.TabItem("SA", elem_id="llm-benchmark-tab-table", id=5):
251
- gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
252
-
253
- LEADERBOARD_DF_SA = LEADERBOARD_DF.rename(columns={"SA Prompt Average": "Prompt Average",
254
- "SA Best Prompt": "Best Prompt",
255
- "SA Best Prompt Id": "Best Prompt Id",
256
- "SA": "Combined Performance"})
257
-
258
- leaderboard = init_leaderboard2(
259
- LEADERBOARD_DF_SA,
260
- default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
261
- 'Best Prompt Id'],
262
- hidden_columns=[col for col in LEADERBOARD_DF.columns if
263
- col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
264
- 'Best Prompt Id']]
265
- )
266
-
267
-
268
-
269
-
270
- with gr.TabItem("HS", elem_id="llm-benchmark-tab-table", id=6):
271
- gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
272
-
273
- LEADERBOARD_DF_HS = LEADERBOARD_DF.rename(columns={"HS Prompt Average": "Prompt Average",
274
- "HS Best Prompt": "Best Prompt",
275
- "HS Best Prompt Id": "Best Prompt Id",
276
- "HS": "Combined Performance"})
277
-
278
- leaderboard = init_leaderboard2(
279
- LEADERBOARD_DF_HS,
280
- default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
281
- 'Best Prompt Id'],
282
- hidden_columns=[col for col in LEADERBOARD_DF.columns if
283
- col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
284
- 'Best Prompt Id']]
285
- )
286
-
287
-
288
-
289
- with gr.TabItem("AT", elem_id="llm-benchmark-tab-table", id=7):
290
- gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
291
-
292
- with gr.TabItem("WIC", elem_id="llm-benchmark-tab-table", id=8):
293
- gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
294
-
295
- with gr.TabItem("FAQ", elem_id="llm-benchmark-tab-table", id=9):
296
- gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
297
-
298
- with gr.TabItem("LS", elem_id="llm-benchmark-tab-table", id=10):
299
- gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
300
-
301
- with gr.TabItem("SU", elem_id="llm-benchmark-tab-table", id=11):
302
- gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
303
-
304
- with gr.TabItem("NER", elem_id="llm-benchmark-tab-table", id=12):
305
- gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
306
-
307
- with gr.TabItem("REL", elem_id="llm-benchmark-tab-table", id=13):
308
- gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
309
-
310
-
311
- with gr.Row():
312
- with gr.Accordion("📙 Citation", open=False):
313
- citation_button = gr.Textbox(
314
- value=CITATION_BUTTON_TEXT,
315
- label=CITATION_BUTTON_LABEL,
316
- lines=20,
317
- elem_id="citation-button",
318
- show_copy_button=True,
319
- )
320
-
321
- scheduler = BackgroundScheduler()
322
- scheduler.add_job(restart_space, "interval", seconds=1800)
323
- scheduler.start()
324
  demo.queue(default_concurrency_limit=40).launch()
 
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+ from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
7
+ from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
8
+ from src.display.css_html_js import custom_css
9
+ from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision
10
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
11
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
12
+ from src.submission.submit import add_new_eval
13
+
14
+
15
+ # Define task metadata (icons, names, descriptions)
16
+ TASK_METADATA = {
17
+ "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": "Identify logical relationships between two text segments."},
18
+ "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": "Classify the sentiment (positive, negative, neutral) of a text."},
19
+ "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": "Detect hate speech in a text."},
20
+ "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": "Classify whether a clinical statement pertains to an admission test."},
21
+ "WIC": {"icon": "🔤", "name": "Word in Context", "tooltip": "Identify words in context and their meaning."},
22
+ "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": "Answer frequently asked questions based on given text."},
23
+ "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": "Identify alternative words in a given context."},
24
+ "SU": {"icon": "📝", "name": "Summarization", "tooltip": "Summarize long text into a shorter version."},
25
+ "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": "Identify named entities (e.g., persons, locations, organizations) in text."},
26
+ "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": "Extract and link laboratory test results to the respective tests in clinical narratives."},
27
+ }
28
+
29
+ def restart_space():
30
+ """Restart the Hugging Face space."""
31
+ API.restart_space(repo_id=REPO_ID)
32
+
33
+ # Helper function for leaderboard initialization
34
+ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
35
+ """Initialize and return a leaderboard."""
36
+ if dataframe is None or dataframe.empty:
37
+ raise ValueError("Leaderboard DataFrame is empty or None.")
38
+
39
+ return Leaderboard(
40
+ value=dataframe,
41
+ datatype=[c.type for c in fields(AutoEvalColumn)],
42
+ select_columns=SelectColumns(
43
+ default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
44
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
45
+ label="Select Columns to Display:",
46
+ ),
47
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
48
+ hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
49
+ filter_columns=[
50
+ ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="Few-Shot Learning (FS)"),
51
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
52
+ ],
53
+ bool_checkboxgroup_label="Hide models",
54
+ interactive=False,
55
+ )
56
+
57
+
58
+ def download_snapshot(repo, local_dir):
59
+ """Try to download a snapshot from Hugging Face Hub."""
60
+ try:
61
+ print(f"Downloading from {repo} to {local_dir}...")
62
+ snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN)
63
+ except Exception as e:
64
+ print(f"Error downloading {repo}: {e}")
65
+ restart_space()
66
+
67
+
68
+ # Initialize the app by downloading snapshots
69
+ download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
70
+ download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
71
+
72
+ # Load leaderboard data
73
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
74
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
75
+
76
+ # Prepare the main interface
77
+ demo = gr.Blocks(css=custom_css)
78
+ with demo:
79
+ gr.HTML(TITLE)
80
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
81
+
82
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
83
+ # Main leaderboard tab
84
+ with gr.TabItem("🏅 EVALITA-LLM Benchmark"):
85
+ leaderboard = init_leaderboard(
86
+ LEADERBOARD_DF,
87
+ default_selection=['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
88
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
89
+ )
90
+
91
+ # About tab
92
+ with gr.TabItem("📝 About"):
93
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
94
+
95
+ # Task-specific leaderboards
96
+ for task, metadata in TASK_METADATA.items():
97
+ with gr.TabItem(f"{metadata['icon']}{task}"):
98
+
99
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
100
+ gr.Markdown(task_description, elem_classes="markdown-text")
101
+
102
+ gr.Markdown(MEASURE_DESCRIPTION, elem_classes="markdown-text")
103
+
104
+ leaderboard = init_leaderboard(
105
+ LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}),
106
+ default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
107
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
108
+ )
109
+
110
+ # Citation section
111
+ with gr.Accordion("📙 Citation", open=False):
112
+ gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)
113
+
114
+ # Background job to restart space
115
+ scheduler = BackgroundScheduler()
116
+ scheduler.add_job(restart_space, "interval", seconds=1800)
117
+ scheduler.start()
118
+
119
+ # Launch the app with concurrent queueing
120
  demo.queue(default_concurrency_limit=40).launch()
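
Editor's note: the per-task tabs above replace the repeated TE/SA/HS blocks of the previous version with a single column-renaming step. Below is a minimal, self-contained sketch of that renaming (pandas only; the toy DataFrame and its values are illustrative, not taken from the leaderboard data):

import pandas as pd

# Toy stand-in for LEADERBOARD_DF; columns follow the "<task> <metric>" naming used above.
df = pd.DataFrame({
    "Model": ["model-a"],
    "TE": [71.2],
    "TE Prompt Average": [68.4],
    "TE Best Prompt": [74.9],
    "TE Best Prompt Id": [3],
})

task = "TE"
renamed = df.rename(columns={
    f"{task} Prompt Average": "Prompt Average",
    f"{task} Best Prompt": "Best Prompt",
    f"{task} Best Prompt Id": "Best Prompt Id",
    task: "Combined Performance",
})
print(list(renamed.columns))
# ['Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']
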
example_app.py ADDED
@@ -0,0 +1,324 @@
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+
7
+ from src.about import (
8
+ CITATION_BUTTON_LABEL,
9
+ CITATION_BUTTON_TEXT,
10
+ EVALUATION_QUEUE_TEXT,
11
+ INTRODUCTION_TEXT,
12
+ LLM_BENCHMARKS_TEXT,
13
+ TITLE,
14
+ )
15
+
16
+ from src.tasks import (
17
+ TE_DESCRIPTION,
18
+ )
19
+
20
+ from src.display.css_html_js import custom_css
21
+ from src.display.utils import (
22
+ BENCHMARK_COLS,
23
+ COLS,
24
+ EVAL_COLS,
25
+ EVAL_TYPES,
26
+ AutoEvalColumn,
27
+ ModelType,
28
+ fields,
29
+ WeightType,
30
+ Precision
31
+ )
32
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
33
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
34
+ from src.submission.submit import add_new_eval
35
+
36
+
37
+ def restart_space():
38
+ API.restart_space(repo_id=REPO_ID)
39
+
40
+ ### Space initialisation
41
+ try:
42
+ print(EVAL_REQUESTS_PATH)
43
+ snapshot_download(
44
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
45
+ )
46
+ except Exception:
47
+ restart_space()
48
+ try:
49
+ print(EVAL_RESULTS_PATH)
50
+ snapshot_download(
51
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
52
+ )
53
+ except Exception:
54
+ restart_space()
55
+
56
+
57
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
58
+
59
+ (
60
+ finished_eval_queue_df,
61
+ running_eval_queue_df,
62
+ pending_eval_queue_df,
63
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
64
+
65
+ def init_leaderboard(dataframe):
66
+ print(dataframe)
67
+ if dataframe is None or dataframe.empty:
68
+ raise ValueError("Leaderboard DataFrame is empty or None.")
69
+ return Leaderboard(
70
+ value=dataframe,
71
+ datatype=[c.type for c in fields(AutoEvalColumn)],
72
+ select_columns=SelectColumns(
73
+ default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
74
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
75
+ label="Select Columns to Display:",
76
+ ),
77
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
78
+ hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
79
+ filter_columns=[
80
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
81
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
82
+ ColumnFilter(
83
+ AutoEvalColumn.params.name,
84
+ type="slider",
85
+ min=0.01,
86
+ max=150,
87
+ label="Select the number of parameters (B)",
88
+ ),
89
+ ColumnFilter(
90
+ AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
91
+ ),
92
+ ],
93
+ bool_checkboxgroup_label="Hide models",
94
+ interactive=False,
95
+ )
96
+
97
+
98
+ def init_leaderboard2(dataframe, default_selection=None, hidden_columns=None):
99
+
100
+ print("entrato===============================================")
101
+
102
+ if dataframe is None or dataframe.empty:
103
+ raise ValueError("Leaderboard DataFrame is empty or None.")
104
+ return Leaderboard(
105
+ value=dataframe,
106
+ datatype=[c.type for c in fields(AutoEvalColumn)],
107
+ select_columns=SelectColumns(
108
+ default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
109
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
110
+ label="Select Columns to Display:",
111
+ ),
112
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
113
+ hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
114
+ filter_columns=[
115
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
116
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
117
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
118
+ ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
119
+ ],
120
+ bool_checkboxgroup_label="Hide models",
121
+ interactive=False,
122
+ )
123
+
124
+
125
+ demo = gr.Blocks(css=custom_css)
126
+ with demo:
127
+ gr.HTML(TITLE)
128
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
129
+
130
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
131
+ with gr.TabItem("🏅 EVALITA-LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
132
+ #leaderboard = init_leaderboard(LEADERBOARD_DF)
133
+
134
+ leaderboard = init_leaderboard2(
135
+ LEADERBOARD_DF,
136
+ default_selection=['T', 'Model', "Average ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
137
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
138
+ col not in ['T', 'Model', "Average ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL" ]]
139
+ )
140
+
141
+
142
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
143
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
144
+
145
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
146
+ with gr.Column():
147
+ with gr.Row():
148
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
149
+
150
+ with gr.Column():
151
+ with gr.Accordion(
152
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
153
+ open=False,
154
+ ):
155
+ with gr.Row():
156
+ finished_eval_table = gr.components.Dataframe(
157
+ value=finished_eval_queue_df,
158
+ headers=EVAL_COLS,
159
+ datatype=EVAL_TYPES,
160
+ row_count=5,
161
+ )
162
+ with gr.Accordion(
163
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
164
+ open=False,
165
+ ):
166
+ with gr.Row():
167
+ running_eval_table = gr.components.Dataframe(
168
+ value=running_eval_queue_df,
169
+ headers=EVAL_COLS,
170
+ datatype=EVAL_TYPES,
171
+ row_count=5,
172
+ )
173
+
174
+ with gr.Accordion(
175
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
176
+ open=False,
177
+ ):
178
+ with gr.Row():
179
+ pending_eval_table = gr.components.Dataframe(
180
+ value=pending_eval_queue_df,
181
+ headers=EVAL_COLS,
182
+ datatype=EVAL_TYPES,
183
+ row_count=5,
184
+ )
185
+ with gr.Row():
186
+ gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
187
+
188
+ with gr.Row():
189
+ with gr.Column():
190
+ model_name_textbox = gr.Textbox(label="Model name")
191
+ revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
192
+ model_type = gr.Dropdown(
193
+ choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
194
+ label="Model type",
195
+ multiselect=False,
196
+ value=None,
197
+ interactive=True,
198
+ )
199
+
200
+ with gr.Column():
201
+ precision = gr.Dropdown(
202
+ choices=[i.value.name for i in Precision if i != Precision.Unknown],
203
+ label="Precision",
204
+ multiselect=False,
205
+ value="float16",
206
+ interactive=True,
207
+ )
208
+ weight_type = gr.Dropdown(
209
+ choices=[i.value.name for i in WeightType],
210
+ label="Weights type",
211
+ multiselect=False,
212
+ value="Original",
213
+ interactive=True,
214
+ )
215
+ base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
216
+
217
+ submit_button = gr.Button("Submit Eval")
218
+ submission_result = gr.Markdown()
219
+ submit_button.click(
220
+ add_new_eval,
221
+ [
222
+ model_name_textbox,
223
+ base_model_name_textbox,
224
+ revision_name_textbox,
225
+ precision,
226
+ weight_type,
227
+ model_type,
228
+ ],
229
+ submission_result,
230
+ )
231
+
232
+
233
+ with gr.TabItem("TE", elem_id="llm-benchmark-tab-table", id=4):
234
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
235
+ #leaderboard = init_leaderboard(LEADERBOARD_DF)
236
+
237
+ LEADERBOARD_DF_TE = LEADERBOARD_DF.rename(columns={"TE Prompt Average": "Prompt Average",
238
+ "TE Best Prompt": "Best Prompt",
239
+ "TE Best Prompt Id": "Best Prompt Id",
240
+ "TE": "Combined Performance"})
241
+
242
+ leaderboard = init_leaderboard2(
243
+ LEADERBOARD_DF_TE,
244
+ default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
245
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
246
+ col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
247
+ )
248
+
249
+
250
+ with gr.TabItem("SA", elem_id="llm-benchmark-tab-table", id=5):
251
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
252
+
253
+ LEADERBOARD_DF_SA = LEADERBOARD_DF.rename(columns={"SA Prompt Average": "Prompt Average",
254
+ "SA Best Prompt": "Best Prompt",
255
+ "SA Best Prompt Id": "Best Prompt Id",
256
+ "SA": "Combined Performance"})
257
+
258
+ leaderboard = init_leaderboard2(
259
+ LEADERBOARD_DF_SA,
260
+ default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
261
+ 'Best Prompt Id'],
262
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
263
+ col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
264
+ 'Best Prompt Id']]
265
+ )
266
+
267
+
268
+
269
+
270
+ with gr.TabItem("HS", elem_id="llm-benchmark-tab-table", id=6):
271
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
272
+
273
+ LEADERBOARD_DF_HS = LEADERBOARD_DF.rename(columns={"HS Prompt Average": "Prompt Average",
274
+ "HS Best Prompt": "Best Prompt",
275
+ "HS Best Prompt Id": "Best Prompt Id",
276
+ "HS": "Combined Performance"})
277
+
278
+ leaderboard = init_leaderboard2(
279
+ LEADERBOARD_DF_HS,
280
+ default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
281
+ 'Best Prompt Id'],
282
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
283
+ col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
284
+ 'Best Prompt Id']]
285
+ )
286
+
287
+
288
+
289
+ with gr.TabItem("AT", elem_id="llm-benchmark-tab-table", id=7):
290
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
291
+
292
+ with gr.TabItem("WIC", elem_id="llm-benchmark-tab-table", id=8):
293
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
294
+
295
+ with gr.TabItem("FAQ", elem_id="llm-benchmark-tab-table", id=9):
296
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
297
+
298
+ with gr.TabItem("LS", elem_id="llm-benchmark-tab-table", id=10):
299
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
300
+
301
+ with gr.TabItem("SU", elem_id="llm-benchmark-tab-table", id=11):
302
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
303
+
304
+ with gr.TabItem("NER", elem_id="llm-benchmark-tab-table", id=12):
305
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
306
+
307
+ with gr.TabItem("REL", elem_id="llm-benchmark-tab-table", id=13):
308
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
309
+
310
+
311
+ with gr.Row():
312
+ with gr.Accordion("📙 Citation", open=False):
313
+ citation_button = gr.Textbox(
314
+ value=CITATION_BUTTON_TEXT,
315
+ label=CITATION_BUTTON_LABEL,
316
+ lines=20,
317
+ elem_id="citation-button",
318
+ show_copy_button=True,
319
+ )
320
+
321
+ scheduler = BackgroundScheduler()
322
+ scheduler.add_job(restart_space, "interval", seconds=1800)
323
+ scheduler.start()
324
+ demo.queue(default_concurrency_limit=40).launch()
app2.py → example_app2.py RENAMED
@@ -8,7 +8,7 @@ from src.about import (
8
  CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT,
9
  INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
10
  )
11
- from src.tasks import TE_DESCRIPTION
12
  from src.display.css_html_js import custom_css
13
  from src.display.utils import (
14
  BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn,
@@ -19,6 +19,53 @@ from src.populate import get_evaluation_queue_df, get_leaderboard_df
19
  from src.submission.submit import add_new_eval
20
 
21
 
22
  def restart_space():
23
  """Restart the Hugging Face space."""
24
  API.restart_space(repo_id=REPO_ID)
@@ -59,10 +106,11 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
59
  search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
60
  hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
61
  filter_columns=[
62
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
63
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
 
64
  ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
65
- ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
66
  ],
67
  bool_checkboxgroup_label="Hide models",
68
  interactive=False,
@@ -89,15 +137,16 @@ with demo:
89
  with gr.TabItem("🏅 EVALITA-LLM Benchmark", elem_id="llm-benchmark-tab-table"):
90
  leaderboard = init_leaderboard(
91
  LEADERBOARD_DF,
92
- default_selection=['T', 'Model', 'Few-Shot', "Average ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
93
  hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in
94
- ['T', 'Model', 'Few-Shot', "Average ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
95
  )
96
 
97
  # About tab
98
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table"):
99
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
100
 
 
101
  # Submission tab
102
  with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table"):
103
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -129,16 +178,30 @@ with demo:
129
  [model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type],
130
  submission_result,
131
  )
 
132
 
133
  # Task-specific leaderboards
134
  for task in ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]:
135
- with gr.TabItem(task, elem_id="llm-benchmark-tab-table"):
136
- gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
137
  leaderboard = init_leaderboard(
138
  prepare_leaderboard_df(LEADERBOARD_DF, task),
139
- default_selection=['T', 'Model', 'Few-Shot', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
140
  hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in
141
- ['T', 'Model', 'Few-Shot', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
142
  )
143
 
144
  # Citation section
 
8
  CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT,
9
  INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
10
  )
11
+ from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
12
  from src.display.css_html_js import custom_css
13
  from src.display.utils import (
14
  BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn,
 
19
  from src.submission.submit import add_new_eval
20
 
21
 
22
+
23
+
24
+ # Define the task icons and names
25
+ TASK_ICONS = {
26
+ "TE": "📊", # Textual Entailment
27
+ "SA": "😃", # Sentiment Analysis
28
+ "HS": "⚠️", # Hate Speech
29
+ "AT": "🏥", # Admission Test
30
+ "WIC": "🔤", # Word in Context
31
+ "FAQ": "❓", # Frequently Asked Questions
32
+ "LS": "🔄", # Lexical Substitution
33
+ "SU": "📝", # Summarization
34
+ "NER": "🏷️", # Named Entity Recognition
35
+ "REL": "🔗", # Relation Extraction
36
+ }
37
+
38
+ TASK_NAMES = {
39
+ "TE": "Textual Entailment",
40
+ "SA": "Sentiment Analysis",
41
+ "HS": "Hate Speech",
42
+ "AT": "Admission Test",
43
+ "WIC": "Word in Context",
44
+ "FAQ": "Frequently Asked Questions",
45
+ "LS": "Lexical Substitution",
46
+ "SU": "Summarization",
47
+ "NER": "Named Entity Recognition",
48
+ "REL": "Relation Extraction",
49
+ }
50
+
51
+
52
+ # Tooltip descriptions for each task
53
+ TASK_TOOLTIPS = {
54
+ "TE": "Identify logical relationships between two text segments.",
55
+ "SA": "Classify the sentiment (positive, negative, neutral) of a text.",
56
+ "HS": "Detect hate speech in a text.",
57
+ "AT": "Classify whether a clinical statement pertains to an admission test.",
58
+ "WIC": "Identify words in context and their meaning.",
59
+ "FAQ": "Answer frequently asked questions based on given text.",
60
+ "LS": "Identify alternative words in a given context.",
61
+ "SU": "Summarize long text into a shorter version.",
62
+ "NER": "Identify named entities (e.g., persons, locations, organizations) in text.",
63
+ "REL": "Extract and link laboratory test results to the respective tests in clinical narratives.",
64
+ }
65
+
66
+
67
+
68
+
69
  def restart_space():
70
  """Restart the Hugging Face space."""
71
  API.restart_space(repo_id=REPO_ID)
 
106
  search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
107
  hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
108
  filter_columns=[
109
+ #ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
110
+ ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="Few-Shot Learning (FS)"),
111
+ #ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
112
  ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
113
+ #ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
114
  ],
115
  bool_checkboxgroup_label="Hide models",
116
  interactive=False,
 
137
  with gr.TabItem("🏅 EVALITA-LLM Benchmark", elem_id="llm-benchmark-tab-table"):
138
  leaderboard = init_leaderboard(
139
  LEADERBOARD_DF,
140
+ default_selection=['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
141
  hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in
142
+ ['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
143
  )
144
 
145
  # About tab
146
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table"):
147
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
148
 
149
+ '''
150
  # Submission tab
151
  with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table"):
152
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
178
  [model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type],
179
  submission_result,
180
  )
181
+ '''
182
 
183
  # Task-specific leaderboards
184
  for task in ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]:
185
+
186
+ with gr.TabItem(f"{TASK_ICONS[task]}{task}", elem_id="llm-benchmark-tab-table"):
187
+
188
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
189
+
190
+
191
+
192
+
193
+ gr.Markdown(task_description, elem_classes="markdown-text")
194
+
195
+
196
+ gr.Markdown(MEASURE_DESCRIPTION, elem_classes="markdown-text")
197
+
198
+
199
+
200
  leaderboard = init_leaderboard(
201
  prepare_leaderboard_df(LEADERBOARD_DF, task),
202
+ default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
203
  hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in
204
+ ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
205
  )
206
 
207
  # Citation section
get_model_info.py CHANGED
@@ -35,20 +35,33 @@ for filename in os.listdir(input_folder):
35
  # Get the model information from Hugging Face
36
  model_info = api.model_info(model_name)
37
 
 
38
  # Build the dictionary with the required metadata
39
  model_data = {
40
  "model": model_name,
41
  "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
42
  "revision": model_info.sha,
43
- "precision": "bfloat16", # If available, replace with the real value
44
  #"weight_type": "Original",
45
  #"status": "FINISHED",
46
  "submitted_time": str(model_info.created_at),
47
- "model_type": "pretrained",
48
  #"likes": model_info.likes,
49
  #"params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
50
  #"license": model_info.license,
51
  #"private": model_info.private,
52
  }
53
 
54
  # Split model_name into two parts: before and after "/"
@@ -71,4 +84,4 @@ for filename in os.listdir(input_folder):
71
  except Exception as e:
72
  print(f"Error retrieving info for {model_name}: {e}")
73
 
74
- print("Process completed.")
 
35
  # Get the model information from Hugging Face
36
  model_info = api.model_info(model_name)
37
 
38
+ # Compute the number of parameters in billions, if available
39
+ num_params = None
40
+ if model_info.safetensors and "BF16" in model_info.safetensors.parameters:
41
+ num_params = model_info.safetensors.parameters["BF16"] / 1e9 # Convert to billions
42
+
43
+ # Extract the language (it may be a list, so take the first one if present)
44
+ # Extract and concatenate the languages
45
+ language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else ""
46
+
47
+ print(model_info)
48
+
49
  # Build the dictionary with the required metadata
50
  model_data = {
51
  "model": model_name,
52
  "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
53
  "revision": model_info.sha,
54
+ #"precision": "bfloat16", # If available, replace with the real value
55
  #"weight_type": "Original",
56
  #"status": "FINISHED",
57
  "submitted_time": str(model_info.created_at),
58
+ #"model_type": "pretrained",
59
  #"likes": model_info.likes,
60
  #"params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
61
  #"license": model_info.license,
62
  #"private": model_info.private,
63
+ "num_params_billion": num_params, # Number of parameters in billions
64
+ "language": language, # Extracted language
65
  }
66
 
67
  # Split model_name into two parts: before and after "/"
 
84
  except Exception as e:
85
  print(f"Error retrieving info for {model_name}: {e}")
86
 
87
+ print("Process completed1.")
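
Editor's note: the BF16-only lookup added above leaves num_params as None for checkpoints stored in other dtypes (F16, F32, quantized formats). A hedged alternative sketch, summing over whatever dtypes the safetensors metadata reports (it reuses the model_info object the script already has; behaviour for repos without safetensors metadata is unchanged):

# Sketch: count parameters across all reported dtypes instead of assuming "BF16".
num_params = None
if model_info.safetensors and model_info.safetensors.parameters:
    num_params = sum(model_info.safetensors.parameters.values()) / 1e9  # in billions
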
src/about.py CHANGED
@@ -95,7 +95,9 @@ NUM_FEWSHOT = 0 # Change with your few shot
95
 
96
  # Your leaderboard name
97
 
98
- TITLE = """<h1 align="center" id="space-title">Work in progress!</h1>"""
99
 
100
 
101
  # What does your leaderboard evaluate?
 
95
 
96
  # Your leaderboard name
97
 
98
+ #TITLE = """<h1 align="center" id="space-title">Work in progress!</h1>"""
99
+ # Your leaderboard name
100
+ TITLE = """<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀</h1>"""
101
 
102
 
103
  # What does your leaderboard evaluate?
src/display/utils.py CHANGED
@@ -23,18 +23,26 @@ class ColumnContent:
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
- auto_eval_column_dict.append(["fewshot", ColumnContent, ColumnContent("Few-Shot", "str", True)])
29
  #Scores
30
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
31
  for task in Tasks:
32
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
33
  # Model information
34
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
35
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
36
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
37
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
38
  auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
39
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
40
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
@@ -50,7 +58,7 @@ class EvalQueueColumn: # Queue column
50
  model = ColumnContent("model", "markdown", True)
51
  revision = ColumnContent("revision", "str", True)
52
  private = ColumnContent("private", "bool", True)
53
- precision = ColumnContent("precision", "str", True)
54
  weight_type = ColumnContent("weight_type", "str", "Original")
55
  status = ColumnContent("status", "str", True)
56
 
@@ -84,6 +92,34 @@ class ModelType(Enum):
84
  return ModelType.IFT
85
  return ModelType.Unknown
86
 
87
  class WeightType(Enum):
88
  Adapter = ModelDetails("Adapter")
89
  Original = ModelDetails("Original")
@@ -133,4 +169,5 @@ new_column_dict.append(["ID", NewColumnContent, NewColumnContent("ID", "str", Tr
133
  NewColumn = make_dataclass("NewColumn", new_column_dict, frozen=True)
134
 
135
  # Includi questi nuovi valori nei COLS o in altre variabili di configurazione, se necessario
136
- NEW_COLS = [c.name for c in fields(NewColumn) if not c.hidden]
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
+ #auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
+
28
+ auto_eval_column_dict.append(["fewshot_type", ColumnContent, ColumnContent("FS", "str", True, never_hidden=True)])
29
+
30
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
31
+ #auto_eval_column_dict.append(["fewshot", ColumnContent, ColumnContent("Few-Shot", "str", True)])
32
+
33
+
34
+
35
+
36
  #Scores
37
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg. Combined Performance ⬆️", "number", True)])
38
  for task in Tasks:
39
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
40
+
41
  # Model information
42
+ #auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
43
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
44
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
45
+ #auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
46
  auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
47
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
48
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 
58
  model = ColumnContent("model", "markdown", True)
59
  revision = ColumnContent("revision", "str", True)
60
  private = ColumnContent("private", "bool", True)
61
+ #precision = ColumnContent("precision", "str", True)
62
  weight_type = ColumnContent("weight_type", "str", "Original")
63
  status = ColumnContent("status", "str", True)
64
 
 
92
  return ModelType.IFT
93
  return ModelType.Unknown
94
 
95
+
96
+
97
+ @dataclass
98
+ class FewShotDetails:
99
+ name: str
100
+ symbol: str = "" # emoji
101
+
102
+ class FewShotType(Enum):
103
+ ZS = FewShotDetails(name="zero-shot", symbol="0️⃣")
104
+ FS = FewShotDetails(name="5-few-shot", symbol="5️⃣")
105
+ Unknown = FewShotDetails(name="unknown", symbol="❓")
106
+
107
+ def to_str(self, separator=" "):
108
+ return f"{self.value.symbol}{separator}{self.value.name}"
109
+
110
+ @staticmethod
111
+ def from_num_fewshot(num_fewshot):
112
+ """Determines FewShotType based on num_fewshot."""
113
+ if num_fewshot == 0:
114
+ return FewShotType.ZS
115
+ if num_fewshot == 5:
116
+ return FewShotType.FS
117
+ return FewShotType.Unknown
118
+
119
+
120
+
121
+
122
+
123
  class WeightType(Enum):
124
  Adapter = ModelDetails("Adapter")
125
  Original = ModelDetails("Original")
 
169
  NewColumn = make_dataclass("NewColumn", new_column_dict, frozen=True)
170
 
171
  # Include these new values in COLS or other configuration variables, if needed
172
+ NEW_COLS = [c.name for c in fields(NewColumn) if not c.hidden]
173
+
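
Editor's note: a small usage sketch for the FewShotType enum introduced above (the import path is assumed from the other modules touched by this commit):

from src.display.utils import FewShotType

# Map a results file's num_fewshot value onto the leaderboard's "FS" column.
fs = FewShotType.from_num_fewshot(5)
print(fs.to_str(" "))                                 # 5️⃣ 5-few-shot
print(FewShotType.from_num_fewshot(3).value.symbol)   # ❓  (any other count maps to Unknown)
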
src/leaderboard/read_evals.py CHANGED
@@ -1,259 +1,95 @@
1
- import glob
2
- import json
3
- import math
4
- import os
5
- from dataclasses import dataclass
6
-
7
- import dateutil
8
- import numpy as np
9
-
10
- from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
- from src.submission.check_validity import is_model_on_hub
13
-
14
-
15
- @dataclass
16
- class EvalResult:
17
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
- """
19
- eval_name: str # org_model_precision (uid)
20
- full_model: str # org/model (path on hub)
21
- org: str
22
- model: str
23
- revision: str # commit hash, "" if main
24
- results: dict
25
- average_CPS: str
26
- fewshot: str
27
- precision: Precision = Precision.Unknown
28
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
29
- weight_type: WeightType = WeightType.Original # Original or Adapter
30
- architecture: str = "Unknown"
31
- license: str = "?"
32
- likes: int = 0
33
- num_params: int = 0
34
- date: str = "" # submission date of request file
35
- still_on_hub: bool = False
36
-
37
- @classmethod
38
- def init_from_json_file(self, json_filepath):
39
- """Inits the result from the specific model result file"""
40
- with open(json_filepath) as fp:
41
- data = json.load(fp)
42
-
43
- config = data.get("config")
44
-
45
- average_CPS = data.get("average_CPS")
46
-
47
- num_fewshot = config.get("num_fewshot", 0) # Default to 0
48
- try:
49
- num_fewshot = int(num_fewshot) # Convert to int if possible
50
- except ValueError:
51
- num_fewshot = 0 # If the conversion fails, fall back to 0
52
-
53
-
54
- precision = config.get("precision")
55
-
56
- print(precision)
57
-
58
- print(config, num_fewshot)
59
-
60
- # Precision
61
- precision = Precision.from_str(config.get("model_dtype"))
62
-
63
- model_type = config.get("model_type")
64
- # Change: convert model_type into an Enum object (if it is an Enum)
65
- model_type = ModelType.from_str(model_type) if model_type else None
66
-
67
- print("=====================", model_type, config.get("model_name"))
68
-
69
-
70
- # Get model and org
71
- org_and_model = config.get("model_name", config.get("model_args", None))
72
- org_and_model = org_and_model.split("/", 1)
73
-
74
- print(precision.value.name)
75
-
76
- if len(org_and_model) == 1:
77
- org = None
78
- model = org_and_model[0]
79
- #result_key = f"{model}_{precision.value.name}"
80
- result_key = f"{model}_{num_fewshot}"
81
- else:
82
- org = org_and_model[0]
83
- model = org_and_model[1]
84
- #result_key = f"{org}_{model}_{precision.value.name}"
85
- result_key = f"{org}_{model}_{num_fewshot}"
86
- full_model = "/".join(org_and_model)
87
-
88
- still_on_hub, _, model_config = is_model_on_hub(
89
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
90
- )
91
- architecture = "?"
92
- if model_config is not None:
93
- architectures = getattr(model_config, "architectures", None)
94
- if architectures:
95
- architecture = ";".join(architectures)
96
-
97
- # Extract results available in this file (some results are split in several files)
98
- results = {}
99
- for task in Tasks:
100
- task = task.value
101
-
102
- '''
103
- # We average all scores of a given metric (not all metrics are present in all files)
104
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
105
- if accs.size == 0 or any([acc is None for acc in accs]):
106
- continue
107
-
108
- mean_acc = np.mean(accs) * 100.0
109
- results[task.benchmark] = mean_acc
110
- '''
111
-
112
- for k, v in data["tasks"].items():
113
- #if task.benchmark == k:
114
- if task.benchmark[:-2] == k:
115
- # print(k, "==================", v)
116
- # results[task.benchmark] = v[task.cps]
117
-
118
- #print(task.benchmark, v[task.metric])
119
-
120
- if "Best Prompt Id" in task.col_name:
121
- results[task.benchmark] = int(v[task.metric_type][-1:])
122
- #print(results[task.benchmark],v[task.metric_type][-1:])
123
- else:
124
- results[task.benchmark] = v[task.metric_type]
125
-
126
-
127
- #results[task.benchmark + "_" + task.metric] = 1.0
128
-
129
-
130
- #results[task.benchmark] = v[task.accuracy]
131
- # print("======", results[task.benchmark])
132
- #results[task.benchmark] = 1.0
133
-
134
- return self(
135
- eval_name=result_key,
136
- full_model=full_model,
137
- org=org,
138
- model=model,
139
- results=results,
140
- average_CPS=average_CPS,
141
- fewshot=num_fewshot,
142
- model_type=model_type,
143
- precision=precision,
144
- revision= config.get("model_sha", ""),
145
- still_on_hub=still_on_hub,
146
- architecture=architecture
147
- )
148
-
149
- def update_with_request_file(self, requests_path):
150
- """Finds the relevant request file for the current model and updates info with it"""
151
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
152
-
153
- try:
154
- with open(request_file, "r") as f:
155
- request = json.load(f)
156
- self.model_type = ModelType.from_str(request.get("model_type", ""))
157
- self.weight_type = WeightType[request.get("weight_type", "Original")]
158
- self.license = request.get("license", "?")
159
- self.likes = request.get("likes", 0)
160
- self.num_params = request.get("params", 0)
161
- self.date = request.get("submitted_time", "")
162
- except Exception:
163
- print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
164
-
165
- def to_dict(self):
166
- """Converts the Eval Result to a dict compatible with our dataframe display"""
167
- #average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
168
- average = self.average_CPS
169
- fewshot = self.fewshot
170
- print("?????", fewshot)
171
- data_dict = {
172
- "eval_name": self.eval_name, # not a column, just a save name,
173
- AutoEvalColumn.precision.name: self.precision.value.name,
174
- #AutoEvalColumn.model_type.name: self.model_type.value.name,
175
- #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
176
-
177
- AutoEvalColumn.model_type.name: self.model_type.value.name if self.model_type else "Unknown",
178
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol if self.model_type else "Unknown",
179
-
180
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
181
- AutoEvalColumn.architecture.name: self.architecture,
182
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
183
- AutoEvalColumn.revision.name: self.revision,
184
- AutoEvalColumn.average.name: average,
185
- AutoEvalColumn.fewshot.name: fewshot,
186
- AutoEvalColumn.license.name: self.license,
187
- AutoEvalColumn.likes.name: self.likes,
188
- AutoEvalColumn.params.name: self.num_params,
189
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
190
- }
191
-
192
- for task in Tasks:
193
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
194
-
195
- return data_dict
196
-
197
-
198
- def get_request_file_for_model(requests_path, model_name, precision):
199
- """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
200
- request_files = os.path.join(
201
- requests_path,
202
- f"{model_name}_eval_request_*.json",
203
- )
204
- request_files = glob.glob(request_files)
205
-
206
- # Select correct request file (precision)
207
- request_file = ""
208
- request_files = sorted(request_files, reverse=True)
209
- for tmp_request_file in request_files:
210
- with open(tmp_request_file, "r") as f:
211
- req_content = json.load(f)
212
- if (
213
- req_content["status"] in ["FINISHED"]
214
- and req_content["precision"] == precision.split(".")[-1]
215
- ):
216
- request_file = tmp_request_file
217
- return request_file
218
-
219
-
220
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
221
- """From the path of the results folder root, extract all needed info for results"""
222
- model_result_filepaths = []
223
-
224
- for root, _, files in os.walk(results_path):
225
- # We should only have json files in model results
226
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
227
- continue
228
-
229
- # Sort the files by date
230
- try:
231
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
232
- except dateutil.parser._parser.ParserError:
233
- files = [files[-1]]
234
-
235
- for file in files:
236
- model_result_filepaths.append(os.path.join(root, file))
237
-
238
- eval_results = {}
239
- for model_result_filepath in model_result_filepaths:
240
- # Creation of result
241
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
242
- #eval_result.update_with_request_file(requests_path)
243
-
244
- # Store results of same eval together
245
- eval_name = eval_result.eval_name
246
- if eval_name in eval_results.keys():
247
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
248
- else:
249
- eval_results[eval_name] = eval_result
250
-
251
- results = []
252
- for v in eval_results.values():
253
- try:
254
- v.to_dict() # we test if the dict version is complete
255
- results.append(v)
256
- except KeyError: # not all eval values present
257
- continue
258
-
259
- return results
 
1
+ import glob
2
+ import json
3
+ import math
4
+ import os
5
+ from dataclasses import dataclass
6
+ from src.display.formatting import make_clickable_model
7
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, FewShotType
8
+ from src.submission.check_validity import is_model_on_hub
9
+
10
+ @dataclass
11
+ class EvalResult:
12
+ eval_name: str
13
+ full_model: str
14
+ org: str
15
+ model: str
16
+ revision: str
17
+ results: dict
18
+ average_CPS: str
19
+ fewshot: int
20
+ fewshot_type: FewShotType = FewShotType.Unknown
21
+ weight_type: WeightType = WeightType.Original
22
+ architecture: str = "Unknown"
23
+ license: str = "?"
24
+ likes: int = 0
25
+ num_params: int = 0
26
+ date: str = ""
27
+ still_on_hub: bool = False
28
+
29
+ @classmethod
30
+ def init_from_json_file(cls, json_filepath):
31
+ with open(json_filepath) as fp:
32
+ data = json.load(fp)
33
+
34
+ config = data.get("config")
35
+ average_CPS = f"{data.get('average_CPS'):.2f}"
36
+
37
+ num_fewshot = int(config.get("num_fewshot", 0))
38
+ fewshot_type = FewShotType.from_num_fewshot(num_fewshot)
39
+
40
+ model_type = ModelType.from_str(config.get("model_type")) if config.get("model_type") else None
41
+ num_params = math.ceil(config.get("num_params_billion", 0)) if config.get("num_params_billion") else 0
42
+
43
+ org_and_model = config.get("model_name", "").split("/", 1)
44
+ org, model = (org_and_model if len(org_and_model) == 2 else (None, org_and_model[0]))
45
+
46
+ full_model = "/".join([org, model] if org else [model])
47
+ still_on_hub, _, model_config = is_model_on_hub(full_model, config.get("model_sha", "main"))
48
+
49
+ architecture = ";".join(getattr(model_config, "architectures", [])) if model_config else "?"
50
+
51
+ results = {
52
+ task.value.benchmark: f"{data['tasks'].get(task.value.benchmark, {}).get(task.metric_type, 0):.2f}"
53
+ for task in Tasks
54
+ }
55
+
56
+ return cls(
57
+ eval_name=f"{model}_{num_fewshot}",
58
+ full_model=full_model,
59
+ org=org,
60
+ model=model,
61
+ results=results,
62
+ average_CPS=average_CPS,
63
+ fewshot=fewshot_type,
64
+ fewshot_type=fewshot_type,
65
+ revision=config.get("model_sha", ""),
66
+ still_on_hub=still_on_hub,
67
+ architecture=architecture,
68
+ num_params=num_params
69
+ )
70
+
71
+ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
72
+ model_result_filepaths = [
73
+ os.path.join(root, file)
74
+ for root, _, files in os.walk(results_path)
75
+ for file in sorted(files, key=lambda x: x.split("_")[-1], reverse=True) if file.endswith(".json")
76
+ ]
77
+
78
+ eval_results = {}
79
+ for model_result_filepath in model_result_filepaths:
80
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
81
+ eval_name = eval_result.eval_name
82
+ if eval_name not in eval_results:
83
+ eval_results[eval_name] = eval_result
84
+ else:
85
+ eval_results[eval_name].results.update(eval_result.results)
86
+
87
+ results = []
88
+ for v in eval_results.values():
89
+ try:
90
+ v.to_dict() # Test if the dict version is complete
91
+ results.append(v)
92
+ except KeyError:
93
+ continue
94
+
95
+ return results
 
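For orientation, the rewritten loader above expects one JSON result file per run, shaped roughly as in the sketch below. The field names are taken from the parsing code itself; the task key ("TE") and metric key ("accuracy") are hypothetical placeholders, since the real names come from the `Tasks` enum in src/display/utils.py, which is not part of this diff.

```python
import json

# Sketch of a result file as EvalResult.init_from_json_file appears to read it.
sample_result = {
    "config": {
        "model_name": "org-name/model-name",  # split into org and model
        "model_sha": "main",                  # revision used for the Hub check
        "model_type": "pretrained",           # parsed via ModelType.from_str
        "num_fewshot": 0,                     # drives FewShotType.from_num_fewshot
        "num_params_billion": 7.2,            # rounded up with math.ceil
    },
    "average_CPS": 55.31,                     # formatted to two decimals for display
    "tasks": {
        "TE": {"accuracy": 61.25},            # hypothetical benchmark/metric keys
    },
}

with open("results_example.json", "w") as fp:
    json.dump(sample_result, fp, indent=2)
```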
src/leaderboard/read_evals_old.py ADDED
@@ -0,0 +1,296 @@
1
+ import glob
2
+ import json
3
+ import math
4
+ import os
5
+ from dataclasses import dataclass
6
+
7
+ import dateutil
8
+ import numpy as np
9
+
10
+ #from get_model_info import num_params
11
+ from src.display.formatting import make_clickable_model
12
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, FewShotType
13
+ from src.submission.check_validity import is_model_on_hub
14
+
15
+
16
+ @dataclass
17
+ class EvalResult:
18
+ """Represents one full evaluation. Built from a combination of the result and request file for a given run.
19
+ """
20
+ eval_name: str # org_model_precision (uid)
21
+ full_model: str # org/model (path on hub)
22
+ org: str
23
+ model: str
24
+ revision: str # commit hash, "" if main
25
+ results: dict
26
+ average_CPS: str
27
+ fewshot: int
28
+ #fewshot_type: str
29
+ #precision: Precision = Precision.Unknown
30
+ #model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
31
+ fewshot_type: FewShotType = FewShotType.Unknown
32
+ weight_type: WeightType = WeightType.Original # Original or Adapter
33
+ architecture: str = "Unknown"
34
+ license: str = "?"
35
+ likes: int = 0
36
+ num_params: int = 0
37
+ date: str = "" # submission date of request file
38
+ still_on_hub: bool = False
39
+
40
+ @classmethod
41
+ def init_from_json_file(self, json_filepath):
42
+ """Inits the result from the specific model result file"""
43
+ with open(json_filepath) as fp:
44
+ data = json.load(fp)
45
+
46
+ config = data.get("config")
47
+
48
+ average_CPS = f"{data.get('average_CPS'):.2f}"
49
+
50
+ num_fewshot = config.get("num_fewshot", 0) # Default to 0
51
+ try:
52
+ num_fewshot = int(num_fewshot) # Convert to an integer if possible
53
+ except ValueError:
54
+ num_fewshot = 0 # If the conversion fails, fall back to 0
55
+
56
+ # Determine the few-shot type (ZS or FS) based on num_fewshot
57
+ fewshot_type = FewShotType.from_num_fewshot(num_fewshot) # Use the new
58
+
59
+
60
+ #precision = config.get("precision")
61
+
62
+ #print(precision)
63
+
64
+ #print(config, num_fewshot)
65
+
66
+ # Precision
67
+ #precision = Precision.from_str(config.get("model_dtype"))
68
+
69
+ model_type = config.get("model_type")
70
+ # Change: convert model_type into an Enum object (if it is an Enum)
71
+ model_type = ModelType.from_str(model_type) if model_type else None
72
+
73
+ #print("=====================", model_type, config.get("model_name"))
74
+
75
+ # Initialize num_params with a default value (e.g., 0)
76
+ num_params = int(0)
77
+ # Check whether "num_params_billion" exists in config and is not null
78
+ num_params_billion = config.get("num_params_billion")
79
+ if num_params_billion is not None:
80
+ num_params = math.ceil(num_params_billion)
81
+
82
+ print("^^^^^^^^^^^^^^^^^^^^^^^^^", num_params, config.get("num_params_billion"))
83
+
84
+
85
+
86
+ # Get model and org
87
+ org_and_model = config.get("model_name", config.get("model_args", None))
88
+ org_and_model = org_and_model.split("/", 1)
89
+
90
+ #print(precision.value.name)
91
+
92
+ if len(org_and_model) == 1:
93
+ org = None
94
+ model = org_and_model[0]
95
+ #result_key = f"{model}_{precision.value.name}"
96
+ result_key = f"{model}_{num_fewshot}"
97
+ else:
98
+ org = org_and_model[0]
99
+ model = org_and_model[1]
100
+ #result_key = f"{org}_{model}_{precision.value.name}"
101
+ result_key = f"{org}_{model}_{num_fewshot}"
102
+ full_model = "/".join(org_and_model)
103
+
104
+ still_on_hub, _, model_config = is_model_on_hub(
105
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
106
+ )
107
+ architecture = "?"
108
+ if model_config is not None:
109
+ architectures = getattr(model_config, "architectures", None)
110
+ if architectures:
111
+ architecture = ";".join(architectures)
112
+
113
+ # Extract results available in this file (some results are split in several files)
114
+ results = {}
115
+ for task in Tasks:
116
+ task = task.value
117
+
118
+ '''
119
+ # We average all scores of a given metric (not all metrics are present in all files)
120
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
121
+ if accs.size == 0 or any([acc is None for acc in accs]):
122
+ continue
123
+
124
+ mean_acc = np.mean(accs) * 100.0
125
+ results[task.benchmark] = mean_acc
126
+ '''
127
+
128
+ for k, v in data["tasks"].items():
129
+ #if task.benchmark == k:
130
+ if task.benchmark[:-2] == k:
131
+ # print(k, "==================", v)
132
+ # results[task.benchmark] = v[task.cps]
133
+
134
+ #print(task.benchmark, v[task.metric])
135
+
136
+ if "Best Prompt Id" in task.col_name:
137
+ results[task.benchmark] = int(v[task.metric_type][-1:])
138
+ #print(results[task.benchmark],v[task.metric_type][-1:])
139
+ else:
140
+ #results[task.benchmark] = round(v[task.metric_type], 2)
141
+ # Format the value to 2 decimal places (ensure it's always shown as xx.xx)
142
+ results[task.benchmark] = f"{v[task.metric_type]:.2f}" # Ensure two decimals for display
143
+
144
+
145
+ #results[task.benchmark + "_" + task.metric] = 1.0
146
+
147
+
148
+ #results[task.benchmark] = v[task.accuracy]
149
+ # print("======", results[task.benchmark])
150
+ #results[task.benchmark] = 1.0
151
+
152
+ return self(
153
+ eval_name=result_key,
154
+ full_model=full_model,
155
+ org=org,
156
+ model=model,
157
+ results=results,
158
+ average_CPS=average_CPS,
159
+ fewshot_type=fewshot_type, # Set the fewshot type (ZS or FS)
160
+ fewshot=num_fewshot,
161
+ #model_type=model_type,
162
+ #precision=precision,
163
+ revision= config.get("model_sha", ""),
164
+ still_on_hub=still_on_hub,
165
+ architecture=architecture,
166
+ num_params=num_params
167
+ )
168
+
169
+ '''
170
+ def update_with_request_file(self, requests_path):
171
+ """Finds the relevant request file for the current model and updates info with it"""
172
+ request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
173
+
174
+ try:
175
+ with open(request_file, "r") as f:
176
+ request = json.load(f)
177
+ self.model_type = ModelType.from_str(request.get("model_type", ""))
178
+ self.weight_type = WeightType[request.get("weight_type", "Original")]
179
+ self.license = request.get("license", "?")
180
+ self.likes = request.get("likes", 0)
181
+ self.num_params = request.get("params", 0)
182
+ self.date = request.get("submitted_time", "")
183
+ except Exception:
184
+ print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
185
+ '''
186
+
187
+ def to_dict(self):
188
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
189
+ #average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
190
+ average = self.average_CPS
191
+ fewshot = self.fewshot
192
+
193
+ # Get the FewShotType symbol, analogously to ModelType
194
+ fewshot_type_symbol = (
195
+ self.fewshot_type.value.symbol if isinstance(self.fewshot_type, FewShotType) else "❓"
196
+ )
197
+
198
+ #("?????", fewshot)
199
+ data_dict = {
200
+ "eval_name": self.eval_name, # not a column, just a save name,
201
+ #AutoEvalColumn.precision.name: self.precision.value.name,
202
+ #AutoEvalColumn.model_type.name: self.model_type.value.name,
203
+ #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
204
+
205
+ #AutoEvalColumn.model_type.name: self.model_type.value.name if self.model_type else "Unknown",
206
+ #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol if self.model_type else "Unknown",
207
+
208
+
209
+
210
+ AutoEvalColumn.fewshot_type.name: fewshot_type_symbol, # Correct symbol for the fewshot type
211
+
212
+
213
+
214
+
215
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
216
+ AutoEvalColumn.architecture.name: self.architecture,
217
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
218
+ AutoEvalColumn.revision.name: self.revision,
219
+ AutoEvalColumn.average.name: average,
220
+ #AutoEvalColumn.fewshot.name: fewshot,
221
+ AutoEvalColumn.license.name: self.license,
222
+ AutoEvalColumn.likes.name: self.likes,
223
+ AutoEvalColumn.params.name: self.num_params,
224
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
225
+ }
226
+
227
+ for task in Tasks:
228
+ data_dict[task.value.col_name] = self.results[task.value.benchmark]
229
+
230
+ return data_dict
231
+
232
+ '''
233
+ def get_request_file_for_model(requests_path, model_name, precision):
234
+ """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
235
+ request_files = os.path.join(
236
+ requests_path,
237
+ f"{model_name}_eval_request_*.json",
238
+ )
239
+ request_files = glob.glob(request_files)
240
+
241
+ # Select correct request file (precision)
242
+ request_file = ""
243
+ request_files = sorted(request_files, reverse=True)
244
+ for tmp_request_file in request_files:
245
+ with open(tmp_request_file, "r") as f:
246
+ req_content = json.load(f)
247
+ if (
248
+ req_content["status"] in ["FINISHED"]
249
+ and req_content["precision"] == precision.split(".")[-1]
250
+ ):
251
+ request_file = tmp_request_file
252
+ return request_file
253
+ '''
254
+
255
+
256
+
257
+ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
258
+ """From the path of the results folder root, extract all needed info for results"""
259
+ model_result_filepaths = []
260
+
261
+ for root, _, files in os.walk(results_path):
262
+ # We should only have json files in model results
263
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
264
+ continue
265
+
266
+ # Sort the files by date
267
+ try:
268
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
269
+ except dateutil.parser._parser.ParserError:
270
+ files = [files[-1]]
271
+
272
+ for file in files:
273
+ model_result_filepaths.append(os.path.join(root, file))
274
+
275
+ eval_results = {}
276
+ for model_result_filepath in model_result_filepaths:
277
+ # Creation of result
278
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
279
+ #eval_result.update_with_request_file(requests_path)
280
+
281
+ # Store results of same eval together
282
+ eval_name = eval_result.eval_name
283
+ if eval_name in eval_results.keys():
284
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
285
+ else:
286
+ eval_results[eval_name] = eval_result
287
+
288
+ results = []
289
+ for v in eval_results.values():
290
+ try:
291
+ v.to_dict() # we test if the dict version is complete
292
+ results.append(v)
293
+ except KeyError: # not all eval values present
294
+ continue
295
+
296
+ return results
src/populate.py CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
 
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
+ from src.leaderboard.read_evals_old import get_raw_eval_results
9
 
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
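The hunk above only swaps the import to the preserved module; the body of get_leaderboard_df is unchanged and not shown here. Purely as an assumption about how such a function is typically assembled from the loader, a minimal sketch (the function name and the sorting/column steps are illustrative, not the repo's actual implementation):

```python
import pandas as pd

from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals_old import get_raw_eval_results


def build_leaderboard_df(results_path: str, requests_path: str, cols: list) -> pd.DataFrame:
    # One row per EvalResult, using the to_dict() representation defined above.
    raw_results = get_raw_eval_results(results_path, requests_path)
    df = pd.DataFrame([r.to_dict() for r in raw_results])
    # average_CPS is stored as a formatted string, so cast it for a numeric sort.
    df[AutoEvalColumn.average.name] = df[AutoEvalColumn.average.name].astype(float)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    return df[cols]
```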
src/submission/check_validity.py CHANGED
@@ -64,7 +64,7 @@ def get_model_size(model_info: ModelInfo, precision: str):
64
  try:
65
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
  except (AttributeError, TypeError):
67
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
68
 
69
  size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
  model_size = size_factor * model_size
 
64
  try:
65
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
  except (AttributeError, TypeError):
67
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in example_app.py
68
 
69
  size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
  model_size = size_factor * model_size
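For readers skimming the diff, the function touched above is driven by Hub metadata; a minimal usage sketch, assuming a placeholder model id and precision string (safetensors totals may be missing for some repos, in which case the except branch shown above returns 0):

```python
from huggingface_hub import HfApi

from src.submission.check_validity import get_model_size

# Placeholder model id; any Hub repo with safetensors metadata works.
info = HfApi().model_info("org-name/model-name")

# Size in billions of parameters, 0 if it cannot be determined,
# with the 8x factor applied to GPTQ-quantized checkpoints.
print(get_model_size(info, precision="float16"))
```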
src/tasks.py CHANGED
@@ -8,7 +8,6 @@ class Task:
8
  accuracy: str
9
  col_name: str
10
 
11
-
12
  NUM_FEWSHOT = 0 # Change with your few shot
13
  # ---------------------------------------------------
14
 
@@ -20,12 +19,16 @@ INTRODUCTION_TEXT = """
20
  Evalita-LLM is a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. Its distinguishing and innovative features are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) in addition to well-established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, thus mitigating model sensitivity to specific prompts and allowing a fairer and more objective evaluation.
21
  """
22
 
23
- # Which evaluations are you running? how can people reproduce what you have?
 
 
 
 
24
  TE_DESCRIPTION = """### Textual Entailment (TE)
25
  The input consists of two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text.
26
 
27
  | # | Prompt | Answer Choices |
28
- |-----|--------|----------------|
29
  | 1 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
30
  | 2 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
31
  | 3 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
@@ -33,5 +36,137 @@ TE_DESCRIPTION = """### Textual Entailment (TE)
33
  | 5 | Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
34
  | 6 | Devi risolvere un compito di inferenza semantica. Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
35
 
36
- Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above)
37
- """
 
8
  accuracy: str
9
  col_name: str
10
 
 
11
  NUM_FEWSHOT = 0 # Change with your few shot
12
  # ---------------------------------------------------
13
 
 
19
  Evalita-LLM is a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. Its distinguishing and innovative features are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) in addition to well-established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, thus mitigating model sensitivity to specific prompts and allowing a fairer and more objective evaluation.
20
  """
21
 
22
+ #MEASURE_DESCRIPTION = "Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above)"
23
+ MEASURE_DESCRIPTION = "**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the assessed prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above)."
24
+
25
+
26
+ # Tasks Descriptions
27
  TE_DESCRIPTION = """### Textual Entailment (TE)
28
  The input consists of two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text.
29
 
30
  | # | Prompt | Answer Choices |
31
+ |-----|------------|--------------|
32
  | 1 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
33
  | 2 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
34
  | 3 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
 
36
  | 5 | Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
37
  | 6 | Devi risolvere un compito di inferenza semantica. Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
38
 
39
+ """
40
+
41
+
42
+ SA_DESCRIPTION = """### Sentiment Analysis (SA)
43
+ The input is a tweet. The model has to determine the sentiment polarity of the text, categorizing it into one of four classes: positive, negative, neutral, or mixed.
44
+
45
+ | # | Prompt | Answer Choices |
46
+ |-----|--------------------------------------------------------------------------------|-----------------------------|
47
+ | 1 | Qual è il sentiment espresso nel seguente tweet: '{{text}}'? | ["Positivo", "Negativo", "Neutro", "Misto"] |
48
+ | 2 | Devi svolgere un compito di analisi del sentiment. Qual è il sentiment espresso nel seguente tweet: '{{text}}'? | ["Positivo", "Negativo", "Neutro", "Misto"] |
49
+ | 3 | Qual è il sentiment espresso nel seguente tweet: '{{text}}'?\\nA: Positivo\\nB: Negativo\\nC: Neutro\\nD: Misto\\nRisposta: | ["A", "B", "C", "D"] |
50
+ | 4 | Devi svolgere un compito di analisi del sentiment. Qual è il sentiment espresso nel seguente tweet: '{{text}}'?\\nA: Positivo\\nB: Negativo\\nC: Neutro\\nD: Misto\\nRisposta: | ["A", "B", "C", "D"] |
51
+ | 5 | Il seguente tweet: '{{text}}' esprime un sentiment | ["Positivo", "Negativo", "Neutro", "Misto"] |
52
+ | 6 | Devi svolgere un compito di analisi del sentiment. Il seguente tweet: '{{text}}' esprime un sentiment | ["Positivo", "Negativo", "Neutro", "Misto"] |
53
+
54
+ """
55
+
56
+
57
+ HS_DESCRIPTION = """### Hate Speech (HS)
58
+ The input is a tweet. The model has to determine whether the text contains hateful content directed at specific target groups: immigrants, Muslims, or Roma. The output is a binary classification: hateful or not hateful.
59
+
60
+ | # | Prompt | Answer Choices |
61
+ |-----|--------------------------------------------------------------------------------|-------------------------------------------------|
62
+ | 1 | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'? | ["Falso", "Vero"] |
63
+ | 2 | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'? | ["Falso", "Vero"] |
64
+ | 3 | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: Vero\\nB: Falso\\nRisposta: | ["B", "A"] |
65
+ | 4 | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: Vero\\nB: Falso\\nRisposta: | ["B", "A"] |
66
+ | 5 | Il tweet: '{{full_text}}' | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] |
67
+ | 6 | Devi svolgere un compito di identificazione di incitamento all'odio. Il tweet: '{{full_text}}' | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] |
68
+
69
+ """
70
+
71
+
72
+ AT_DESCRIPTION = """### Admission Tests (AT)
73
+ The input is a multiple-choice question with five options (A-E) from Italian medical specialty entrance exams, and the model must identify the correct answer.
74
+
75
+ | # | Prompt | Answer Choices |
76
+ |-----|--------------------------------------------------------------------------------|-----------------------------|
77
+ | 1 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"] |
78
+ | 2 | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"] |
79
+ | 3 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] |
80
+ | 4 | Devi risolvere un compito a scelta multipla. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] |
81
+ | 5 | Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] |
82
+ | 6 | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] |
83
+
84
+ """
85
+
86
+ WIC_DESCRIPTION = """### Word in Context (WIC)
87
+ The input consists of a word (w) and two sentences. The model has to determine whether the word w has the same meaning in both sentences. The output is a binary classification: 1 (same meaning) or 0 (different meaning).
88
+
89
+ | # | Prompt | Answer Choices |
90
+ |-----|--------------------------------------------------------------------------------|-------------------------------------------------|
91
+ | 1 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] |
92
+ | 2 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] |
93
+ | 3 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: Sì\\nB: No\\nRisposta: | ["B", "A"] |
94
+ | 4 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: Sì\\nB: No\\nRisposta: | ["B", "A"] |
95
+ | 5 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] |
96
+ | 6 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] |
97
+
98
+ """
99
+
100
+
101
+ FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ)
102
+ The input is a customer query submitted to the Acquedotto Pugliese service. The model must determine which of the four candidate answers is the correct response to the question.
103
+
104
+ | # | Prompt | Answer Choices |
105
+ |-----|--------------------------------------------------------------------------------|-----------------------------|
106
+ | 1 | Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} |
107
+ | 2 | Devi risolvere un compito di risposte a domande. Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} |
108
+ | 3 | Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] |
109
+ | 4 | Devi risolvere un compito a scelta multipla. Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] |
110
+ | 5 | La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} |
111
+ | 6 | Devi risolvere un compito di risposte a domande. La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} |
112
+
113
+ """
114
+
115
+
116
+ LS_DESCRIPTION = """### Lexical Substitution (LS)
117
+ The input is a sentence containing a target word (w). The model has to replace the target word w with its most suitable, contextually appropriate synonyms.
118
+
119
+ | # | Prompt |
120
+ |-----|--------------------------------------------------------------------------------|
121
+ | 1 | Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |
122
+ | 2 | Devi risolvere un compito di sostituzione lessicale. Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |
123
+
124
+ """
125
+
126
+
127
+ SU_DESCRIPTION = """### Summarization (SUM)
128
+ The input is a news article. The model has to generate a concise summary of the input text, capturing the key information and main points.
129
+
130
+ | # | Prompt |
131
+ |-----|--------------------------------------------------------------------------------|
132
+ | 1 | Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |
133
+ | 2 | Devi risolvere un compito di sintesi automatica del testo. Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |
134
+
135
+ """
136
+
137
+
138
+ NER_DESCRIPTION = """### Named Entity Recognition (NER)
139
+ The input is a sentence. The model has to identify and classify Named Entities into predefined categories such as person, organization, and location.
140
+
141
+ | # | Prompt |
142
+ |-----|--------------------------------------------------------------------------------|
143
+ | 1 | Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |
144
+ | 2 | Devi svolgere un compito di riconoscimento delle entità nei testi. Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |
145
+
146
+ """
147
+
148
+
149
+ REL_DESCRIPTION = """### Relation Extraction (REL)
150
+ The task involves analyzing clinical text to extract relationships between laboratory test results (e.g., blood pressure) and the tests or procedures that produced them (e.g., blood pressure test).
151
+
152
+ | # | Prompt |
153
+ |-----|--------------------------------------------------------------------------------|
154
+ | 1 | Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |
155
+ | 2 | Devi svolgere un compito di estrazione di relazioni da documenti medici. Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |
156
+
157
+ """
158
+
159
+
160
+ # Create a dictionary to map task names to their descriptions
161
+ TASK_DESCRIPTIONS = {
162
+ "TE": TE_DESCRIPTION,
163
+ "SA": SA_DESCRIPTION,
164
+ "HS": HS_DESCRIPTION,
165
+ "AT": AT_DESCRIPTION,
166
+ "WIC": WIC_DESCRIPTION,
167
+ "FAQ": FAQ_DESCRIPTION,
168
+ "LS": LS_DESCRIPTION,
169
+ "SU": SU_DESCRIPTION,
170
+ "NER": NER_DESCRIPTION,
171
+ "REL": REL_DESCRIPTION
172
+ }
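As a worked companion to the MEASURE_DESCRIPTION formula above, a minimal sketch of the Combined Performance computation (the function name and the sample accuracies are illustrative; the leaderboard's own scoring code is not part of this diff):

```python
def combined_performance(prompt_accuracies: list[float]) -> float:
    """Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt."""
    prompt_average = sum(prompt_accuracies) / len(prompt_accuracies)
    best_prompt = max(prompt_accuracies)
    return (1 - (best_prompt - prompt_average) / 100) * best_prompt


# Example: accuracies of one task over six prompts (made-up values).
# A model whose prompts score close together is penalized less than one
# with the same best prompt but a larger spread across prompts.
print(f"{combined_performance([61.2, 58.4, 64.9, 60.1, 57.8, 63.3]):.2f}")
```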