rzanoli committed
Commit d1c3cb5 · 1 Parent(s): ad489d5

Add new scripts for model processing and task management

Files changed (7)
  1. app.py +122 -2
  2. app2.py +153 -0
  3. get_model_info.py +2 -2
  4. src/about.py +125 -8
  5. src/display/utils.py +26 -0
  6. src/envs.py +9 -4
  7. src/leaderboard/read_evals.py +69 -6
app.py CHANGED
@@ -12,6 +12,11 @@ from src.about import (
12
  LLM_BENCHMARKS_TEXT,
13
  TITLE,
14
  )
15
  from src.display.css_html_js import custom_css
16
  from src.display.utils import (
17
  BENCHMARK_COLS,
@@ -58,6 +63,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
58
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
 
60
  def init_leaderboard(dataframe):
 
61
  if dataframe is None or dataframe.empty:
62
  raise ValueError("Leaderboard DataFrame is empty or None.")
63
  return Leaderboard(
@@ -89,14 +95,49 @@ def init_leaderboard(dataframe):
89
  )
90
 
91
 
92
  demo = gr.Blocks(css=custom_css)
93
  with demo:
94
  gr.HTML(TITLE)
95
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
 
101
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -188,6 +229,85 @@ with demo:
188
  submission_result,
189
  )
190
 
191
  with gr.Row():
192
  with gr.Accordion("📙 Citation", open=False):
193
  citation_button = gr.Textbox(
 
12
  LLM_BENCHMARKS_TEXT,
13
  TITLE,
14
  )
15
+
16
+ from src.tasks import (
17
+ TE_DESCRIPTION,
18
+ )
19
+
20
  from src.display.css_html_js import custom_css
21
  from src.display.utils import (
22
  BENCHMARK_COLS,
 
63
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
64
 
65
  def init_leaderboard(dataframe):
66
+ print(dataframe)
67
  if dataframe is None or dataframe.empty:
68
  raise ValueError("Leaderboard DataFrame is empty or None.")
69
  return Leaderboard(
 
95
  )
96
 
97
 
98
+ def init_leaderboard2(dataframe, default_selection=None, hidden_columns=None):
99
+
100
+ print("entrato===============================================")
101
+
102
+ if dataframe is None or dataframe.empty:
103
+ raise ValueError("Leaderboard DataFrame is empty or None.")
104
+ return Leaderboard(
105
+ value=dataframe,
106
+ datatype=[c.type for c in fields(AutoEvalColumn)],
107
+ select_columns=SelectColumns(
108
+ default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
109
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
110
+ label="Select Columns to Display:",
111
+ ),
112
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
113
+ hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
114
+ filter_columns=[
115
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
116
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
117
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
118
+ ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
119
+ ],
120
+ bool_checkboxgroup_label="Hide models",
121
+ interactive=False,
122
+ )
123
+
124
+
125
  demo = gr.Blocks(css=custom_css)
126
  with demo:
127
  gr.HTML(TITLE)
128
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
129
 
130
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
131
+ with gr.TabItem("🏅 EVALITA-LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
132
+ #leaderboard = init_leaderboard(LEADERBOARD_DF)
133
+
134
+ leaderboard = init_leaderboard2(
135
+ LEADERBOARD_DF,
136
+ default_selection=['T', 'Model', "Average ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
137
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
138
+ col not in ['T', 'Model', "Average ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL" ]]
139
+ )
140
+
141
 
142
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
143
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
229
  submission_result,
230
  )
231
 
232
+
233
+ with gr.TabItem("TE", elem_id="llm-benchmark-tab-table", id=4):
234
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
235
+ #leaderboard = init_leaderboard(LEADERBOARD_DF)
236
+
237
+ LEADERBOARD_DF_TE = LEADERBOARD_DF.rename(columns={"TE Prompt Average": "Prompt Average",
238
+ "TE Best Prompt": "Best Prompt",
239
+ "TE Best Prompt Id": "Best Prompt Id",
240
+ "TE": "Combined Performance"})
241
+
242
+ leaderboard = init_leaderboard2(
243
+ LEADERBOARD_DF_TE,
244
+ default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
245
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
246
+ col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
247
+ )
248
+
249
+
250
+ with gr.TabItem("SA", elem_id="llm-benchmark-tab-table", id=5):
251
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
252
+
253
+ LEADERBOARD_DF_SA = LEADERBOARD_DF.rename(columns={"SA Prompt Average": "Prompt Average",
254
+ "SA Best Prompt": "Best Prompt",
255
+ "SA Best Prompt Id": "Best Prompt Id",
256
+ "SA": "Combined Performance"})
257
+
258
+ leaderboard = init_leaderboard2(
259
+ LEADERBOARD_DF_SA,
260
+ default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
261
+ 'Best Prompt Id'],
262
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
263
+ col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
264
+ 'Best Prompt Id']]
265
+ )
266
+
267
+
268
+
269
+
270
+ with gr.TabItem("HS", elem_id="llm-benchmark-tab-table", id=6):
271
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
272
+
273
+ LEADERBOARD_DF_HS = LEADERBOARD_DF.rename(columns={"HS Prompt Average": "Prompt Average",
274
+ "HS Best Prompt": "Best Prompt",
275
+ "HS Best Prompt Id": "Best Prompt Id",
276
+ "HS": "Combined Performance"})
277
+
278
+ leaderboard = init_leaderboard2(
279
+ LEADERBOARD_DF_HS,
280
+ default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
281
+ 'Best Prompt Id'],
282
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
283
+ col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
284
+ 'Best Prompt Id']]
285
+ )
286
+
287
+
288
+
289
+ with gr.TabItem("AT", elem_id="llm-benchmark-tab-table", id=7):
290
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
291
+
292
+ with gr.TabItem("WIC", elem_id="llm-benchmark-tab-table", id=8):
293
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
294
+
295
+ with gr.TabItem("FAQ", elem_id="llm-benchmark-tab-table", id=9):
296
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
297
+
298
+ with gr.TabItem("LS", elem_id="llm-benchmark-tab-table", id=10):
299
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
300
+
301
+ with gr.TabItem("SU", elem_id="llm-benchmark-tab-table", id=11):
302
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
303
+
304
+ with gr.TabItem("NER", elem_id="llm-benchmark-tab-table", id=12):
305
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
306
+
307
+ with gr.TabItem("REL", elem_id="llm-benchmark-tab-table", id=13):
308
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
309
+
310
+
311
  with gr.Row():
312
  with gr.Accordion("📙 Citation", open=False):
313
  citation_button = gr.Textbox(
app2.py CHANGED
@@ -0,0 +1,153 @@
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+
7
+ from src.about import (
8
+ CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT,
9
+ INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
10
+ )
11
+ from src.tasks import TE_DESCRIPTION
12
+ from src.display.css_html_js import custom_css
13
+ from src.display.utils import (
14
+ BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn,
15
+ ModelType, fields, WeightType, Precision
16
+ )
17
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
18
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
19
+ from src.submission.submit import add_new_eval
20
+
21
+
22
+ def restart_space():
23
+ """Restart the Hugging Face space."""
24
+ API.restart_space(repo_id=REPO_ID)
25
+
26
+
27
+ def download_snapshot(repo, local_dir):
28
+ """Try to download a snapshot from the Hugging Face Hub, restarting space on failure."""
29
+ try:
30
+ print(f"Downloading from {repo} to {local_dir}...")
31
+ snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN)
32
+ except Exception as e:
33
+ print(f"Error downloading {repo}: {e}")
34
+ restart_space()
35
+
36
+
37
+ # Space initialization
38
+ download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
39
+ download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
40
+
41
+ # Load leaderboard and evaluation queue data
42
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
43
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
44
+
45
+
46
+ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
47
+ """Initialize a leaderboard with specific columns."""
48
+ if dataframe is None or dataframe.empty:
49
+ raise ValueError("Leaderboard DataFrame is empty or None.")
50
+
51
+ return Leaderboard(
52
+ value=dataframe,
53
+ datatype=[c.type for c in fields(AutoEvalColumn)],
54
+ select_columns=SelectColumns(
55
+ default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
56
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
57
+ label="Select Columns to Display:",
58
+ ),
59
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
60
+ hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
61
+ filter_columns=[
62
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
63
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
64
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
65
+ ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
66
+ ],
67
+ bool_checkboxgroup_label="Hide models",
68
+ interactive=False,
69
+ )
70
+
71
+
72
+ def prepare_leaderboard_df(df, task_prefix):
73
+ """Rename columns for a specific task to a standard format."""
74
+ return df.rename(columns={
75
+ f"{task_prefix} Prompt Average": "Prompt Average",
76
+ f"{task_prefix} Best Prompt": "Best Prompt",
77
+ f"{task_prefix} Best Prompt Id": "Best Prompt Id",
78
+ task_prefix: "Combined Performance"
79
+ })
80
+
81
+
82
+ demo = gr.Blocks(css=custom_css)
83
+ with demo:
84
+ gr.HTML(TITLE)
85
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
86
+
87
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
88
+ # Main leaderboard tab
89
+ with gr.TabItem("🏅 EVALITA-LLM Benchmark", elem_id="llm-benchmark-tab-table"):
90
+ leaderboard = init_leaderboard(
91
+ LEADERBOARD_DF,
92
+ default_selection=['T', 'Model', 'Few-Shot', "Average ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
93
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in
94
+ ['T', 'Model', 'Few-Shot', "Average ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
95
+ )
96
+
97
+ # About tab
98
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table"):
99
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
100
+
101
+ # Submission tab
102
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table"):
103
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
104
+
105
+ for queue_name, queue_df in [
106
+ ("✅ Finished Evaluations", finished_eval_queue_df),
107
+ ("🔄 Running Evaluation Queue", running_eval_queue_df),
108
+ ("⏳ Pending Evaluation Queue", pending_eval_queue_df)
109
+ ]:
110
+ with gr.Accordion(f"{queue_name} ({len(queue_df)})", open=False):
111
+ gr.components.Dataframe(value=queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
112
+
113
+ gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
114
+ with gr.Row():
115
+ model_name_textbox = gr.Textbox(label="Model name")
116
+ revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
117
+ model_type = gr.Dropdown(choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
118
+ label="Model type", multiselect=False, interactive=True)
119
+ precision = gr.Dropdown(choices=[i.value.name for i in Precision if i != Precision.Unknown],
120
+ label="Precision", multiselect=False, value="float16", interactive=True)
121
+ weight_type = gr.Dropdown(choices=[i.value.name for i in WeightType],
122
+ label="Weights type", multiselect=False, value="Original", interactive=True)
123
+ base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
124
+
125
+ submit_button = gr.Button("Submit Eval")
126
+ submission_result = gr.Markdown()
127
+ submit_button.click(
128
+ add_new_eval,
129
+ [model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type],
130
+ submission_result,
131
+ )
132
+
133
+ # Task-specific leaderboards
134
+ for task in ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]:
135
+ with gr.TabItem(task, elem_id="llm-benchmark-tab-table"):
136
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
137
+ leaderboard = init_leaderboard(
138
+ prepare_leaderboard_df(LEADERBOARD_DF, task),
139
+ default_selection=['T', 'Model', 'Few-Shot', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
140
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in
141
+ ['T', 'Model', 'Few-Shot', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
142
+ )
143
+
144
+ # Citation section
145
+ with gr.Accordion("📙 Citation", open=False):
146
+ gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)
147
+
148
+ # Background job to restart space
149
+ scheduler = BackgroundScheduler()
150
+ scheduler.add_job(restart_space, "interval", seconds=1800)
151
+ scheduler.start()
152
+
153
+ demo.queue(default_concurrency_limit=40).launch()
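
For reference, a minimal sketch (not part of the commit) of the column renaming that `prepare_leaderboard_df` performs before each task tab is rendered; the model name and scores below are invented:

```python
import pandas as pd

# Hypothetical leaderboard row; values are illustrative, not real results.
df = pd.DataFrame({
    "Model": ["my-org/my-model"],
    "TE": [61.2],
    "TE Prompt Average": [55.4],
    "TE Best Prompt": [63.0],
    "TE Best Prompt Id": [4],
})

# Same rename that prepare_leaderboard_df(df, "TE") applies.
renamed = df.rename(columns={
    "TE Prompt Average": "Prompt Average",
    "TE Best Prompt": "Best Prompt",
    "TE Best Prompt Id": "Best Prompt Id",
    "TE": "Combined Performance",
})
print(list(renamed.columns))
# ['Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']
```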
get_model_info.py CHANGED
@@ -8,8 +8,8 @@ from huggingface_hub import HfApi
8
  api = HfApi()
9
 
10
  # Folder paths
11
- input_folder = "../evalita_llm_results/models_output/"
12
- output_folder = "../evalita_llm_requests2/"
13
 
14
  # Create the output folder if it does not exist
15
  os.makedirs(output_folder, exist_ok=True)
 
8
  api = HfApi()
9
 
10
  # Folder paths
11
+ input_folder = "../evalita_llm_models_output/"
12
+ output_folder = "../evalita_llm_requests/"
13
 
14
  # Create the output folder if it does not exist
15
  os.makedirs(output_folder, exist_ok=True)
src/about.py CHANGED
@@ -5,15 +5,88 @@ from enum import Enum
5
  class Task:
6
  benchmark: str
7
  metric: str
 
8
  col_name: str
9
 
10
 
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
17
 
18
  NUM_FEWSHOT = 0 # Change with your few shot
19
  # ---------------------------------------------------
@@ -21,19 +94,54 @@ NUM_FEWSHOT = 0 # Change with your few shot
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
32
  LLM_BENCHMARKS_TEXT = f"""
33
- ## How it works
34
 
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
37
 
38
  """
39
 
@@ -69,4 +177,13 @@ If everything is done, check you can launch the EleutherAIHarness on your model
69
 
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
  CITATION_BUTTON_TEXT = r"""
72
  """
 
5
  class Task:
6
  benchmark: str
7
  metric: str
8
+ metric_type: str
9
  col_name: str
10
 
11
 
12
  # Select your tasks here
13
  # ---------------------------------------------------
14
  class Tasks(Enum):
15
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
16
+
17
+ task1 = Task("text-entailment_1", "acc", "CPS", "TE")
18
+ task2 = Task("text-entailment_2", "acc", "average_accuracy", "TE Prompt Average")
19
+ task3 = Task("text-entailment_3", "acc", "best_prompt", "TE Best Prompt")
20
+ task4 = Task("text-entailment_4", "acc", "prompt_id", "TE Best Prompt Id")
21
+
22
+ task5 = Task("sentiment-analysis_1", "acc", "CPS", "SA")
23
+ task6 = Task("sentiment-analysis_2", "acc", "average_accuracy", "SA Prompt Average")
24
+ task7 = Task("sentiment-analysis_3", "acc", "best_prompt", "SA Best Prompt")
25
+ task8 = Task("sentiment-analysis_4", "acc", "prompt_id", "SA Best Prompt Id")
26
+
27
+ task9 = Task("hate-speech-detection_1", "acc", "CPS", "HS")
28
+ task10 = Task("hate-speech-detection_2", "acc", "average_accuracy", "HS Prompt Average")
29
+ task11 = Task("hate-speech-detection_3", "acc", "best_prompt", "HS Best Prompt")
30
+ task12 = Task("hate-speech-detection_4", "acc", "prompt_id", "HS Best Prompt Id")
31
+
32
+ task13 = Task("admission-test_1", "acc", "CPS", "AT")
33
+ task14 = Task("admission-test_2", "acc", "average_accuracy", "AT Prompt Average")
34
+ task15 = Task("admission-test_3", "acc", "best_prompt", "AT Best Prompt")
35
+ task16 = Task("admission-test_4", "acc", "prompt_id", "AT Best Prompt Id")
36
+
37
+ task17 = Task("word-in-context_1", "acc", "CPS", "WIC")
38
+ task18 = Task("word-in-context_2", "acc", "average_accuracy", "WIC Prompt Average")
39
+ task19 = Task("word-in-context_3", "acc", "best_prompt", "WIC Best Prompt")
40
+ task20 = Task("word-in-context_4", "acc", "prompt_id", "WIC Best Prompt Id")
41
+
42
+ task21 = Task("faq_1", "acc", "CPS", "FAQ")
43
+ task22 = Task("faq_2", "acc", "average_accuracy", "FAQ Prompt Average")
44
+ task23 = Task("faq_3", "acc", "best_prompt", "FAQ Best Prompt")
45
+ task24 = Task("faq_4", "acc", "prompt_id", "FAQ Best Prompt Id")
46
+
47
+ task25 = Task("lexical-substitution_1", "acc", "CPS", "LS")
48
+ task26 = Task("lexical-substitution_2", "acc", "average_accuracy", "LS Prompt Average")
49
+ task27 = Task("lexical-substitution_3", "acc", "best_prompt", "LS Best Prompt")
50
+ task28 = Task("lexical-substitution_4", "acc", "prompt_id", "LS Best Prompt Id")
51
+
52
+ task29 = Task("summarization-fanpage_1", "acc", "CPS", "SU")
53
+ task30 = Task("summarization-fanpage_2", "acc", "average_accuracy", "SU Prompt Average")
54
+ task31 = Task("summarization-fanpage_3", "acc", "best_prompt", "SU Best Prompt")
55
+ task32 = Task("summarization-fanpage_4", "acc", "prompt_id", "SU Best Prompt Id")
56
+
57
+ task33 = Task("evalita NER_1", "acc", "CPS", "NER")
58
+ task34 = Task("evalita NER_2", "acc", "average_accuracy", "NER Prompt Average")
59
+ task35 = Task("evalita NER_3", "acc", "best_prompt", "NER Best Prompt")
60
+ task36 = Task("evalita NER_4", "acc", "prompt_id", "NER Best Prompt Id")
61
+
62
+ task37 = Task("relation-extraction_1", "acc", "CPS", "REL")
63
+ task38 = Task("relation-extraction_2", "acc", "average_accuracy", "REL Prompt Average")
64
+ task39 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt")
65
+ task40 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id")
66
+
67
+
68
+ '''
69
+ task0 = Task("TextualEntailment", "acc", "Textual Entailment")
70
+ task1 = Task("TextualEntailment_best", "acc", "TextualEntailment Best")
71
+ task2 = Task("Sentiment Analysis", "acc", "Sentiment Analysis")
72
+ task3 = Task("Sentiment Analysis_best", "acc", "Sentiment Analysis_best")
73
+ task4 = Task("Hate Speech", "acc", "Hate Speech")
74
+ task5 = Task("Hate Speech_best", "acc", "Hate Speech_best")
75
+ task6 = Task("Admission Test", "acc", "Admission Test")
76
+ task7 = Task("Admission Test_best", "acc", "Admission Test_best")
77
+ task8 = Task("Word in Context", "acc", "Word in Context")
78
+ task9 = Task("Word in Context_best", "acc", "Word in Context_best")
79
+ task10 = Task("FAQ", "acc", "FAQ")
80
+ task11 = Task("FAQ_best", "acc", "FAQ_best")
81
+ task12 = Task("Lexical Substitution", "acc", "Lexical Substitution")
82
+ task13 = Task("Lexical Substitution_best", "acc", "Lexical Substitution_best")
83
+ task14 = Task("Summarization", "acc", "Summarization")
84
+ task15 = Task("Summarization_best", "acc", "Summarization_best")
85
+ task16 = Task("NER", "acc", "NER")
86
+ task17 = Task("NER_best", "acc", "NER_best")
87
+ task18 = Task("REL", "acc", "REL")
88
+ task19 = Task("REL_best", "acc", "REL_best")
89
+ '''
90
 
91
  NUM_FEWSHOT = 0 # Change with your few shot
92
  # ---------------------------------------------------
 
94
 
95
 
96
  # Your leaderboard name
97
+ TITLE = """<h1 align="center" id="space-title">🚀 Evalita Leaderboard 🚀</h1>"""
98
 
99
  # What does your leaderboard evaluate?
100
  INTRODUCTION_TEXT = """
101
+ Evalita-LLM is a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. Its distinguishing and innovative features are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) in addition to well-established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, thereby mitigating model sensitivity to specific prompts and allowing a fairer and more objective evaluation.
102
  """
103
 
104
  # Which evaluations are you running? how can people reproduce what you have?
105
  LLM_BENCHMARKS_TEXT = f"""
106
+ ### Groups
107
 
108
+ - `evalita-mp`: All tasks (perplexity and non-perplexity based).
109
+ - `evalita-mp_gen`: Only generative tasks.
110
+ - `evalita-mp_mc`: Only perplexity-based tasks.
111
+
112
+ #### Tasks
113
+
114
+ The following Evalita-LLM tasks can also be evaluated in isolation:
115
+ - `evalita-mp_te`: Textual Entailment
116
+ - `evalita-mp_sa`: Sentiment Analysis
117
+ - `evalita-mp_wic`: Word in Context
118
+ - `evalita-mp_hs`: Hate Speech Detection
119
+ - `evalita-mp_at`: Admission Tests
120
+ - `evalita-mp_faq`: FAQ
121
+ - `evalita-mp_sum_fp`: Summarization
122
+ - `evalita-mp_ls`: Lexical Substitution
123
+ - `evalita-mp_ner_group`: Named Entity Recognition
124
+ - `evalita-mp_re`: Relation Extraction
125
+
126
+
127
+ ### Usage
128
+
129
+ ```bash
130
+
131
+ lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks evalita-mp --device cuda:0 --batch_size auto
132
+ ```
133
+
134
+ ### Checklist
135
+
136
+ * [x] Is the task an existing benchmark in the literature?
137
+ * [x] Have you referenced the original paper that introduced the task?
138
+ * [x] If yes, does the original paper provide a reference implementation?
139
+ * [x] Yes, original implementation contributed by author of the benchmark
140
+
141
+ If other tasks on this dataset are already supported:
142
+ * [x] Is the "Main" variant of this task clearly denoted?
143
+ * [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
144
+ * [x] Have you noted which, if any, published evaluation setups are matched by this variant?
145
 
146
  """
147
 
 
177
 
178
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
179
  CITATION_BUTTON_TEXT = r"""
180
+ @misc{magnini2025evalitallmbenchmarkinglargelanguage,
181
+ title={Evalita-LLM: Benchmarking Large Language Models on Italian},
182
+ author={Bernardo Magnini and Roberto Zanoli and Michele Resta and Martin Cimmino and Paolo Albano and Marco Madeddu and Viviana Patti},
183
+ year={2025},
184
+ eprint={2502.02289},
185
+ archivePrefix={arXiv},
186
+ primaryClass={cs.CL},
187
+ url={https://arxiv.org/abs/2502.02289},
188
+ }
189
  """
src/display/utils.py CHANGED
@@ -25,6 +25,7 @@ auto_eval_column_dict = []
25
  # Init
26
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 
28
  #Scores
29
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
@@ -108,3 +109,28 @@ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
 
109
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
 
 
25
  # Init
26
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
+ auto_eval_column_dict.append(["fewshot", ColumnContent, ColumnContent("Few-Shot", "str", True)])
29
  #Scores
30
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
31
  for task in Tasks:
 
109
 
110
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
111
 
112
+
113
+ # Roberto
114
+
115
+ # New values for CPS, AVERAGE, BEST, and ID in the table
116
+ @dataclass
117
+ class NewColumnContent:
118
+ name: str
119
+ type: str
120
+ displayed_by_default: bool
121
+ hidden: bool = False
122
+ never_hidden: bool = False
123
+
124
+ # Initialize the new values
125
+ new_column_dict = []
126
+ # Add CPS, AVERAGE, BEST, ID
127
+ new_column_dict.append(["CPS", NewColumnContent, NewColumnContent("CPS", "number", True)])
128
+ new_column_dict.append(["AVERAGE", NewColumnContent, NewColumnContent("Average ⬆️", "number", True)])
129
+ new_column_dict.append(["BEST", NewColumnContent, NewColumnContent("Best Performance", "number", True)])
130
+ new_column_dict.append(["ID", NewColumnContent, NewColumnContent("ID", "str", True)])
131
+
132
+ # You can use make_dataclass to create the class dynamically, as is done for AutoEvalColumn
133
+ NewColumn = make_dataclass("NewColumn", new_column_dict, frozen=True)
134
+
135
+ # Include these new values in COLS or in other configuration variables, if needed
136
+ NEW_COLS = [c.name for c in fields(NewColumn) if not c.hidden]
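
The `NewColumn` block above reuses the `make_dataclass` pattern applied to `AutoEvalColumn` earlier in the file. A self-contained sketch of that mechanism, with illustrative column entries rather than the leaderboard's real configuration:

```python
from dataclasses import dataclass, make_dataclass


def fields(raw_class):
    # Same helper as in src/display/utils.py: return the default ColumnContent
    # instances stored as class attributes on the generated dataclass.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__"]


@dataclass(frozen=True)  # frozen keeps instances hashable, so they work as field defaults
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


# Each entry is [attribute_name, annotation, default instance].
column_dict = [
    ["CPS", ColumnContent, ColumnContent("CPS", "number", True)],
    ["BEST", ColumnContent, ColumnContent("Best Performance", "number", True, hidden=True)],
]
DemoColumn = make_dataclass("DemoColumn", column_dict, frozen=True)

print([c.name for c in fields(DemoColumn) if not c.hidden])  # ['CPS']
```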
src/envs.py CHANGED
@@ -6,12 +6,17 @@ from huggingface_hub import HfApi
6
  # ----------------------------------
7
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 
10
  # ----------------------------------
11
 
12
- REPO_ID = f"{OWNER}/leaderboard"
13
- QUEUE_REPO = f"{OWNER}/requests"
14
- RESULTS_REPO = f"{OWNER}/results"
15
 
16
  # If you setup a cache later, just change HF_HOME
17
  CACHE_PATH=os.getenv("HF_HOME", ".")
 
6
  # ----------------------------------
7
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
+ #OWNER = "giux78" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
+ OWNER = "evalitahf"
11
  # ----------------------------------
12
 
13
+ #REPO_ID = f"{OWNER}/leaderboard-evalita"
14
+ #QUEUE_REPO = f"{OWNER}/evalita-requests"
15
+ #RESULTS_REPO = f"{OWNER}/evalita-results"
16
+
17
+ REPO_ID = f"{OWNER}/evalita_llm_leaderboard"
18
+ QUEUE_REPO = f"{OWNER}/evalita_llm_requests"
19
+ RESULTS_REPO = f"{OWNER}/evalita_llm_results"
20
 
21
  # If you setup a cache later, just change HF_HOME
22
  CACHE_PATH=os.getenv("HF_HOME", ".")
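
A small optional sanity check, assuming `HF_TOKEN` is exported in the environment (this is not part of the commit), that the repositories configured above exist and are visible to the token:

```python
import os

from huggingface_hub import HfApi

api = HfApi(token=os.environ.get("HF_TOKEN"))
for repo_id, repo_type in [
    ("evalitahf/evalita_llm_leaderboard", "space"),  # REPO_ID
    ("evalitahf/evalita_llm_requests", "dataset"),   # QUEUE_REPO
    ("evalitahf/evalita_llm_results", "dataset"),    # RESULTS_REPO
]:
    print(repo_id, api.repo_exists(repo_id, repo_type=repo_type))
```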
src/leaderboard/read_evals.py CHANGED
@@ -22,6 +22,8 @@ class EvalResult:
22
  model: str
23
  revision: str # commit hash, "" if main
24
  results: dict
25
  precision: Precision = Precision.Unknown
26
  model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
  weight_type: WeightType = WeightType.Original # Original or Adapter
@@ -40,21 +42,47 @@ class EvalResult:
40
 
41
  config = data.get("config")
42
 
43
  # Precision
44
  precision = Precision.from_str(config.get("model_dtype"))
45
 
46
  # Get model and org
47
  org_and_model = config.get("model_name", config.get("model_args", None))
48
  org_and_model = org_and_model.split("/", 1)
49
 
50
  if len(org_and_model) == 1:
51
  org = None
52
  model = org_and_model[0]
53
- result_key = f"{model}_{precision.value.name}"
 
54
  else:
55
  org = org_and_model[0]
56
  model = org_and_model[1]
57
- result_key = f"{org}_{model}_{precision.value.name}"
 
58
  full_model = "/".join(org_and_model)
59
 
60
  still_on_hub, _, model_config = is_model_on_hub(
@@ -71,6 +99,7 @@ class EvalResult:
71
  for task in Tasks:
72
  task = task.value
73
 
 
74
  # We average all scores of a given metric (not all metrics are present in all files)
75
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
  if accs.size == 0 or any([acc is None for acc in accs]):
@@ -78,6 +107,29 @@ class EvalResult:
78
 
79
  mean_acc = np.mean(accs) * 100.0
80
  results[task.benchmark] = mean_acc
81
 
82
  return self(
83
  eval_name=result_key,
@@ -85,6 +137,9 @@ class EvalResult:
85
  org=org,
86
  model=model,
87
  results=results,
88
  precision=precision,
89
  revision= config.get("model_sha", ""),
90
  still_on_hub=still_on_hub,
@@ -109,17 +164,25 @@ class EvalResult:
109
 
110
  def to_dict(self):
111
  """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
113
  data_dict = {
114
  "eval_name": self.eval_name, # not a column, just a save name,
115
  AutoEvalColumn.precision.name: self.precision.value.name,
116
- AutoEvalColumn.model_type.name: self.model_type.value.name,
117
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
118
  AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
  AutoEvalColumn.architecture.name: self.architecture,
120
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
121
  AutoEvalColumn.revision.name: self.revision,
122
  AutoEvalColumn.average.name: average,
 
123
  AutoEvalColumn.license.name: self.license,
124
  AutoEvalColumn.likes.name: self.likes,
125
  AutoEvalColumn.params.name: self.num_params,
@@ -176,7 +239,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
176
  for model_result_filepath in model_result_filepaths:
177
  # Creation of result
178
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
179
- eval_result.update_with_request_file(requests_path)
180
 
181
  # Store results of same eval together
182
  eval_name = eval_result.eval_name
 
22
  model: str
23
  revision: str # commit hash, "" if main
24
  results: dict
25
+ average_CPS: str
26
+ fewshot: str
27
  precision: Precision = Precision.Unknown
28
  model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
29
  weight_type: WeightType = WeightType.Original # Original or Adapter
 
42
 
43
  config = data.get("config")
44
 
45
+ average_CPS = data.get("average_CPS")
46
+
47
+ num_fewshot = config.get("num_fewshot", 0) # Default to 0 if missing
48
+ try:
49
+ num_fewshot = int(num_fewshot) # Convert to an integer if possible
50
+ except ValueError:
51
+ num_fewshot = 0 # If the conversion fails, fall back to 0
52
+
53
+
54
+ precision = config.get("precision")
55
+
56
+ print(precision)
57
+
58
+ print(config, num_fewshot)
59
+
60
  # Precision
61
  precision = Precision.from_str(config.get("model_dtype"))
62
 
63
+ model_type = config.get("model_type")
64
+ # Change: convert model_type to a ModelType enum member (if present)
65
+ model_type = ModelType.from_str(model_type) if model_type else None
66
+
67
+ print("=====================", model_type, config.get("model_name"))
68
+
69
+
70
  # Get model and org
71
  org_and_model = config.get("model_name", config.get("model_args", None))
72
  org_and_model = org_and_model.split("/", 1)
73
 
74
+ print(precision.value.name)
75
+
76
  if len(org_and_model) == 1:
77
  org = None
78
  model = org_and_model[0]
79
+ #result_key = f"{model}_{precision.value.name}"
80
+ result_key = f"{model}_{num_fewshot}"
81
  else:
82
  org = org_and_model[0]
83
  model = org_and_model[1]
84
+ #result_key = f"{org}_{model}_{precision.value.name}"
85
+ result_key = f"{org}_{model}_{num_fewshot}"
86
  full_model = "/".join(org_and_model)
87
 
88
  still_on_hub, _, model_config = is_model_on_hub(
 
99
  for task in Tasks:
100
  task = task.value
101
 
102
+ '''
103
  # We average all scores of a given metric (not all metrics are present in all files)
104
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
105
  if accs.size == 0 or any([acc is None for acc in accs]):
 
107
 
108
  mean_acc = np.mean(accs) * 100.0
109
  results[task.benchmark] = mean_acc
110
+ '''
111
+
112
+ for k, v in data["tasks"].items():
113
+ #if task.benchmark == k:
114
+ if task.benchmark[:-2] == k:
115
+ # print(k, "==================", v)
116
+ # results[task.benchmark] = v[task.cps]
117
+
118
+ #print(task.benchmark, v[task.metric])
119
+
120
+ if "Best Prompt Id" in task.col_name:
121
+ results[task.benchmark] = int(v[task.metric_type][-1:])
122
+ #print(results[task.benchmark],v[task.metric_type][-1:])
123
+ else:
124
+ results[task.benchmark] = v[task.metric_type]
125
+
126
+
127
+ #results[task.benchmark + "_" + task.metric] = 1.0
128
+
129
+
130
+ #results[task.benchmark] = v[task.accuracy]
131
+ # print("======", results[task.benchmark])
132
+ #results[task.benchmark] = 1.0
133
 
134
  return self(
135
  eval_name=result_key,
 
137
  org=org,
138
  model=model,
139
  results=results,
140
+ average_CPS=average_CPS,
141
+ fewshot=num_fewshot,
142
+ model_type=model_type,
143
  precision=precision,
144
  revision= config.get("model_sha", ""),
145
  still_on_hub=still_on_hub,
 
164
 
165
  def to_dict(self):
166
  """Converts the Eval Result to a dict compatible with our dataframe display"""
167
+ #average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
168
+ average = self.average_CPS
169
+ fewshot = self.fewshot
170
+ print("?????", fewshot)
171
  data_dict = {
172
  "eval_name": self.eval_name, # not a column, just a save name,
173
  AutoEvalColumn.precision.name: self.precision.value.name,
174
+ #AutoEvalColumn.model_type.name: self.model_type.value.name,
175
+ #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
176
+
177
+ AutoEvalColumn.model_type.name: self.model_type.value.name if self.model_type else "Unknown",
178
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol if self.model_type else "Unknown",
179
+
180
  AutoEvalColumn.weight_type.name: self.weight_type.value.name,
181
  AutoEvalColumn.architecture.name: self.architecture,
182
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
183
  AutoEvalColumn.revision.name: self.revision,
184
  AutoEvalColumn.average.name: average,
185
+ AutoEvalColumn.fewshot.name: fewshot,
186
  AutoEvalColumn.license.name: self.license,
187
  AutoEvalColumn.likes.name: self.likes,
188
  AutoEvalColumn.params.name: self.num_params,
 
239
  for model_result_filepath in model_result_filepaths:
240
  # Creation of result
241
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
242
+ #eval_result.update_with_request_file(requests_path)
243
 
244
  # Store results of same eval together
245
  eval_name = eval_result.eval_name
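
Two parsing details in the new `init_from_json_file` are easy to miss: `num_fewshot` is coerced to an int with a fallback of 0, and the "Best Prompt Id" columns keep only the last character of the stored identifier. A minimal illustration; the `"prompt-4"` identifier format is an assumption:

```python
def coerce_fewshot(value) -> int:
    # Mirrors the try/except in init_from_json_file: ints or numeric strings pass,
    # anything else falls back to 0 (this sketch also tolerates None).
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


print(coerce_fewshot("5"), coerce_fewshot(None), coerce_fewshot("n/a"))  # 5 0 0

# "Best Prompt Id": last character of the stored value, e.g. "prompt-4" -> 4.
print(int("prompt-4"[-1:]))  # 4
```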