rzanoli committed
Commit dbd3b18 · 1 Parent(s): 12c62aa

Small changes
app.py CHANGED
@@ -14,16 +14,16 @@ from src.submission.submit import add_new_eval
 
 # Define task metadata (icons, names, descriptions)
 TASK_METADATA = {
-    "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": "Identify logical relationships between two text segments."},
-    "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": "Classify the sentiment (positive, negative, neutral) of a text."},
-    "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": "Detect hate speech in a text."},
-    "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": "Classify whether a clinical statement pertains to an admission test."},
-    "WIC": {"icon": "🔀", "name": "Word in Context", "tooltip": "Identify words in context and their meaning."},
-    "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": "Answer frequently asked questions based on given text."},
-    "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": "Identify alternative words in a given context."},
-    "SU": {"icon": "📝", "name": "Summarization", "tooltip": "Summarize long text into a shorter version."},
-    "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": "Identify named entities (e.g., persons, locations, organizations) in text."},
-    "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": "Extract and link laboratory test results to the respective tests in clinical narratives."},
+    "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
+    "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""},
+    "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
+    "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": ""},
+    "WIC": {"icon": "🔀", "name": "Word in Context", "tooltip": ""},
+    "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""},
+    "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""},
+    "SU": {"icon": "📝", "name": "Summarization", "tooltip": ""},
+    "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
+    "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
 }
 
 def restart_space():
@@ -47,8 +47,8 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="Few-Shot Learning (FS)"),
-            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
+            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
+            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -82,6 +82,7 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # Main leaderboard tab
         with gr.TabItem("🏅 EVALITA-LLM Benchmark"):
+
            leaderboard = init_leaderboard(
                LEADERBOARD_DF,
                default_selection=['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
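As a side note, here is a minimal sketch of how TASK_METADATA entries can be turned into the short, icon-prefixed column labels the leaderboard displays; make_label is a hypothetical helper for illustration, not part of the repository:

```python
# Illustrative only: derive display labels such as "📊 TE" from TASK_METADATA.
TASK_METADATA = {
    "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
    "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""},
}

def make_label(code: str) -> str:
    """Hypothetical helper: prefix a task code with its icon."""
    return f"{TASK_METADATA[code]['icon']} {code}"

print([make_label(code) for code in TASK_METADATA])  # ['📊 TE', '😃 SA']
```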
get_model_info.py CHANGED
@@ -1,87 +1,92 @@
+# Reads model output files (including accuracy values) produced by lm-eval-harness,
+# extracts model names, downloads their characteristics from HuggingFace, and saves metadata
+# (such as parameter count and pre-training status) to model-specific JSON files.
 import os
 import re
 import json
 from huggingface_hub import HfApi
 
-# Configura il token di Hugging Face (se necessario)
-#TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
+# Configures the Hugging Face token (if needed)
+# TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
 api = HfApi()
 
-# Percorsi delle cartelle
+# Directory paths
+# input_folder: Directory containing the output files of the lm-eval-harness library, including model accuracy metrics.
 input_folder = "../evalita_llm_models_output/"
+# output_folder: Directory where JSON files with model characteristics will be saved.
 output_folder = "../evalita_llm_requests/"
 
-# Creazione della cartella di output se non esiste
+# Creates the output folder if it doesn't exist
 os.makedirs(output_folder, exist_ok=True)
 
-# Espressione regolare per trovare il nome del modello
+# Regular expression to find the model name
 model_pattern = re.compile(r"pretrained=([\w\-./]+)")
 
-# Scansiona i file nella cartella di input
+# Scans files in the input folder
 for filename in os.listdir(input_folder):
-    file_path = os.path.join(input_folder, filename)
-
-    # Leggi il contenuto del file
-    with open(file_path, "r", encoding="utf-8") as f:
-        content = f.read()
-
-    # Estrai il nome del modello
-    match = model_pattern.search(content)
-    if match:
-        model_name = match.group(1)
-        print(f"Processing model: {model_name}")
-
-        try:
-            # Ottieni le informazioni del modello da Hugging Face
-            model_info = api.model_info(model_name)
-
-            # Calcola il numero di parametri in miliardi, se disponibile
-            num_params = None
-            if model_info.safetensors and "BF16" in model_info.safetensors.parameters:
-                num_params = model_info.safetensors.parameters["BF16"] / 1e9  # Converti in miliardi
-
-            # Estrai la lingua (può essere una lista, quindi prendiamo la prima se esiste)
-            # Estrai e concatena i linguaggi
-            language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else ""
-
-            print(model_info)
-
-            # Costruisci il dizionario con i metadati richiesti
-            model_data = {
-                "model": model_name,
-                "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
-                "revision": model_info.sha,
-                #"precision": "bfloat16",  # Se disponibile, sostituire con un valore reale
-                #"weight_type": "Original",
-                #"status": "FINISHED",
-                "submitted_time": str(model_info.created_at),
-                #"model_type": "pretrained",
-                #"likes": model_info.likes,
-                #"params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
-                #"license": model_info.license,
-                #"private": model_info.private,
-                "num_params_billion": num_params,  # Numero di parametri in miliardi
-                "language": language,  # Lingua estratta
-            }
-
-            # Separare il model_name in due parti: prima e dopo "/"
-            if "/" in model_name:
-                dir_name, file_name = model_name.split("/", 1)
-            else:
-                dir_name, file_name = model_name, model_name  # Se non c'è "/", usa lo stesso nome
-
-            # Creare la cartella per la prima parte del nome del modello
-            model_output_folder = os.path.join(output_folder, dir_name)
-            os.makedirs(model_output_folder, exist_ok=True)
-
-            # Salvare il file JSON nella cartella appropriata
-            output_file = os.path.join(model_output_folder, f"{file_name}.json")
-            with open(output_file, "w", encoding="utf-8") as f:
-                json.dump(model_data, f, indent=4)
-
-            print(f"Saved metadata for {model_name} in {output_file}")
-
-        except Exception as e:
-            print(f"Error retrieving info for {model_name}: {e}")
-
-print("Process completed!")
+    if filename.endswith('.out'):
+        file_path = os.path.join(input_folder, filename)
+
+        # Reads the file content
+        with open(file_path, "r", encoding="utf-8") as f:
+            content = f.read()
+
+        # Extracts the model name
+        match = model_pattern.search(content)
+        if match:
+            model_name = match.group(1)
+            print(f"Processing model: {model_name}")
+
+            try:
+                # Retrieves model information from Hugging Face
+                model_info = api.model_info(model_name)
+
+                # Calculates the number of parameters in billions, if available
+                num_params = None
+                if model_info.safetensors and "BF16" in model_info.safetensors.parameters:
+                    num_params = model_info.safetensors.parameters["BF16"] / 1e9  # Convert to billions
+
+                # Extracts and concatenates languages
+                language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else ""
+
+                print(model_info)
+
+                # Builds the dictionary with required metadata
+                model_data = {
+                    "model": model_name,
+                    "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
+                    "revision": model_info.sha,
+                    # "precision": "bfloat16",  # If available, replace with real value
+                    # "weight_type": "Original",
+                    # "status": "FINISHED",
+                    "submitted_time": str(model_info.created_at),
+                    # "model_type": "pretrained",
+                    # "likes": model_info.likes,
+                    # "params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
+                    # "license": model_info.license,
+                    # "private": model_info.private,
+                    "num_params_billion": num_params,  # Number of parameters in billions
+                    "language": language,  # Extracted language
+                }
+
+                # Separates the model_name into two parts: directory name and file name
+                if "/" in model_name:
+                    dir_name, file_name = model_name.split("/", 1)
+                else:
+                    dir_name, file_name = model_name, model_name  # If no "/", use the same name
+
+                # Creates the folder for saving the produced json files
+                model_output_folder = os.path.join(output_folder, dir_name)
+                os.makedirs(model_output_folder, exist_ok=True)
+
+                # Saves the JSON file in the appropriate folder
+                output_file = os.path.join(model_output_folder, f"{file_name}.json")
+                with open(output_file, "w", encoding="utf-8") as f:
+                    json.dump(model_data, f, indent=4)
+
+                print(f"Saved metadata for {model_name} in {output_file}")
+
+            except Exception as e:
+                print(f"Error retrieving info for {model_name}: {e}")
+
+print("Process finished!")
preprocess_models_output.py CHANGED
@@ -3,7 +3,7 @@ import os
 import re
 
 def safe_float(value):
-    """Convert a value to float safely. Returns None if conversion fails."""
+    """Safely converts a value to float, returning None if the conversion fails."""
     try:
         return float(value)
     except ValueError:
@@ -11,7 +11,7 @@ def safe_float(value):
 
 
 def calculate_task_metrics(task_info):
-    """Calculate average accuracy, best prompt, and CPS for a task."""
+    """Calculates average accuracy, best prompt accuracy, and CPS for a given task."""
     accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None]
 
     if not accuracies:
@@ -29,7 +29,7 @@ def calculate_task_metrics(task_info):
 
 
 def extract_data_from_file(file_path):
-    """Extract task and prompt data from the given file."""
+    """Extracts task and prompt data from a specified file."""
     with open(file_path, 'r') as file:
         lines = file.readlines()
 
@@ -39,27 +39,23 @@ def extract_data_from_file(file_path):
     for line in lines:
         line = line.strip()
 
-        # Skip irrelevant lines
+        # Skips empty lines
         if not line:
             continue
 
-
+        # Skips header lines
        if line.startswith("| Tasks"):
            continue
 
+        # Extracts model configuration details
        if line.startswith("hf (pretrained="):
-
-            # Estrai la parte dopo "pretrained="
            start = line.find("pretrained=") + len("pretrained=")
-            end = line.find(",", start)  # Trova la virgola successiva
-            # Estrai la stringa desiderata
+            end = line.find(",", start)
            pretrained_model = line[start:end]
 
-            # Estrarre num_fewshot
            num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
            num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None
 
-            # Estrarre batch_size
            batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
            batch_size = int(batch_size_match.group(1)) if batch_size_match else None
 
@@ -74,10 +70,11 @@ def extract_data_from_file(file_path):
        value = safe_float(columns[7])
        stderr = safe_float(columns[9])
 
+        # Skips normalized accuracy metrics
        if metric == "acc_norm":
            continue
 
-        # Identify task and prompts
+        # Identifies task and prompt sections in the file
        if task_name.startswith(" - "):
            task_name = task_name[3:].strip()
            current_task = task_name
@@ -91,7 +88,7 @@ def extract_data_from_file(file_path):
                           'stderr': stderr}
            tasks_data[current_task]['prompts'].append(prompt_data)
 
-    # Special handling for evalita NER
+    # Special handling for evalita NER task to calculate weighted prompt averages
    if "evalita NER" in tasks_data:
        task_info = tasks_data["evalita NER"]
        weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
@@ -113,11 +110,11 @@ def extract_data_from_file(file_path):
            {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights,
             'stderr': None}]
 
-    # Calculate metrics for each task
+    # Calculates task metrics for each task
    for task_info in tasks_data.values():
        calculate_task_metrics(task_info)
 
-    # Calculate average CPS
+    # Calculates the average CPS across all tasks
    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0
 
@@ -129,73 +126,36 @@ def extract_data_from_file(file_path):
 
    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
 
-
-# Example usage
-#file_path = '../evalita_llm_results/models_output/slurm-7769.out'
-#json_output = extract_data_from_file(file_path)
-#print(json_output)
-
-
-# Directory da cui leggere i file .out
+# Main script: processes .out files, extracts data, and saves JSON results.
+# Reads .out files from directory_in_path, parses data including model config and task metrics,
+# and saves results as JSON files in directory_out_results_path, merging config from directory_out_requests_path if available.
 directory_in_path = '../evalita_llm_models_output/'
+directory_in_requests_path = '../evalita_llm_requests/'
 directory_out_results_path = '../evalita_llm_results/'
-directory_out_requests_path = '../evalita_llm_requests/'
 
-# Itera sui file nella directory
 for filename in os.listdir(directory_in_path):
    if filename.endswith('.out'):
-        # Costruisci il percorso completo del file
        file_path = os.path.join(directory_in_path, filename)
-
-        # Esegui la funzione extract_data_from_file
        json_output = extract_data_from_file(file_path)
 
-        # Estrai model_org_name e model_name da model_name
        model_org_name, model_name = json_output['config']['model_name'].split('/')
 
-        # Percorso del file JSON di configurazione in ../evalita_llm_requests2/
-        config_file_path = os.path.join(directory_out_requests_path, model_org_name, f"{model_name}.json")
-
-        # Se il file esiste, caricalo e aggiorna il dizionario config
+        config_file_path = os.path.join(directory_in_requests_path, model_org_name, f"{model_name}.json")
+
        if os.path.exists(config_file_path):
            with open(config_file_path, 'r', encoding='utf-8') as config_file:
                additional_config = json.load(config_file)
-
-            # Aggiorna la configurazione con i nuovi dati
            json_output['config'].update(additional_config)
 
-        # Crea il percorso della cartella per model_org_name
        org_folder_path = os.path.join(directory_out_results_path, model_org_name)
-        os.makedirs(org_folder_path, exist_ok=True)  # Crea la cartella se non esiste
+        os.makedirs(org_folder_path, exist_ok=True)
 
-        # Crea il percorso completo del file JSON
        file_suffix = f"{json_output['config']['num_fewshot']}"
        output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")
 
-        # Salva il JSON in un file con ritorni a capo compatibili con Linux
        with open(output_file_path, 'w', newline="\n") as outfile:
            json.dump(json_output, outfile, indent=4)
 
-        # Stampa il risultato
-        print(f"File {filename} elaborato e salvato in {output_file_path}")
+        print(f"File {filename} processed and saved to {output_file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
preprocess_models_output_old.py ADDED
@@ -0,0 +1,201 @@
+import json
+import os
+import re
+
+def safe_float(value):
+    """Convert a value to float safely. Returns None if conversion fails."""
+    try:
+        return float(value)
+    except ValueError:
+        return None
+
+
+def calculate_task_metrics(task_info):
+    """Calculate average accuracy, best prompt, and CPS for a task."""
+    accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None]
+
+    if not accuracies:
+        return None
+
+    task_info['average_accuracy'] = sum(accuracies) / len(accuracies)
+    best_prompt_data = max(task_info['prompts'], key=lambda x: x['value'])
+    task_info['best_prompt'] = best_prompt_data['value']
+    task_info['prompt_id'] = best_prompt_data['prompt']
+
+    # Calculate CPS
+    avg_acc = task_info['average_accuracy']
+    best_acc = task_info['best_prompt']
+    task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc
+
+
+def extract_data_from_file(file_path):
+    """Extract task and prompt data from the given file."""
+    with open(file_path, 'r') as file:
+        lines = file.readlines()
+
+    tasks_data = {}
+    current_task = None
+
+    for line in lines:
+        line = line.strip()
+
+        # Skip irrelevant lines
+        if not line:
+            continue
+
+        if line.startswith("| Tasks"):
+            continue
+
+        if line.startswith("hf (pretrained="):
+            # Estrai la parte dopo "pretrained="
+            start = line.find("pretrained=") + len("pretrained=")
+            end = line.find(",", start)  # Trova la virgola successiva
+            # Estrai la stringa desiderata
+            pretrained_model = line[start:end]
+
+            # Estrarre num_fewshot
+            num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
+            num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None
+
+            # Estrarre batch_size
+            batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
+            batch_size = int(batch_size_match.group(1)) if batch_size_match else None
+
+            continue
+
+        columns = line.split('|')
+        if len(columns) != 11:
+            continue
+
+        task_name = columns[1]
+        metric = columns[5].strip()
+        value = safe_float(columns[7])
+        stderr = safe_float(columns[9])
+
+        if metric == "acc_norm":
+            continue
+
+        # Identify task and prompts
+        if task_name.startswith(" - "):
+            task_name = task_name[3:].strip()
+            current_task = task_name
+            tasks_data.setdefault(current_task,
+                                  {'prompts': [], 'average_accuracy': 0, 'best_prompt': None, 'prompt_id': None,
+                                   'CPS': None})
+
+        elif task_name.startswith("  - ") and current_task:
+            prompt_name = task_name[4:].strip()
+            prompt_data = {'prompt': prompt_name, 'metric': metric, 'value': value * 100,
+                           'stderr': stderr}
+            tasks_data[current_task]['prompts'].append(prompt_data)
+
+    # Special handling for evalita NER
+    if "evalita NER" in tasks_data:
+        task_info = tasks_data["evalita NER"]
+        weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
+                      "WN prompt-1": 2088, "WN prompt-2": 2088}
+
+        weighted_values = {"prompt-1": 0, "prompt-2": 0}
+        total_weights = sum(weight_map.values())
+
+        for prompt in task_info['prompts']:
+            if prompt['prompt'] in weight_map:
+                if "prompt-1" in prompt['prompt']:
+                    weighted_values["prompt-1"] += weight_map[prompt['prompt']] * prompt['value']
+                elif "prompt-2" in prompt['prompt']:
+                    weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value']
+
+        task_info['prompts'] = [
+            {"prompt": "prompt-1", "metric": "acc", "value": weighted_values["prompt-1"] / total_weights,
+             'stderr': None},
+            {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights,
+             'stderr': None}]
+
+    # Calculate metrics for each task
+    for task_info in tasks_data.values():
+        calculate_task_metrics(task_info)
+
+    # Calculate average CPS
+    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
+    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0
+
+    config = {
+        "model_name": pretrained_model,
+        "num_fewshot": num_fewshot,
+        "batch_size": batch_size
+    }
+
+    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
+
+
+# Example usage
+#file_path = '../evalita_llm_results/models_output/slurm-7769.out'
+#json_output = extract_data_from_file(file_path)
+#print(json_output)
+
+
+# Directory da cui leggere i file .out
+directory_in_path = '../evalita_llm_models_output/'
+directory_out_results_path = '../evalita_llm_results/'
+directory_out_requests_path = '../evalita_llm_requests/'
+
+# Itera sui file nella directory
+for filename in os.listdir(directory_in_path):
+    if filename.endswith('.out'):
+        # Costruisci il percorso completo del file
+        file_path = os.path.join(directory_in_path, filename)
+
+        # Esegui la funzione extract_data_from_file
+        json_output = extract_data_from_file(file_path)
+
+        # Estrai model_org_name e model_name da model_name
+        model_org_name, model_name = json_output['config']['model_name'].split('/')
+
+        # Percorso del file JSON di configurazione in ../evalita_llm_requests2/
+        config_file_path = os.path.join(directory_out_requests_path, model_org_name, f"{model_name}.json")
+
+        # Se il file esiste, caricalo e aggiorna il dizionario config
+        if os.path.exists(config_file_path):
+            with open(config_file_path, 'r', encoding='utf-8') as config_file:
+                additional_config = json.load(config_file)
+
+            # Aggiorna la configurazione con i nuovi dati
+            json_output['config'].update(additional_config)
+
+        # Crea il percorso della cartella per model_org_name
+        org_folder_path = os.path.join(directory_out_results_path, model_org_name)
+        os.makedirs(org_folder_path, exist_ok=True)  # Crea la cartella se non esiste
+
+        # Crea il percorso completo del file JSON
+        file_suffix = f"{json_output['config']['num_fewshot']}"
+        output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")
+
+        # Salva il JSON in un file con ritorni a capo compatibili con Linux
+        with open(output_file_path, 'w', newline="\n") as outfile:
+            json.dump(json_output, outfile, indent=4)
+
+        # Stampa il risultato
+        print(f"File {filename} elaborato e salvato in {output_file_path}")
run_instructions.txt ADDED
@@ -0,0 +1,42 @@
+Model Evaluation and Leaderboard
+
+1) Model Evaluation
+Before integrating a model into the leaderboard, it must first be evaluated using the lm-eval-harness library in both zero-shot and 5-shot configurations.
+
+This can be done with the following command:
+
+lm_eval --model hf --model_args pretrained=google/gemma-3-12b-it \
+  --tasks evalita-mp --device cuda:0 --batch_size 1 --trust_remote_code \
+  --output_path model_output --num_fewshot 5
+
+The output generated by the library will include the model's accuracy scores on the benchmark tasks.
+This output is written to standard output and should be saved in a .txt file (e.g., slurm-8368.out), which then needs to be placed in the
+evalita_llm_models_output directory for further processing.
+
+2) Extracting Model Metadata
+To display model details on the leaderboard (e.g., organization/group, model name, and parameter count), metadata must be retrieved from Hugging Face.
+
+This can be done by running:
+
+python get_model_info.py
+
+This script processes the evaluation files from Step 1 and saves each model's metadata in a JSON file within the evalita_llm_requests directory.
+
+3) Generating the Leaderboard Submission File
+The leaderboard requires a structured file containing each model's metadata along with its benchmark accuracy scores.
+
+To generate this file, run:
+
+python preprocess_models_output.py
+
+This script combines the accuracy results from Step 1 with the metadata from Step 2 and outputs a JSON file in the evalita_llm_results directory.
+
+4) Updating the Hugging Face Repository
+The evalita_llm_results repository on Hugging Face must be updated with the newly generated files from Step 3.
+
+5) Running the Leaderboard Application
+Finally, execute the leaderboard application by running:
+
+python app.py
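To make the hand-off between Step 1 and Step 3 concrete, here is a minimal sketch of the parsing contract that extract_data_from_file in preprocess_models_output.py relies on; the .out fragment and its values are invented for illustration:

```python
# A hypothetical .out fragment: a "hf (pretrained=..." header plus result rows
# that split on '|' into 11 fields (cols[1]=task, cols[5]=metric, cols[7]=value, cols[9]=stderr).
sample = """hf (pretrained=google/gemma-3-12b-it), ... (num_fewshot: 5, batch_size: 1)
| Tasks |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
| - sentiment_analysis|1|none|5|acc|   |0.7100|   |0.0123|"""

for line in sample.splitlines():
    cols = line.split('|')
    if len(cols) == 11 and cols[5].strip() == "acc":
        print(cols[1].strip(), float(cols[7]))  # "- sentiment_analysis 0.71"
```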
src/about.py CHANGED
@@ -8,7 +8,6 @@ class Task:
     metric_type: str
     col_name: str
 
-
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
@@ -122,9 +121,10 @@ The following Evalita-LLM tasks can also be evaluated in isolation:
 
 ```bash
 
-lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks evalita-mp --device cuda:0 --batch_size auto
+lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks evalita-mp --device cuda:0 --batch_size 1
 ```
 
+<!--
 ### Checklist
 
 * [x] Is the task an existing benchmark in the literature?
@@ -136,6 +136,8 @@ If other tasks on this dataset are already supported:
 * [x] Is the "Main" variant of this task clearly denoted?
 * [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
 * [x] Have you noted which, if any, published evaluation setups are matched by this variant?
+-->
+
 
 """
src/display/utils.py CHANGED
@@ -30,9 +30,6 @@ auto_eval_column_dict.append(["fewshot_type", ColumnContent, ColumnContent("FS",
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #auto_eval_column_dict.append(["fewshot", ColumnContent, ColumnContent("Few-Shot", "str", True)])
 
-
-
-
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg. Combined Performance ⬆️", "number", True)])
 for task in Tasks:
src/envs.py CHANGED
@@ -15,7 +15,7 @@ OWNER = "evalitahf"
 #RESULTS_REPO = f"{OWNER}/evalita-results"
 
 REPO_ID = f"{OWNER}/evalita_llm_leaderboard"
-QUEUE_REPO = f"{OWNER}/evalita_llm_requests"
+#QUEUE_REPO = f"{OWNER}/evalita_llm_requests"
 RESULTS_REPO = f"{OWNER}/evalita_llm_results"
 
 # If you setup a cache later, just change HF_HOME
src/tasks.py CHANGED
@@ -56,7 +56,7 @@ SA_DESCRIPTION = """### Sentiment Analysis (SA)
 """
 
 HS_DESCRIPTION = """### Hate Speech (HS)
-The input is a tweet. The model has to determine whether the text contains hateful content directed at specific target groups: immigrants, Muslims, or Roma. The output is a binary classification: hateful or not hateful.
+The input is a tweet. The model has to determine whether the text contains hateful content directed towards marginalized or minority groups. The output is a binary classification: hateful or not hateful.
 
 | # | Prompt | Answer Choices |
 |-----|--------------------------------------------------------------------------------|-------------------------------------------------|
@@ -104,7 +104,7 @@ WIC_DESCRIPTION = """### Word in Context (WIC)
 """
 
 FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ)
-The input is a user query made by customers to the Acquedotto Pugliese service. The model must determine which of the 4 possible answers is the correct response to the question.
+The input is a user query regarding the water supply service. The model must identify the correct answer from the 4 available options.
 
 | # | Prompt | Answer Choices |
 |-----|--------------------------------------------------------------------------------|-----------------------------|