Small changes

- app.py +13 -12
- get_model_info.py +77 -72
- preprocess_models_output.py +19 -59
- preprocess_models_output_old.py +201 -0
- run_instructions.txt +42 -0
- src/about.py +4 -2
- src/display/utils.py +0 -3
- src/envs.py +1 -1
- src/tasks.py +2 -2
app.py CHANGED

@@ -14,16 +14,16 @@ from src.submission.submit import add_new_eval
 
 # Define task metadata (icons, names, descriptions)
 TASK_METADATA = {
-    "TE": {"icon": "π", "name": "Textual Entailment", "tooltip": "
-    "SA": {"icon": "π", "name": "Sentiment Analysis", "tooltip": "
-    "HS": {"icon": "β οΈ", "name": "Hate Speech", "tooltip": "
-    "AT": {"icon": "π₯", "name": "Admission Test", "tooltip": "
-    "WIC": {"icon": "π€", "name": "Word in Context", "tooltip": "
-    "FAQ": {"icon": "β", "name": "Frequently Asked Questions", "tooltip": "
-    "LS": {"icon": "π", "name": "Lexical Substitution", "tooltip": "
-    "SU": {"icon": "π", "name": "Summarization", "tooltip": "
-    "NER": {"icon": "π·οΈ", "name": "Named Entity Recognition", "tooltip": "
-    "REL": {"icon": "π", "name": "Relation Extraction", "tooltip": "
+    "TE": {"icon": "π", "name": "Textual Entailment", "tooltip": ""},
+    "SA": {"icon": "π", "name": "Sentiment Analysis", "tooltip": ""},
+    "HS": {"icon": "β οΈ", "name": "Hate Speech", "tooltip": ""},
+    "AT": {"icon": "π₯", "name": "Admission Test", "tooltip": ""},
+    "WIC": {"icon": "π€", "name": "Word in Context", "tooltip": ""},
+    "FAQ": {"icon": "β", "name": "Frequently Asked Questions", "tooltip": ""},
+    "LS": {"icon": "π", "name": "Lexical Substitution", "tooltip": ""},
+    "SU": {"icon": "π", "name": "Summarization", "tooltip": ""},
+    "NER": {"icon": "π·οΈ", "name": "Named Entity Recognition", "tooltip": ""},
+    "REL": {"icon": "π", "name": "Relation Extraction", "tooltip": ""},
 }
 
 def restart_space():
@@ -47,8 +47,8 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="Few-Shot Learning (FS)"),
-            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0
+            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
+            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -82,6 +82,7 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Main leaderboard tab
        with gr.TabItem("π EVALITA-LLM Benchmark"):
+
            leaderboard = init_leaderboard(
                LEADERBOARD_DF,
                default_selection=['FS', 'Model', "Avg. Combined Performance β¬οΈ", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
get_model_info.py CHANGED

@@ -1,87 +1,92 @@
+# Reads model output files (including accuracy values) produced by lm-eval-harness,
+# extracts model names, downloads their characteristics from HuggingFace, and saves metadata
+# (such as parameter count and pre-training status) to model-specific JSON files.
 import os
 import re
 import json
 from huggingface_hub import HfApi
 
-#
-#TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
+# Configures the Hugging Face token (if needed)
+# TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
 api = HfApi()
 
-#
+# Directory paths
+# input_folder: Directory containing the output files of the lm-eval-harness library, including model accuracy metrics.
 input_folder = "../evalita_llm_models_output/"
+# output_folder: Directory where JSON files with model characteristics will be saved.
 output_folder = "../evalita_llm_requests/"
 
-#
+# Creates the output folder if it doesn't exist
 os.makedirs(output_folder, exist_ok=True)
 
-#
+# Regular expression to find the model name
 model_pattern = re.compile(r"pretrained=([\w\-./]+)")
 
-#
+# Scans files in the input folder
 for filename in os.listdir(input_folder):
-    … (66 lines of the previous loop body, truncated in the diff view)
+    if filename.endswith('.out'):
+        file_path = os.path.join(input_folder, filename)
+
+        # Reads the file content
+        with open(file_path, "r", encoding="utf-8") as f:
+            content = f.read()
+
+        # Extracts the model name
+        match = model_pattern.search(content)
+        if match:
+            model_name = match.group(1)
+            print(f"Processing model: {model_name}")
+
+            try:
+                # Retrieves model information from Hugging Face
+                model_info = api.model_info(model_name)
+
+                # Calculates the number of parameters in billions, if available
+                num_params = None
+                if model_info.safetensors and "BF16" in model_info.safetensors.parameters:
+                    num_params = model_info.safetensors.parameters["BF16"] / 1e9  # Convert to billions
+
+                # Extracts and concatenates languages
+                language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else ""
+
+                print(model_info)
+
+                # Builds the dictionary with required metadata
+                model_data = {
+                    "model": model_name,
+                    "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
+                    "revision": model_info.sha,
+                    # "precision": "bfloat16",  # If available, replace with real value
+                    # "weight_type": "Original",
+                    # "status": "FINISHED",
+                    "submitted_time": str(model_info.created_at),
+                    # "model_type": "pretrained",
+                    # "likes": model_info.likes,
+                    # "params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
+                    # "license": model_info.license,
+                    # "private": model_info.private,
+                    "num_params_billion": num_params,  # Number of parameters in billions
+                    "language": language,  # Extracted language
+                }
+
+                # Separates the model_name into two parts: directory name and file name
+                if "/" in model_name:
+                    dir_name, file_name = model_name.split("/", 1)
+                else:
+                    dir_name, file_name = model_name, model_name  # If no "/", use the same name
+
+                # Creates the folder for saving the produced json files
+                model_output_folder = os.path.join(output_folder, dir_name)
+                os.makedirs(model_output_folder, exist_ok=True)
+
+                # Saves the JSON file in the appropriate folder
+                output_file = os.path.join(model_output_folder, f"{file_name}.json")
+                with open(output_file, "w", encoding="utf-8") as f:
+                    json.dump(model_data, f, indent=4)
+
+                print(f"Saved metadata for {model_name} in {output_file}")
+
+            except Exception as e:
+                print(f"Error retrieving info for {model_name}: {e}")
+
+print("Process finished!")
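For reference, a metadata file written by this script should look roughly like the sketch below; the keys mirror the model_data dictionary in the diff, while the model name and all values are invented for illustration:

{
    "model": "example-org/example-model",
    "base_model": "LlamaForCausalLM",
    "revision": "0123456789abcdef0123456789abcdef01234567",
    "submitted_time": "2024-01-01 00:00:00+00:00",
    "num_params_billion": 7.24,
    "language": "it_en"
}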
preprocess_models_output.py CHANGED

@@ -3,7 +3,7 @@ import os
 import re
 
 def safe_float(value):
-    """Convert a value to float safely. Returns None if conversion fails."""
+    """Safely converts a value to float, returning None if the conversion fails."""
     try:
         return float(value)
     except ValueError:
@@ -11,7 +11,7 @@ def safe_float(value):
 
 
 def calculate_task_metrics(task_info):
-    """Calculate average accuracy, best prompt, and CPS for a task."""
+    """Calculates average accuracy, best prompt accuracy, and CPS for a given task."""
     accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None]
 
     if not accuracies:
@@ -29,7 +29,7 @@ def calculate_task_metrics(task_info):
 
 
 def extract_data_from_file(file_path):
-    """Extract task and prompt data from the given file."""
+    """Extracts task and prompt data from a specified file."""
     with open(file_path, 'r') as file:
         lines = file.readlines()
 
@@ -39,27 +39,23 @@ def extract_data_from_file(file_path):
     for line in lines:
         line = line.strip()
 
-        # Skip irrelevant lines
+        # Skips empty lines
         if not line:
            continue
 
-
+        # Skips header lines
        if line.startswith("| Tasks"):
            continue
 
+        # Extracts model configuration details
        if line.startswith("hf (pretrained="):
-
-            # Extract the part after "pretrained="
            start = line.find("pretrained=") + len("pretrained=")
-            end = line.find(",", start)  # Find the next comma
-            # Extract the desired string
+            end = line.find(",", start)
            pretrained_model = line[start:end]
 
-            # Extract num_fewshot
            num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
            num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None
 
-            # Extract batch_size
            batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
            batch_size = int(batch_size_match.group(1)) if batch_size_match else None
 
@@ -74,10 +70,11 @@ def extract_data_from_file(file_path):
        value = safe_float(columns[7])
        stderr = safe_float(columns[9])
 
+        # Skips normalized accuracy metrics
        if metric == "acc_norm":
            continue
 
-        # Identify task and prompts
+        # Identifies task and prompt sections in the file
        if task_name.startswith(" - "):
            task_name = task_name[3:].strip()
            current_task = task_name
@@ -91,7 +88,7 @@ def extract_data_from_file(file_path):
                           'stderr': stderr}
            tasks_data[current_task]['prompts'].append(prompt_data)
 
-    # Special handling for evalita NER
+    # Special handling for evalita NER task to calculate weighted prompt averages
    if "evalita NER" in tasks_data:
        task_info = tasks_data["evalita NER"]
        weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
@@ -113,11 +110,11 @@ def extract_data_from_file(file_path):
            {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights,
             'stderr': None}]
 
-    # Calculate metrics for each task
+    # Calculates task metrics for each task
    for task_info in tasks_data.values():
        calculate_task_metrics(task_info)
 
-    # Calculate average CPS
+    # Calculates the average CPS across all tasks
    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0
 
@@ -129,73 +126,36 @@ def extract_data_from_file(file_path):
 
    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
 
-
-# Example usage
-#file_path = '../evalita_llm_results/models_output/slurm-7769.out'
-#json_output = extract_data_from_file(file_path)
-#print(json_output)
-
-
-# Directory from which to read the .out files
+# Main script: processes .out files, extracts data, and saves JSON results.
+# Reads .out files from directory_in_path, parses data including model config and task metrics,
+# and saves results as JSON files in directory_out_results_path, merging config from directory_out_requests_path if available.
 directory_in_path = '../evalita_llm_models_output/'
+directory_in_requests_path = '../evalita_llm_requests/'
 directory_out_results_path = '../evalita_llm_results/'
-directory_out_requests_path = '../evalita_llm_requests/'
 
-# Iterate over the files in the directory
 for filename in os.listdir(directory_in_path):
    if filename.endswith('.out'):
-        # Build the full path of the file
        file_path = os.path.join(directory_in_path, filename)
-
-        # Run the extract_data_from_file function
        json_output = extract_data_from_file(file_path)
 
-        # Extract model_org_name and model_name from model_name
        model_org_name, model_name = json_output['config']['model_name'].split('/')
 
 
-
-
-
-        # Path of the configuration JSON file in ../evalita_llm_requests2/
-        config_file_path = os.path.join(directory_out_requests_path, model_org_name, f"{model_name}.json")
+        config_file_path = os.path.join(directory_in_requests_path, model_org_name, f"{model_name}.json")
 
-        # If the file exists, load it and update the config dictionary
        if os.path.exists(config_file_path):
            with open(config_file_path, 'r', encoding='utf-8') as config_file:
                additional_config = json.load(config_file)
-
-                # Update the configuration with the new data
                json_output['config'].update(additional_config)
 
 
-
-
-        # Build the folder path for model_org_name
        org_folder_path = os.path.join(directory_out_results_path, model_org_name)
-        os.makedirs(org_folder_path, exist_ok=True)  # Create the folder if it does not exist
+        os.makedirs(org_folder_path, exist_ok=True)
 
-        # Build the full path of the JSON file
        file_suffix = f"{json_output['config']['num_fewshot']}"
        output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")
 
-        # Save the JSON to a file with Linux-compatible line endings
        with open(output_file_path, 'w', newline="\n") as outfile:
            json.dump(json_output, outfile, indent=4)
 
-
-        print(f"File {filename} elaborato e salvato in {output_file_path}")
+        print(f"File {filename} processed and saved to {output_file_path}")
-
-
-
-
-
-
-
-
-
-
-
-
-
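The JSON written to evalita_llm_results combines the dictionary returned by extract_data_from_file with the metadata merged in from evalita_llm_requests. A rough sketch of its shape, with invented numbers and only one task shown:

{
    "average_CPS": 58.3,
    "config": {
        "model_name": "example-org/example-model",
        "num_fewshot": "5",
        "batch_size": 1,
        "num_params_billion": 7.24,
        "language": "it_en"
    },
    "tasks": {
        "evalita NER": {
            "prompts": [
                {"prompt": "prompt-1", "metric": "acc", "value": 61.2, "stderr": null},
                {"prompt": "prompt-2", "metric": "acc", "value": 59.6, "stderr": null}
            ],
            "average_accuracy": 60.4,
            "best_prompt": 61.2,
            "prompt_id": "prompt-1",
            "CPS": 60.71
        }
    }
}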
preprocess_models_output_old.py ADDED

@@ -0,0 +1,201 @@
+import json
+import os
+import re
+
+def safe_float(value):
+    """Convert a value to float safely. Returns None if conversion fails."""
+    try:
+        return float(value)
+    except ValueError:
+        return None
+
+
+def calculate_task_metrics(task_info):
+    """Calculate average accuracy, best prompt, and CPS for a task."""
+    accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None]
+
+    if not accuracies:
+        return None
+
+    task_info['average_accuracy'] = sum(accuracies) / len(accuracies)
+    best_prompt_data = max(task_info['prompts'], key=lambda x: x['value'])
+    task_info['best_prompt'] = best_prompt_data['value']
+    task_info['prompt_id'] = best_prompt_data['prompt']
+
+    # Calculate CPS
+    avg_acc = task_info['average_accuracy']
+    best_acc = task_info['best_prompt']
+    task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc
+
+
+def extract_data_from_file(file_path):
+    """Extract task and prompt data from the given file."""
+    with open(file_path, 'r') as file:
+        lines = file.readlines()
+
+    tasks_data = {}
+    current_task = None
+
+    for line in lines:
+        line = line.strip()
+
+        # Skip irrelevant lines
+        if not line:
+            continue
+
+
+        if line.startswith("| Tasks"):
+            continue
+
+        if line.startswith("hf (pretrained="):
+
+            # Extract the part after "pretrained="
+            start = line.find("pretrained=") + len("pretrained=")
+            end = line.find(",", start)  # Find the next comma
+            # Extract the desired string
+            pretrained_model = line[start:end]
+
+            # Extract num_fewshot
+            num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
+            num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None
+
+            # Extract batch_size
+            batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
+            batch_size = int(batch_size_match.group(1)) if batch_size_match else None
+
+            continue
+
+        columns = line.split('|')
+        if len(columns) != 11:
+            continue
+
+        task_name = columns[1]
+        metric = columns[5].strip()
+        value = safe_float(columns[7])
+        stderr = safe_float(columns[9])
+
+        if metric == "acc_norm":
+            continue
+
+        # Identify task and prompts
+        if task_name.startswith(" - "):
+            task_name = task_name[3:].strip()
+            current_task = task_name
+            tasks_data.setdefault(current_task,
+                                  {'prompts': [], 'average_accuracy': 0, 'best_prompt': None, 'prompt_id': None,
+                                   'CPS': None})
+
+        elif task_name.startswith(" - ") and current_task:
+            prompt_name = task_name[4:].strip()
+            prompt_data = {'prompt': prompt_name, 'metric': metric, 'value': value * 100,
+                           'stderr': stderr}
+            tasks_data[current_task]['prompts'].append(prompt_data)
+
+    # Special handling for evalita NER
+    if "evalita NER" in tasks_data:
+        task_info = tasks_data["evalita NER"]
+        weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
+                      "WN prompt-1": 2088, "WN prompt-2": 2088}
+
+        weighted_values = {"prompt-1": 0, "prompt-2": 0}
+        total_weights = sum(weight_map.values())
+
+        for prompt in task_info['prompts']:
+            if prompt['prompt'] in weight_map:
+                if "prompt-1" in prompt['prompt']:
+                    weighted_values["prompt-1"] += weight_map[prompt['prompt']] * prompt['value']
+                elif "prompt-2" in prompt['prompt']:
+                    weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value']
+
+        task_info['prompts'] = [
+            {"prompt": "prompt-1", "metric": "acc", "value": weighted_values["prompt-1"] / total_weights,
+             'stderr': None},
+            {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights,
+             'stderr': None}]
+
+    # Calculate metrics for each task
+    for task_info in tasks_data.values():
+        calculate_task_metrics(task_info)
+
+    # Calculate average CPS
+    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
+    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0
+
+    config = {
+        "model_name": pretrained_model,
+        "num_fewshot": num_fewshot,
+        "batch_size": batch_size
+    }
+
+    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
+
+
+# Example usage
+#file_path = '../evalita_llm_results/models_output/slurm-7769.out'
+#json_output = extract_data_from_file(file_path)
+#print(json_output)
+
+
+# Directory from which to read the .out files
+directory_in_path = '../evalita_llm_models_output/'
+directory_out_results_path = '../evalita_llm_results/'
+directory_out_requests_path = '../evalita_llm_requests/'
+
+# Iterate over the files in the directory
+for filename in os.listdir(directory_in_path):
+    if filename.endswith('.out'):
+        # Build the full path of the file
+        file_path = os.path.join(directory_in_path, filename)
+
+        # Run the extract_data_from_file function
+        json_output = extract_data_from_file(file_path)
+
+        # Extract model_org_name and model_name from model_name
+        model_org_name, model_name = json_output['config']['model_name'].split('/')
+
+
+
+
+
+
+        # Path of the configuration JSON file in ../evalita_llm_requests2/
+        config_file_path = os.path.join(directory_out_requests_path, model_org_name, f"{model_name}.json")
+
+        # If the file exists, load it and update the config dictionary
+        if os.path.exists(config_file_path):
+            with open(config_file_path, 'r', encoding='utf-8') as config_file:
+                additional_config = json.load(config_file)
+
+                # Update the configuration with the new data
+                json_output['config'].update(additional_config)
+
+
+
+
+        # Build the folder path for model_org_name
+        org_folder_path = os.path.join(directory_out_results_path, model_org_name)
+        os.makedirs(org_folder_path, exist_ok=True)  # Create the folder if it does not exist
+
+        # Build the full path of the JSON file
+        file_suffix = f"{json_output['config']['num_fewshot']}"
+        output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")
+
+        # Save the JSON to a file with Linux-compatible line endings
+        with open(output_file_path, 'w', newline="\n") as outfile:
+            json.dump(json_output, outfile, indent=4)
+
+        # Print the result
+        print(f"File {filename} elaborato e salvato in {output_file_path}")
+
+
+
+
+
+
+
+
+
+
+
+
+
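The CPS value assigned in calculate_task_metrics rewards a high best-prompt accuracy while penalizing the gap between the best prompt and the average over prompts. A minimal Python sketch of that computation, with invented accuracies:

# Minimal sketch of the CPS formula used in calculate_task_metrics (illustrative numbers only).
average_accuracy = 62.0  # mean accuracy over the task's prompts, in percent
best_prompt = 68.0       # accuracy of the best prompt, in percent

cps = (1 - (best_prompt - average_accuracy) / 100) * best_prompt
print(cps)  # 63.92: the best-prompt score, discounted by its 6-point gap from the average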
run_instructions.txt ADDED

@@ -0,0 +1,42 @@
+Model Evaluation and Leaderboard
+
+1) Model Evaluation
+Before integrating a model into the leaderboard, it must first be evaluated with the lm-eval-harness library in both zero-shot and 5-shot configurations.
+
+This can be done with the following command:
+
+lm_eval --model hf --model_args pretrained=google/gemma-3-12b-it \
+  --tasks evalita-mp --device cuda:0 --batch_size 1 --trust_remote_code \
+  --output_path model_output --num_fewshot 5 --
+
+The output generated by the library includes the model's accuracy scores on the benchmark tasks.
+This output is written to standard output and should be saved to a txt file (e.g., slurm-8368.out), which must then be placed in the
+evalita_llm_models_output directory for further processing.
+
+2) Extracting Model Metadata
+To display model details on the leaderboard (e.g., organization/group, model name, and parameter count), metadata must be retrieved from Hugging Face.
+
+This can be done by running:
+
+python get_model_info.py
+
+This script processes the evaluation files from Step 1 and saves each model's metadata in a JSON file within the evalita_llm_requests directory.
+
+3) Generating the Leaderboard Submission File
+The leaderboard requires a structured file containing each model's metadata along with its benchmark accuracy scores.
+
+To generate this file, run:
+
+python preprocess_models_output.py
+
+This script combines the accuracy results from Step 1 with the metadata from Step 2 and outputs a JSON file in the evalita_llm_results directory.
+
+4) Updating the Hugging Face Repository
+The evalita_llm_results repository on HuggingFace must be updated with the newly generated files from Step 3.
+
+5) Running the Leaderboard Application
+Finally, launch the leaderboard application by running:
+
+python app.py
+
+
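Taken together, the steps above move data through three sibling directories; the tree below is only an illustrative summary of the paths used by the scripts in this commit:

evalita_llm_models_output/                          # step 1: lm-eval-harness .out logs
evalita_llm_requests/<org>/<model>.json             # step 2: metadata written by get_model_info.py
evalita_llm_results/<org>/<model>_<fewshot>.json    # step 3: merged output of preprocess_models_output.py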
src/about.py CHANGED

@@ -8,7 +8,6 @@ class Task:
     metric_type: str
     col_name: str
 
-
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
@@ -122,9 +121,10 @@ The following Evalita-LLM tasks can also be evaluated in isolation:
 
 ```bash
 
-lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks evalita-mp --device cuda:0 --batch_size
+lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks evalita-mp --device cuda:0 --batch_size 1
 ```
 
+<!--
 ### Checklist
 
 * [x] Is the task an existing benchmark in the literature?
@@ -136,6 +136,8 @@ If other tasks on this dataset are already supported:
 * [x] Is the "Main" variant of this task clearly denoted?
 * [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
 * [x] Have you noted which, if any, published evaluation setups are matched by this variant?
+-->
+
 
 """
src/display/utils.py CHANGED

@@ -30,9 +30,6 @@ auto_eval_column_dict.append(["fewshot_type", ColumnContent, ColumnContent("FS",
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #auto_eval_column_dict.append(["fewshot", ColumnContent, ColumnContent("Few-Shot", "str", True)])
 
-
-
-
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg. Combined Performance β¬οΈ", "number", True)])
 for task in Tasks:
src/envs.py CHANGED

@@ -15,7 +15,7 @@ OWNER = "evalitahf"
 #RESULTS_REPO = f"{OWNER}/evalita-results"
 
 REPO_ID = f"{OWNER}/evalita_llm_leaderboard"
-QUEUE_REPO = f"{OWNER}/evalita_llm_requests"
+#QUEUE_REPO = f"{OWNER}/evalita_llm_requests"
 RESULTS_REPO = f"{OWNER}/evalita_llm_results"
 
 # If you setup a cache later, just change HF_HOME
src/tasks.py CHANGED

@@ -56,7 +56,7 @@ SA_DESCRIPTION = """### Sentiment Analysis (SA)
 """
 
 HS_DESCRIPTION = """### Hate Speech (HS)
-The input is a tweet. The model has to determine whether the text contains hateful content directed
+The input is a tweet. The model has to determine whether the text contains hateful content directed towards marginalized or minority groups. The output is a binary classification: hateful or not hateful.
 
 | # | Prompt | Answer Choices |
 |-----|--------------------------------------------------------------------------------|-------------------------------------------------|
@@ -104,7 +104,7 @@ WIC_DESCRIPTION = """### Word in Context (WIC)
 """
 
 FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ)
-The input is a user query
+The input is a user query regarding the water supply service. The model must identify the correct answer from the 4 available options.
 
 | # | Prompt | Answer Choices |
 |-----|--------------------------------------------------------------------------------|-----------------------------|