Add new scripts for model processing and task management
- app2.py +0 -0
- get_model_info.py +74 -0
- preprocess_models_output.py +201 -0
- src/tasks.py +37 -0
app2.py
ADDED
File without changes
get_model_info.py
ADDED
@@ -0,0 +1,74 @@
import os
import re
import json
from huggingface_hub import HfApi

# Configure the Hugging Face token (if needed)
#TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
api = HfApi()

# Folder paths
input_folder = "../evalita_llm_results/models_output/"
output_folder = "../evalita_llm_requests2/"

# Create the output folder if it does not exist
os.makedirs(output_folder, exist_ok=True)

# Regular expression to find the model name
model_pattern = re.compile(r"pretrained=([\w\-./]+)")

# Scan the files in the input folder
for filename in os.listdir(input_folder):
    file_path = os.path.join(input_folder, filename)

    # Read the file content
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Extract the model name
    match = model_pattern.search(content)
    if match:
        model_name = match.group(1)
        print(f"Processing model: {model_name}")

        try:
            # Get the model information from Hugging Face
            model_info = api.model_info(model_name)

            # Build the dictionary with the required metadata
            model_data = {
                "model": model_name,
                "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
                "revision": model_info.sha,
                "precision": "bfloat16",  # Replace with the real value if available
                #"weight_type": "Original",
                #"status": "FINISHED",
                "submitted_time": str(model_info.created_at),
                "model_type": "pretrained",
                #"likes": model_info.likes,
                #"params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
                #"license": model_info.license,
                #"private": model_info.private,
            }

            # Split model_name into two parts: before and after "/"
            if "/" in model_name:
                dir_name, file_name = model_name.split("/", 1)
            else:
                dir_name, file_name = model_name, model_name  # If there is no "/", use the same name

            # Create the folder for the first part of the model name
            model_output_folder = os.path.join(output_folder, dir_name)
            os.makedirs(model_output_folder, exist_ok=True)

            # Save the JSON file in the appropriate folder
            output_file = os.path.join(model_output_folder, f"{file_name}.json")
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(model_data, f, indent=4)

            print(f"Saved metadata for {model_name} in {output_file}")

        except Exception as e:
            print(f"Error retrieving info for {model_name}: {e}")

print("Process completed.")
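If authenticated access is needed (for example, for gated models), the commented-out token can be passed directly to the client. A minimal sketch, assuming the token is exposed through an environment variable named HF_TOKEN (the variable name is an assumption, not part of this PR):

import os
from huggingface_hub import HfApi

# Assumption: the token is provided via the HF_TOKEN environment variable;
# HfApi accepts an explicit token instead of relying on a cached login.
api = HfApi(token=os.environ.get("HF_TOKEN"))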
preprocess_models_output.py
ADDED
@@ -0,0 +1,201 @@
import json
import os
import re

def safe_float(value):
    """Convert a value to float safely. Returns None if conversion fails."""
    try:
        return float(value)
    except ValueError:
        return None


def calculate_task_metrics(task_info):
    """Calculate average accuracy, best prompt, and CPS for a task."""
    accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None]

    if not accuracies:
        return None

    task_info['average_accuracy'] = sum(accuracies) / len(accuracies)
    best_prompt_data = max(task_info['prompts'], key=lambda x: x['value'])
    task_info['best_prompt'] = best_prompt_data['value']
    task_info['prompt_id'] = best_prompt_data['prompt']

    # Calculate CPS
    avg_acc = task_info['average_accuracy']
    best_acc = task_info['best_prompt']
    task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc


def extract_data_from_file(file_path):
    """Extract task and prompt data from the given file."""
    with open(file_path, 'r') as file:
        lines = file.readlines()

    tasks_data = {}
    current_task = None

    for line in lines:
        line = line.strip()

        # Skip irrelevant lines
        if not line:
            continue

        if line.startswith("| Tasks"):
            continue

        if line.startswith("hf (pretrained="):
            # Extract the part after "pretrained="
            start = line.find("pretrained=") + len("pretrained=")
            end = line.find(",", start)  # Find the next comma
            pretrained_model = line[start:end]

            # Extract num_fewshot
            num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
            num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None

            # Extract batch_size
            batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
            batch_size = int(batch_size_match.group(1)) if batch_size_match else None

            continue

        columns = line.split('|')
        if len(columns) != 11:
            continue

        task_name = columns[1]
        metric = columns[5].strip()
        value = safe_float(columns[7])
        stderr = safe_float(columns[9])

        if metric == "acc_norm":
            continue

        # Identify tasks and prompts
        if task_name.startswith(" - "):
            task_name = task_name[3:].strip()
            current_task = task_name
            tasks_data.setdefault(current_task,
                                  {'prompts': [], 'average_accuracy': 0, 'best_prompt': None, 'prompt_id': None,
                                   'CPS': None})

        elif task_name.startswith("  - ") and current_task:
            prompt_name = task_name[4:].strip()
            # Guard against unparsable values so downstream averaging can skip them
            prompt_data = {'prompt': prompt_name, 'metric': metric,
                           'value': value * 100 if value is not None else None,
                           'stderr': stderr}
            tasks_data[current_task]['prompts'].append(prompt_data)

    # Special handling for evalita NER
    if "evalita NER" in tasks_data:
        task_info = tasks_data["evalita NER"]
        weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
                      "WN prompt-1": 2088, "WN prompt-2": 2088}

        weighted_values = {"prompt-1": 0, "prompt-2": 0}
        total_weights = sum(weight_map.values())

        for prompt in task_info['prompts']:
            if prompt['prompt'] in weight_map:
                if "prompt-1" in prompt['prompt']:
                    weighted_values["prompt-1"] += weight_map[prompt['prompt']] * prompt['value']
                elif "prompt-2" in prompt['prompt']:
                    weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value']

        task_info['prompts'] = [
            {"prompt": "prompt-1", "metric": "acc", "value": weighted_values["prompt-1"] / total_weights,
             'stderr': None},
            {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights,
             'stderr': None}]

    # Calculate metrics for each task
    for task_info in tasks_data.values():
        calculate_task_metrics(task_info)

    # Calculate average CPS
    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0

    config = {
        "model_name": pretrained_model,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size
    }

    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}


# Example usage
#file_path = '../evalita_llm_results/models_output/slurm-7769.out'
#json_output = extract_data_from_file(file_path)
#print(json_output)


# Directories for input .out files and output JSON files
directory_in_path = '../evalita_llm_models_output/'
directory_out_results_path = '../evalita_llm_results/'
directory_out_requests_path = '../evalita_llm_requests/'

# Iterate over the files in the input directory
for filename in os.listdir(directory_in_path):
    if filename.endswith('.out'):
        # Build the full file path
        file_path = os.path.join(directory_in_path, filename)

        # Run extract_data_from_file
        json_output = extract_data_from_file(file_path)

        # Split model_name into model_org_name and model_name
        model_org_name, model_name = json_output['config']['model_name'].split('/')

        # Path of the JSON configuration file under the requests directory
        config_file_path = os.path.join(directory_out_requests_path, model_org_name, f"{model_name}.json")

        # If the file exists, load it and update the config dictionary
        if os.path.exists(config_file_path):
            with open(config_file_path, 'r', encoding='utf-8') as config_file:
                additional_config = json.load(config_file)

            # Update the configuration with the new data
            json_output['config'].update(additional_config)

        # Create the folder path for model_org_name
        org_folder_path = os.path.join(directory_out_results_path, model_org_name)
        os.makedirs(org_folder_path, exist_ok=True)  # Create the folder if it does not exist

        # Build the full path of the output JSON file
        file_suffix = f"{json_output['config']['num_fewshot']}"
        output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")

        # Save the JSON with Linux-compatible line endings
        with open(output_file_path, 'w', newline="\n") as outfile:
            json.dump(json_output, outfile, indent=4)

        # Print the result
        print(f"File {filename} processed and saved to {output_file_path}")
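For reference, the Combined Performance Score computed by calculate_task_metrics above can be checked by hand. A self-contained sketch with made-up prompt accuracies (the numbers are illustrative only):

# Illustrative only: invented prompt accuracies (in percent) for one task.
accuracies = [55.0, 60.0, 65.0]

prompt_average = sum(accuracies) / len(accuracies)   # 60.0
best_prompt = max(accuracies)                        # 65.0

# Same formula as in calculate_task_metrics:
cps = (1 - (best_prompt - prompt_average) / 100) * best_prompt
print(cps)  # (1 - 5/100) * 65.0 = 61.75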
src/tasks.py
ADDED
@@ -0,0 +1,37 @@
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    # metric: str
    accuracy: str
    col_name: str


NUM_FEWSHOT = 0  # Change with your few-shot setting
# ---------------------------------------------------

# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
Evalita-LLM is a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding issues of translation into Italian and potential cultural biases; (ii) in addition to well-established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, thus mitigating the model's sensitivity to specific prompts and allowing a fairer and more objective evaluation.
"""

# Which evaluations are you running? How can people reproduce what you have?
TE_DESCRIPTION = """### Textual Entailment (TE)
The input consists of two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text.

| # | Prompt | Answer Choices |
|-----|--------|----------------|
| 1 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
| 2 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
| 3 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
| 4 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
| 5 | Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
| 6 | Devi risolvere un compito di inferenza semantica. Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |

Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above).
"""
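The Task dataclass is presumably populated elsewhere in the Space with one entry per benchmark task. A hypothetical instance for the Textual Entailment task described above, assuming src/tasks.py is importable from the repository root; the benchmark and column identifiers are placeholders, not values from this PR:

# Hypothetical usage: the identifiers below are placeholders for illustration.
from src.tasks import Task

te_task = Task(benchmark="evalita-te", accuracy="acc", col_name="TE")
print(te_task)  # Task(benchmark='evalita-te', accuracy='acc', col_name='TE')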