rzanoli committed
Commit ad489d5 · 1 Parent(s): 36a0318

Add new scripts for model processing and tasks management

Files changed (4):
  1. app2.py +0 -0
  2. get_model_info.py +74 -0
  3. preprocess_models_output.py +201 -0
  4. src/tasks.py +37 -0
app2.py ADDED
File without changes
get_model_info.py ADDED
@@ -0,0 +1,74 @@
+ import os
+ import re
+ import json
+ from huggingface_hub import HfApi
+
+ # Configure the Hugging Face token (if needed)
+ #TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
+ api = HfApi()
+
+ # Folder paths
+ input_folder = "../evalita_llm_results/models_output/"
+ output_folder = "../evalita_llm_requests2/"
+
+ # Create the output folder if it does not exist
+ os.makedirs(output_folder, exist_ok=True)
+
+ # Regular expression to find the model name
+ model_pattern = re.compile(r"pretrained=([\w\-./]+)")
+
+ # Scan the files in the input folder
+ for filename in os.listdir(input_folder):
+     file_path = os.path.join(input_folder, filename)
+
+     # Read the file content
+     with open(file_path, "r", encoding="utf-8") as f:
+         content = f.read()
+
+     # Extract the model name
+     match = model_pattern.search(content)
+     if match:
+         model_name = match.group(1)
+         print(f"Processing model: {model_name}")
+
+         try:
+             # Get the model information from Hugging Face
+             model_info = api.model_info(model_name)
+
+             # Build the dictionary with the required metadata
+             model_data = {
+                 "model": model_name,
+                 "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
+                 "revision": model_info.sha,
+                 "precision": "bfloat16",  # If available, replace with the real value
+                 #"weight_type": "Original",
+                 #"status": "FINISHED",
+                 "submitted_time": str(model_info.created_at),
+                 "model_type": "pretrained",
+                 #"likes": model_info.likes,
+                 #"params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
+                 #"license": model_info.license,
+                 #"private": model_info.private,
+             }
+
+             # Split model_name into two parts: before and after "/"
+             if "/" in model_name:
+                 dir_name, file_name = model_name.split("/", 1)
+             else:
+                 dir_name, file_name = model_name, model_name  # If there is no "/", use the same name
+
+             # Create the folder for the first part of the model name
+             model_output_folder = os.path.join(output_folder, dir_name)
+             os.makedirs(model_output_folder, exist_ok=True)
+
+             # Save the JSON file in the appropriate folder
+             output_file = os.path.join(model_output_folder, f"{file_name}.json")
+             with open(output_file, "w", encoding="utf-8") as f:
+                 json.dump(model_data, f, indent=4)
+
+             print(f"Saved metadata for {model_name} in {output_file}")
+
+         except Exception as e:
+             print(f"Error retrieving info for {model_name}: {e}")
+
+ print("Process completed.")
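As a quick sanity check of the `pretrained=` regex above, here is a minimal sketch run against a hypothetical lm-eval log line (the sample line and model name are illustrative assumptions, not taken from this commit):

import re

model_pattern = re.compile(r"pretrained=([\w\-./]+)")

# Hypothetical header line as it might appear in an lm-eval .out file
sample = "hf (pretrained=org-name/model-name,dtype=bfloat16), num_fewshot: 5, batch_size: 1"

match = model_pattern.search(sample)
if match:
    print(match.group(1))  # -> org-name/model-name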
preprocess_models_output.py ADDED
@@ -0,0 +1,201 @@
+ import json
+ import os
+ import re
+
+
+ def safe_float(value):
+     """Convert a value to float safely. Returns None if conversion fails."""
+     try:
+         return float(value)
+     except ValueError:
+         return None
+
+
+ def calculate_task_metrics(task_info):
+     """Calculate average accuracy, best prompt, and CPS for a task."""
+     accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None]
+
+     if not accuracies:
+         return None
+
+     task_info['average_accuracy'] = sum(accuracies) / len(accuracies)
+     best_prompt_data = max(task_info['prompts'], key=lambda x: x['value'])
+     task_info['best_prompt'] = best_prompt_data['value']
+     task_info['prompt_id'] = best_prompt_data['prompt']
+
+     # Calculate CPS
+     avg_acc = task_info['average_accuracy']
+     best_acc = task_info['best_prompt']
+     task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc
+
+
+ def extract_data_from_file(file_path):
+     """Extract task and prompt data from the given file."""
+     with open(file_path, 'r') as file:
+         lines = file.readlines()
+
+     tasks_data = {}
+     current_task = None
+     pretrained_model, num_fewshot, batch_size = None, None, None
+
+     for line in lines:
+         line = line.strip()
+
+         # Skip irrelevant lines
+         if not line:
+             continue
+
+         if line.startswith("| Tasks"):
+             continue
+
+         if line.startswith("hf (pretrained="):
+             # Extract the part after "pretrained="
+             start = line.find("pretrained=") + len("pretrained=")
+             end = line.find(",", start)  # Find the next comma
+             pretrained_model = line[start:end]
+
+             # Extract num_fewshot
+             num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
+             num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None
+
+             # Extract batch_size
+             batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
+             batch_size = int(batch_size_match.group(1)) if batch_size_match else None
+
+             continue
+
+         columns = line.split('|')
+         if len(columns) != 11:
+             continue
+
+         task_name = columns[1]
+         metric = columns[5].strip()
+         value = safe_float(columns[7])
+         stderr = safe_float(columns[9])
+
+         if metric == "acc_norm":
+             continue
+
+         # Identify tasks and prompts (prompt rows are indented one level deeper than task rows)
+         if task_name.startswith(" - "):
+             task_name = task_name[3:].strip()
+             current_task = task_name
+             tasks_data.setdefault(current_task,
+                                   {'prompts': [], 'average_accuracy': 0, 'best_prompt': None, 'prompt_id': None,
+                                    'CPS': None})
+
+         elif task_name.startswith("  - ") and current_task:
+             prompt_name = task_name[4:].strip()
+             prompt_data = {'prompt': prompt_name, 'metric': metric,
+                            'value': value * 100 if value is not None else None,
+                            'stderr': stderr}
+             tasks_data[current_task]['prompts'].append(prompt_data)
+
+     # Special handling for evalita NER: micro-average the per-domain scores, weighted by dataset size
+     if "evalita NER" in tasks_data:
+         task_info = tasks_data["evalita NER"]
+         weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
+                       "WN prompt-1": 2088, "WN prompt-2": 2088}
+
+         weighted_values = {"prompt-1": 0, "prompt-2": 0}
+         prompt_weights = {"prompt-1": 0, "prompt-2": 0}
+
+         for prompt in task_info['prompts']:
+             if prompt['prompt'] in weight_map:
+                 if "prompt-1" in prompt['prompt']:
+                     weighted_values["prompt-1"] += weight_map[prompt['prompt']] * prompt['value']
+                     prompt_weights["prompt-1"] += weight_map[prompt['prompt']]
+                 elif "prompt-2" in prompt['prompt']:
+                     weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value']
+                     prompt_weights["prompt-2"] += weight_map[prompt['prompt']]
+
+         # Divide each weighted sum by the weights actually accumulated for that prompt
+         task_info['prompts'] = [
+             {"prompt": "prompt-1", "metric": "acc",
+              "value": weighted_values["prompt-1"] / prompt_weights["prompt-1"] if prompt_weights["prompt-1"] else None,
+              'stderr': None},
+             {"prompt": "prompt-2", "metric": "acc",
+              "value": weighted_values["prompt-2"] / prompt_weights["prompt-2"] if prompt_weights["prompt-2"] else None,
+              'stderr': None}]
+
+     # Calculate metrics for each task
+     for task_info in tasks_data.values():
+         calculate_task_metrics(task_info)
+
+     # Calculate average CPS
+     tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
+     average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0
+
+     config = {
+         "model_name": pretrained_model,
+         "num_fewshot": num_fewshot,
+         "batch_size": batch_size
+     }
+
+     return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
+
+
+ # Example usage
+ #file_path = '../evalita_llm_results/models_output/slurm-7769.out'
+ #json_output = extract_data_from_file(file_path)
+ #print(json_output)
+
+
+ # Directory to read the .out files from
+ directory_in_path = '../evalita_llm_models_output/'
+ directory_out_results_path = '../evalita_llm_results/'
+ directory_out_requests_path = '../evalita_llm_requests/'
+
+ # Iterate over the files in the directory
+ for filename in os.listdir(directory_in_path):
+     if filename.endswith('.out'):
+         # Build the full path of the file
+         file_path = os.path.join(directory_in_path, filename)
+
+         # Run extract_data_from_file
+         json_output = extract_data_from_file(file_path)
+
+         # Split the model name into model_org_name and model_name
+         model_org_name, model_name = json_output['config']['model_name'].split('/')
+
+         # Path of the model's configuration JSON file
+         config_file_path = os.path.join(directory_out_requests_path, model_org_name, f"{model_name}.json")
+
+         # If the file exists, load it and update the config dictionary
+         if os.path.exists(config_file_path):
+             with open(config_file_path, 'r', encoding='utf-8') as config_file:
+                 additional_config = json.load(config_file)
+
+             # Update the configuration with the new data
+             json_output['config'].update(additional_config)
+
+         # Create the folder path for model_org_name
+         org_folder_path = os.path.join(directory_out_results_path, model_org_name)
+         os.makedirs(org_folder_path, exist_ok=True)  # Create the folder if it does not exist
+
+         # Build the full path of the output JSON file
+         file_suffix = f"{json_output['config']['num_fewshot']}"
+         output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")
+
+         # Save the JSON to a file with Linux-style line endings
+         with open(output_file_path, 'w', newline="\n") as outfile:
+             json.dump(json_output, outfile, indent=4)
+
+         # Print the result
+         print(f"Processed {filename} and saved it to {output_file_path}")
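To make the CPS formula in calculate_task_metrics concrete, here is a small worked example with made-up accuracies (illustrative numbers only, not results from this repository):

# Three prompts for one task, accuracies already expressed in percent
prompt_accuracies = [58.0, 70.0, 66.0]

average_accuracy = sum(prompt_accuracies) / len(prompt_accuracies)    # ≈ 64.67
best_prompt = max(prompt_accuracies)                                  # 70.0
CPS = (1 - (best_prompt - average_accuracy) / 100) * best_prompt      # ≈ 0.9467 * 70 ≈ 66.27
print(round(CPS, 2))

The penalty term (best_prompt - average_accuracy) / 100 rewards stability across prompts: the larger the gap between the best prompt and the average, the more the best-prompt accuracy is discounted.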
src/tasks.py ADDED
@@ -0,0 +1,37 @@
+ from dataclasses import dataclass
+ from enum import Enum
+
+ @dataclass
+ class Task:
+     benchmark: str
+     # metric: str
+     accuracy: str
+     col_name: str
+
+
+ NUM_FEWSHOT = 0  # Change this to your few-shot setting
+ # ---------------------------------------------------
+
+ # Your leaderboard name
+ TITLE = """<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀</h1>"""
+
+ # What does your leaderboard evaluate?
+ INTRODUCTION_TEXT = """
+ Evalita-LLM is a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding issues of translating from Italian and potential cultural biases; (ii) in addition to well-established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, thereby mitigating model sensitivity to specific prompts and allowing a fairer and more objective evaluation.
+ """
+
+ # Which evaluations are you running? How can people reproduce what you have?
+ TE_DESCRIPTION = """### Textual Entailment (TE)
+ The input consists of two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text.
+
+ | # | Prompt | Answer Choices |
+ |-----|--------|----------------|
+ | 1 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
+ | 2 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
+ | 3 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
+ | 4 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
+ | 5 | Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
+ | 6 | Devi risolvere un compito di inferenza semantica. Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
+
+ Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt, where Prompt Average is the accuracy averaged over the six prompts, Best Prompt is the accuracy of the best prompt, and Prompt ID is the ID of the best prompt (see the legend above).
+ """
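The Enum import in src/tasks.py is not used yet in this commit. In the standard Hugging Face leaderboard template, the Task dataclass is typically wrapped in an Enum that lists the benchmark's tasks; a minimal sketch is shown below, with hypothetical benchmark keys and column names that are assumptions rather than values taken from this commit:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    accuracy: str
    col_name: str

class Tasks(Enum):
    # Hypothetical entries: (benchmark key, accuracy field, display column name)
    task0 = Task("evalita_te", "acc", "TE")
    task1 = Task("evalita_ner", "acc", "NER")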