Small changes

- get_model_info.py  +41 -5
- preprocess_models_output.py  +92 -3
- src/tasks.py  +1 -1
get_model_info.py
CHANGED
@@ -1,6 +1,36 @@
-
-
-
+"""
+MODEL METADATA EXTRACTOR
+
+This script processes model evaluation output files (input_folder) from the lm-eval-harness library,
+extracts model identifiers, retrieves detailed metadata from HuggingFace,
+and saves the information as structured JSON files (output_folder).
+
+Input: Directory containing .out files from lm-eval-harness
+Output: Directory with JSON files containing model metadata
+"""
+
+# Example input file format (lm-eval-harness output):
+'''
+hf (pretrained=swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1
+|          Tasks          |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
+|-------------------------|------:|------|-----:|--------|---|-----:|---|------|
+|evalita-mp               |      1|none  |      |acc     |↑  |0.5605|±  |0.0052|
+...
+Job completed
+'''
+
+# Example output JSON format:
+'''
+{
+    "model": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA",
+    "base_model": "LlamaForCausalLM",
+    "revision": "2b6e46e4c9d341dc8bf8350a167492c880116b66",
+    "submitted_time": "2024-04-29 09:34:12+00:00",
+    "num_params_billion": 8.030261248,
+    "language": "en_it"
+}
+'''
+
 import os
 import re
 import json
@@ -38,7 +68,7 @@ for filename in os.listdir(input_folder):
     print(f"Processing model: {model_name}")
 
     try:
-        # Retrieves model information from
+        # Retrieves model information from HuggingFace
         model_info = api.model_info(model_name)
 
         # Calculates the number of parameters in billions, if available
@@ -49,7 +79,7 @@ for filename in os.listdir(input_folder):
         # Extracts and concatenates languages
         language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else ""
 
-        print(model_info)
+        #print(model_info)
 
         # Builds the dictionary with required metadata
         model_data = {
@@ -81,6 +111,12 @@ for filename in os.listdir(input_folder):
 
         # Saves the JSON file in the appropriate folder
        output_file = os.path.join(model_output_folder, f"{file_name}.json")
+
+        # Check if the file already exists
+        if os.path.exists(output_file):
+            print(f"File {output_file} already exists. Skipping...")
+            continue
+
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(model_data, f, indent=4)
 
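The new existence check makes re-runs cheap: JSON files already on disk are skipped rather than re-fetched. For context, here is a minimal sketch of the huggingface_hub lookup that the try/except above wraps. The helper name is hypothetical, and deriving num_params_billion from the safetensors total is an assumption (it is consistent with the 8.030261248 figure in the example JSON), not something the commit itself shows:

    from huggingface_hub import HfApi

    api = HfApi()

    def fetch_model_metadata(model_name: str) -> dict:
        # Hypothetical helper: gathers the fields shown in the example JSON.
        info = api.model_info(model_name)  # network call; raises for missing or gated repos
        # Parameter count in billions, when the Hub reports safetensors totals
        params_b = info.safetensors.total / 1e9 if info.safetensors else None
        # card_data may be absent, and "language" may be a string or a list
        langs = (info.card_data.get("language") or []) if info.card_data else []
        language = "_".join([langs] if isinstance(langs, str) else langs)
        return {
            "model": model_name,
            "revision": info.sha,
            "num_params_billion": params_b,
            "language": language,
        }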
preprocess_models_output.py
CHANGED
@@ -1,3 +1,75 @@
+"""
+EVALITA LLM EVALUATION PROCESSOR
+
+Transforms raw model evaluation outputs into structured performance reports for leaderboard integration.
+
+DATA PIPELINE OVERVIEW:
+
+1. Inputs:
+   - Evaluation Results: Raw .out files from lm-eval-harness
+   - Model Metadata: Pre-collected .json files from HuggingFace
+
+2. Output:
+   - Comprehensive evaluation reports in JSON format
+   - Ready for ingestion into the evaluation leaderboard
+
+--------------------------------------------------------------------
+INPUT SPECIFICATION
+
+Evaluation Results (.out format):
+  hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1
+  | Task          | Metric | Value  | Stderr |
+  |---------------|--------|--------|--------|
+  | main-task     | acc    | 0.5605 | 0.0052 |
+  |  - sub-task   | acc    | 0.4640 | 0.0088 |
+  |   - prompt-1  | acc    | 0.3720 | 0.0216 |
+
+Model Metadata (.json format):
+  {
+    "model": "model-org/model-name",
+    "base_model": "ModelArchitecture",
+    "revision": "git_commit_hash",
+    "parameters": 8.03,
+    "language": "en_it"
+  }
+
+--------------------------------------------------------------------
+OUTPUT SPECIFICATION
+
+Evaluation Report (.json format):
+  {
+    "summary_metrics": {
+      "average_CPS": 41.74,
+      "num_tasks": 12
+    },
+    "model_config": {
+      "identifier": "model-org/model-name",
+      "architecture": "ModelArchitecture",
+      "parameters": 8.03,
+      "evaluation_settings": {
+        "fewshot": 5,
+        "batch_size": 1
+      }
+    },
+    "task_results": {
+      "task-name": {
+        "average_score": 52.60,
+        "best_prompt": {
+          "id": "prompt-6",
+          "score": 66.57
+        },
+        "prompt_analysis": [
+          {
+            "prompt_id": "prompt-1",
+            "score": 37.20,
+            "stderr": 0.0216
+          }
+        ]
+      }
+    }
+  }
+"""
+
 import json
 import os
 import re
@@ -126,9 +198,26 @@ def extract_data_from_file(file_path):
 
     return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
 
-
-
-
+
+"""
+MAIN PROCESSING PIPELINE
+
+This script executes the complete evaluation data processing workflow:
+
+1. Input Sources:
+   - Raw evaluation results (.out files) from: ../evalita_llm_models_output/
+   - Model metadata JSON files from: ../evalita_llm_requests/
+
+2. Processing Steps:
+   - Parses evaluation metrics from .out files
+   - Combines with model metadata
+   - Calculates aggregated performance statistics
+
+3. Output:
+   - Structured JSON results saved to: ../evalita_llm_results/
+   - Organized by model organization/name
+   - Contains complete evaluation results with metadata
+"""
 directory_in_path = '../evalita_llm_models_output/'
 directory_in_requests_path = '../evalita_llm_requests/'
 directory_out_results_path = '../evalita_llm_results/'
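Both new docstrings describe the .out table that extract_data_from_file() consumes but leave the parsing itself implicit. A rough, self-contained sketch of that step follows; the cell positions and key names are illustrative assumptions based on the documented .out format, not code from this commit:

    import re

    # Matches the run header, e.g.
    # "hf (pretrained=org/name,...), ... num_fewshot: 5, batch_size: 1"
    CONFIG_RE = re.compile(r"pretrained=([^,)]+).*num_fewshot:\s*(\d+).*batch_size:\s*(\d+)")

    def parse_out_text(text: str) -> dict:
        config, rows = {}, []
        for line in text.splitlines():
            if (m := CONFIG_RE.search(line)):
                config = {"model_name": m.group(1),
                          "num_fewshot": int(m.group(2)),
                          "batch_size": int(m.group(3))}
                continue
            if not line.lstrip().startswith("|"):
                continue
            # Data rows look like: |evalita-mp | 1|none | |acc |↑ |0.5605|± |0.0052|
            cells = [c.strip() for c in line.strip().strip("|").split("|")]
            try:  # header and separator rows fail the float() conversions
                rows.append({"task": cells[0],
                             "accuracy": float(cells[6]),
                             "stderr": float(cells[8])})
            except (IndexError, ValueError):
                continue
        return {"config": config, "tasks": rows}

Fed the example output shown in get_model_info.py, this yields the pretrained model name with its few-shot settings plus one numeric record per table row; the float() guard drops header and separator rows.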
src/tasks.py
CHANGED
@@ -156,7 +156,7 @@ NER_DESCRIPTION = """### Named Entity Recognition (NER)
 """
 
 REL_DESCRIPTION = """### Relation Extraction (REL)
-The
+The input is a sentence from a clinical text. The model must identify and extract relationships between laboratory test results (e.g., blood pressure) and the corresponding tests or procedures that generated them (e.g., a blood pressure test).
 
 |  #  | Prompt                                                                           |
 |-----|--------------------------------------------------------------------------------|