rzanoli committed on
Commit
5a8f6c4
·
1 Parent(s): 5b04d4e

Small changes

Browse files
Files changed (3) hide show
  1. get_model_info.py +41 -5
  2. preprocess_models_output.py +92 -3
  3. src/tasks.py +1 -1
get_model_info.py CHANGED
@@ -1,6 +1,36 @@
1
- # Reads model output files (including accuracy values) produced by lm-eval-harness,
2
- # extracts model names, downloads their characteristics from HuggingFace, and saves metadata
3
- # (such as parameter count and pre-training status) to model-specific JSON files.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import os
5
  import re
6
  import json
@@ -38,7 +68,7 @@ for filename in os.listdir(input_folder):
38
  print(f"Processing model: {model_name}")
39
 
40
  try:
41
- # Retrieves model information from Hugging Face
42
  model_info = api.model_info(model_name)
43
 
44
  # Calculates the number of parameters in billions, if available
@@ -49,7 +79,7 @@ for filename in os.listdir(input_folder):
49
  # Extracts and concatenates languages
50
  language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else ""
51
 
52
- print(model_info)
53
 
54
  # Builds the dictionary with required metadata
55
  model_data = {
@@ -81,6 +111,12 @@ for filename in os.listdir(input_folder):
81
 
82
  # Saves the JSON file in the appropriate folder
83
  output_file = os.path.join(model_output_folder, f"{file_name}.json")
 
 
 
 
 
 
84
  with open(output_file, "w", encoding="utf-8") as f:
85
  json.dump(model_data, f, indent=4)
86
 
 
1
+ """
2
+ MODEL METADATA EXTRACTOR
3
+
4
+ This script processes model evaluation output files (input_folder) from the lm-eval-harness library,
5
+ extracts model identifiers, retrieves detailed metadata from HuggingFace
6
+ and saves the information as structured JSON files (output_folder).
7
+
8
+ Input: Directory containing .out files from lm-eval-harness
9
+ Output: Directory with JSON files containing model metadata
10
+ """
11
+
12
+ # Example input file format (lm-eval-harness output):
13
+ '''
14
+ hf (pretrained=swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1
15
+ | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
16
+ |------------------------|------:|------|-----:|--------|---|-----:|---|------|
17
+ |evalita-mp | 1|none | |acc |↑ |0.5605|± |0.0052|
18
+ ...
19
+ Job completed
20
+ '''
21
+
22
+ # Example output JSON format:
23
+ '''
24
+ {
25
+ "model": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA",
26
+ "base_model": "LlamaForCausalLM",
27
+ "revision": "2b6e46e4c9d341dc8bf8350a167492c880116b66",
28
+ "submitted_time": "2024-04-29 09:34:12+00:00",
29
+ "num_params_billion": 8.030261248,
30
+ "language": "en_it"
31
+ }
32
+ '''
33
+
34
  import os
35
  import re
36
  import json
 
68
  print(f"Processing model: {model_name}")
69
 
70
  try:
71
+ # Retrieves model information from HuggingFace
72
  model_info = api.model_info(model_name)
73
 
74
  # Calculates the number of parameters in billions, if available
 
79
  # Extracts and concatenates languages
80
  language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else ""
81
 
82
+ #print(model_info)
83
 
84
  # Builds the dictionary with required metadata
85
  model_data = {
 
111
 
112
  # Saves the JSON file in the appropriate folder
113
  output_file = os.path.join(model_output_folder, f"{file_name}.json")
114
+
115
+ # Check if the file already exists
116
+ if os.path.exists(output_file):
117
+ print(f"File {output_file} already exists. Skipping...")
118
+ continue
119
+
120
  with open(output_file, "w", encoding="utf-8") as f:
121
  json.dump(model_data, f, indent=4)
122
 
preprocess_models_output.py CHANGED
@@ -1,3 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
  import os
3
  import re
@@ -126,9 +198,26 @@ def extract_data_from_file(file_path):
126
 
127
  return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
128
 
129
- # Main script: processes .out files, extracts data, and saves JSON results.
130
- # Reads .out files from directory_in_path, parses data including model config and task metrics,
131
- # and saves results as JSON files in directory_out_results_path, merging config from directory_out_requests_path if available.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  directory_in_path = '../evalita_llm_models_output/'
133
  directory_in_requests_path = '../evalita_llm_requests/'
134
  directory_out_results_path = '../evalita_llm_results/'
 
1
+ """
2
+ EVALITA LLM EVALUATION PROCESSOR
3
+
4
+ Transforms raw model evaluation outputs into structured performance reports for leaderboard integration.
5
+
6
+ DATA PIPELINE OVERVIEW:
7
+
8
+ 1. Inputs:
9
+ - Evaluation Results: Raw .out files from lm-eval-harness
10
+ - Model Metadata: Pre-collected .json files from HuggingFace
11
+
12
+ 2. Output:
13
+ - Comprehensive evaluation reports in JSON format
14
+ - Ready for ingestion into the evaluation leaderboard
15
+
16
+ --------------------------------------------------------------------
17
+ INPUT SPECIFICATION
18
+
19
+ Evaluation Results (.out format):
20
+ hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1
21
+ | Task | Metric | Value | Stderr |
22
+ |---------------|--------|--------|--------|
23
+ | main-task | acc | 0.5605 | 0.0052 |
24
+ | - sub-task | acc | 0.4640 | 0.0088 |
25
+ | - prompt-1 | acc | 0.3720 | 0.0216 |
26
+
27
+ Model Metadata (.json format):
28
+ {
29
+ "model": "model-org/model-name",
30
+ "base_model": "ModelArchitecture",
31
+ "revision": "git_commit_hash",
32
+ "parameters": 8.03,
33
+ "language": "en_it"
34
+ }
35
+
36
+ --------------------------------------------------------------------
37
+ OUTPUT SPECIFICATION
38
+
39
+ Evaluation Report (.json format):
40
+ {
41
+ "summary_metrics": {
42
+ "average_CPS": 41.74,
43
+ "num_tasks": 12
44
+ },
45
+ "model_config": {
46
+ "identifier": "model-org/model-name",
47
+ "architecture": "ModelArchitecture",
48
+ "parameters": 8.03,
49
+ "evaluation_settings": {
50
+ "fewshot": 5,
51
+ "batch_size": 1
52
+ }
53
+ },
54
+ "task_results": {
55
+ "task-name": {
56
+ "average_score": 52.60,
57
+ "best_prompt": {
58
+ "id": "prompt-6",
59
+ "score": 66.57
60
+ },
61
+ "prompt_analysis": [
62
+ {
63
+ "prompt_id": "prompt-1",
64
+ "score": 37.20,
65
+ "stderr": 0.0216
66
+ }
67
+ ]
68
+ }
69
+ }
70
+ }
71
+ """
72
+
73
  import json
74
  import os
75
  import re
 
198
 
199
  return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
200
 
201
+
202
+ """
203
+ MAIN PROCESSING PIPELINE
204
+
205
+ This script executes the complete evaluation data processing workflow:
206
+
207
+ 1. Input Sources:
208
+ - Raw evaluation results (.out files) from: ../evalita_llm_models_output/
209
+ - Model metadata JSON files from: ../evalita_llm_requests/
210
+
211
+ 2. Processing Steps:
212
+ - Parses evaluation metrics from .out files
213
+ - Combines with model metadata
214
+ - Calculates aggregated performance statistics
215
+
216
+ 3. Output:
217
+ - Structured JSON results saved to: ../evalita_llm_results/
218
+ - Organized by model organization/name
219
+ - Contains complete evaluation results with metadata
220
+ """
221
  directory_in_path = '../evalita_llm_models_output/'
222
  directory_in_requests_path = '../evalita_llm_requests/'
223
  directory_out_results_path = '../evalita_llm_results/'
src/tasks.py CHANGED
@@ -156,7 +156,7 @@ NER_DESCRIPTION = """### Named Entity Recognition (NER)
156
  """
157
 
158
  REL_DESCRIPTION = """### Relation Extraction (REL)
159
- The task involves analyzing clinical text to extract relationships between laboratory test results (e.g., blood pressure) and the tests or procedures that produced them (e.g., blood pressure test).
160
 
161
  | # | Prompt |
162
  |-----|--------------------------------------------------------------------------------|
 
156
  """
157
 
158
  REL_DESCRIPTION = """### Relation Extraction (REL)
159
+ The input is a sentence of a clinical text. The model must identify and extract relationships between laboratory test results (e.g., blood pressure) and the corresponding tests or procedures that generated them (e.g., blood pressure test).
160
 
161
  | # | Prompt |
162
  |-----|--------------------------------------------------------------------------------|