|
"""
|
|
EVALITA LLM EVALUATION PROCESSOR
|
|
|
|
Transforms raw model evaluation outputs into structured performance reports for leaderboard integration.
|
|
|
|
DATA PIPELINE OVERVIEW:
|
|
|
|
1. Inputs:
|
|
- Evaluation Results: Raw .out files from lm-eval-harness
|
|
- Model Metadata: Pre-collected .json files from HuggingFace
|
|
|
|
2. Output:
|
|
- Comprehensive evaluation reports in JSON format
|
|
- Ready for ingestion into the evaluation leaderboard
|
|
|
|
--------------------------------------------------------------------
|
|
INPUT SPECIFICATION
|
|
|
|
Evaluation Results (.out format):
|
|
hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1
|
|
| Task | Metric | Value | Stderr |
|
|
|---------------|--------|--------|--------|
|
|
| main-task | acc | 0.5605 | 0.0052 |
|
|
| - sub-task | acc | 0.4640 | 0.0088 |
|
|
| - prompt-1 | acc | 0.3720 | 0.0216 |
|
|
|
|
Model Metadata (.json format):
|
|
{
|
|
"model": "model-org/model-name",
|
|
"base_model": "ModelArchitecture",
|
|
"revision": "git_commit_hash",
|
|
"parameters": 8.03,
|
|
"language": "en_it"
|
|
}
|
|
|
|
--------------------------------------------------------------------
|
|
OUTPUT SPECIFICATION
|
|
|
|
Evaluation Report (.json format):
|
|
{
|
|
"summary_metrics": {
|
|
"average_CPS": 41.74,
|
|
"num_tasks": 12
|
|
},
|
|
"model_config": {
|
|
"identifier": "model-org/model-name",
|
|
"architecture": "ModelArchitecture",
|
|
"parameters": 8.03,
|
|
"evaluation_settings": {
|
|
"fewshot": 5,
|
|
"batch_size": 1
|
|
}
|
|
},
|
|
"task_results": {
|
|
"task-name": {
|
|
"average_score": 52.60,
|
|
"best_prompt": {
|
|
"id": "prompt-6",
|
|
"score": 66.57
|
|
},
|
|
"prompt_analysis": [
|
|
{
|
|
"prompt_id": "prompt-1",
|
|
"score": 37.20,
|
|
"stderr": 0.0216
|
|
}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
|
|
def safe_float(value):
    """Convert ``value`` to a float, returning ``None`` when it cannot be converted.

    Args:
        value: Anything ``float()`` accepts, e.g. a number or a numeric string
            (surrounding whitespace is tolerated by ``float()``).

    Returns:
        The converted float, or ``None`` when ``value`` is malformed text,
        ``None``, or a type ``float()`` does not support.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: text that is not a number.
        # TypeError: None or another non-numeric type.
        return None
|
|
|
|
|
|
def calculate_task_metrics(task_info):
    """Compute average accuracy, best prompt and CPS for one task, in place.

    Args:
        task_info: Task record with a ``'prompts'`` list. Each prompt is a dict
            holding at least ``'prompt'`` (identifier) and ``'value'`` (score,
            or ``None`` when no score is available).

    Returns:
        None. When at least one prompt has a score, ``task_info`` is updated
        with ``'average_accuracy'``, ``'best_prompt'``, ``'prompt_id'`` and
        ``'CPS'``; otherwise ``task_info`` is left untouched.
    """
    # Unscored prompts are excluded from every computation: comparing None with
    # a float inside max() would raise TypeError.
    scored_prompts = [p for p in task_info['prompts'] if p['value'] is not None]
    if not scored_prompts:
        return None

    avg_acc = sum(p['value'] for p in scored_prompts) / len(scored_prompts)
    best_prompt_data = max(scored_prompts, key=lambda p: p['value'])
    best_acc = best_prompt_data['value']

    task_info['average_accuracy'] = avg_acc
    task_info['best_prompt'] = best_acc
    task_info['prompt_id'] = best_prompt_data['prompt']
    # CPS scales the best score by (1 - gap / 100), where gap is the distance
    # between the best and the average score.
    task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc
    return None
|
|
|
|
|
|
def _merge_ner_prompts(task_info):
    """Replace the per-subset "evalita NER" prompts with one weighted average per prompt.

    The prompts named in ``weight_map`` are combined, using the fixed weights
    listed there, into a single "prompt-1" and a single "prompt-2" entry that
    overwrite ``task_info['prompts']``.
    """
    weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
                  "WN prompt-1": 2088, "WN prompt-2": 2088}

    weighted_values = {"prompt-1": 0, "prompt-2": 0}
    total_weights = {"prompt-1": 0, "prompt-2": 0}
    for name, weight in weight_map.items():
        # Each prompt is normalised by the weights of its own subsets only.
        # Dividing by the weights of both prompts would halve every average.
        total_weights[name.split()[-1]] += weight

    for prompt in task_info['prompts']:
        weight = weight_map.get(prompt['prompt'])
        if weight is None:
            continue
        weighted_values[prompt['prompt'].split()[-1]] += weight * prompt['value']

    task_info['prompts'] = [
        {"prompt": prompt_id, "metric": "acc",
         "value": weighted_values[prompt_id] / total_weights[prompt_id],
         'stderr': None}
        for prompt_id in ("prompt-1", "prompt-2")]


def extract_data_from_file(file_path):
    """Parse one evaluation ``.out`` file into per-task prompt scores and CPS.

    The file is expected to contain a header line starting with
    ``hf (pretrained=`` followed by pipe-delimited table rows. Only rows that
    split into exactly 11 fields on ``|`` are used: field 1 is the task/prompt
    name, field 5 the metric, field 7 the value and field 9 the stderr.
    Names prefixed with ``" - "`` open a task; names prefixed with ``"  - "``
    are prompts of the most recently opened task.

    Args:
        file_path: Path of the ``.out`` file to read.

    Returns:
        A dict with:
          * ``'average_CPS'``: mean CPS over tasks that have one (0 if none);
          * ``'config'``: ``model_name``, ``num_fewshot`` (string as written in
            the header) and ``batch_size`` (int); each is ``None`` when the
            header is missing or lacks that field;
          * ``'tasks'``: mapping of task name to its record (``prompts``,
            ``average_accuracy``, ``best_prompt``, ``prompt_id``, ``CPS``).

    Raises:
        OSError: If the file cannot be opened.
    """
    # Defaults keep the returned config well-defined when no header line exists.
    pretrained_model = None
    num_fewshot = None
    batch_size = None

    tasks_data = {}
    current_task = None

    # Explicit encoding: the result must not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()

            # Skip empty lines.
            if not line:
                continue

            # Skip the table header.
            if line.startswith("| Tasks"):
                continue

            # Model configuration header.
            if line.startswith("hf (pretrained="):
                # Stop at the first "," or ")" so both
                # "pretrained=org/name,dtype=...)" and "pretrained=org/name)"
                # yield a clean model identifier.
                model_match = re.search(r"pretrained=([^,)]+)", line)
                pretrained_model = model_match.group(1) if model_match else None

                num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
                num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None

                batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
                batch_size = int(batch_size_match.group(1)) if batch_size_match else None
                continue

            columns = line.split('|')
            if len(columns) != 11:
                continue

            # Not stripped: the leading spaces encode the nesting level.
            task_name = columns[1]
            metric = columns[5].strip()
            value = safe_float(columns[7])
            stderr = safe_float(columns[9])

            # Normalised accuracy rows are not used.
            if metric == "acc_norm":
                continue

            if task_name.startswith(" - "):
                # Task row: becomes the owner of the prompt rows that follow.
                current_task = task_name[3:].strip()
                tasks_data.setdefault(current_task,
                                      {'prompts': [], 'average_accuracy': 0, 'best_prompt': None,
                                       'prompt_id': None, 'CPS': None})

            elif task_name.startswith("  - ") and current_task:
                # Prompt row (one level deeper than a task row).
                if value is None:
                    # No numeric score in this row; nothing to record.
                    continue
                prompt_data = {'prompt': task_name[4:].strip(), 'metric': metric, 'value': value * 100,
                               'stderr': stderr}
                tasks_data[current_task]['prompts'].append(prompt_data)

    # "evalita NER" reports one score per subset; merge them per prompt first.
    if "evalita NER" in tasks_data:
        _merge_ner_prompts(tasks_data["evalita NER"])

    for task_info in tasks_data.values():
        calculate_task_metrics(task_info)

    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0

    config = {
        "model_name": pretrained_model,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size,
    }

    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
|
|
|
|
|
|
"""
|
|
MAIN PROCESSING PIPELINE
|
|
|
|
This script executes the complete evaluation data processing workflow:
|
|
|
|
1. Input Sources:
|
|
- Raw evaluation results (.out files) from: ../evalita_llm_models_output/
|
|
- Model metadata JSON files from: ../evalita_llm_requests/
|
|
|
|
2. Processing Steps:
|
|
- Parses evaluation metrics from .out files
|
|
- Combines with model metadata
|
|
- Calculates aggregated performance statistics
|
|
|
|
3. Output:
|
|
- Structured JSON results saved to: ../evalita_llm_results/
|
|
- Organized by model organization/name
|
|
- Contains complete evaluation results with metadata
|
|
"""
|
|
directory_in_path = '../evalita_llm_models_output/'
directory_in_requests_path = '../evalita_llm_requests/'
directory_out_results_path = '../evalita_llm_results/'

# Convert every ".out" evaluation report into a JSON result file stored under
# <results dir>/<organization>/<model name>_<num_fewshot>.json.
for filename in os.listdir(directory_in_path):
    if not filename.endswith('.out'):
        continue

    file_path = os.path.join(directory_in_path, filename)
    json_output = extract_data_from_file(file_path)

    # The identifier must be exactly "<organization>/<name>": both parts are
    # used as path components below. Skip anything else with a message rather
    # than aborting the remaining files.
    model_identifier = json_output['config']['model_name']
    identifier_parts = model_identifier.split('/') if isinstance(model_identifier, str) else []
    if len(identifier_parts) != 2 or not all(identifier_parts):
        print(f"File {filename} skipped: unexpected model identifier {model_identifier!r}")
        continue
    model_org_name, model_name = identifier_parts

    # Optional pre-collected metadata for this model; its keys are merged into
    # the parsed config and override entries with the same name.
    config_file_path = os.path.join(directory_in_requests_path, model_org_name, f"{model_name}.json")

    if os.path.exists(config_file_path):
        with open(config_file_path, 'r', encoding='utf-8') as config_file:
            additional_config = json.load(config_file)
        json_output['config'].update(additional_config)

    # One output folder per organization.
    org_folder_path = os.path.join(directory_out_results_path, model_org_name)
    os.makedirs(org_folder_path, exist_ok=True)

    file_suffix = f"{json_output['config']['num_fewshot']}"
    output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")

    # newline="\n" keeps line endings identical across platforms; the explicit
    # encoding does the same for the bytes written.
    with open(output_file_path, 'w', newline="\n", encoding='utf-8') as outfile:
        json.dump(json_output, outfile, indent=4)

    print(f"File {filename} processed and saved to {output_file_path}")