""" EVALITA LLM EVALUATION PROCESSOR Transforms raw model evaluation outputs into structured performance reports for leaderboard integration. DATA PIPELINE OVERVIEW: 1. Inputs: - Evaluation Results: Raw .out files from lm-eval-harness - Model Metadata: Pre-collected .json files from HuggingFace 2. Output: - Comprehensive evaluation reports in JSON format - Ready for ingestion into the evaluation leaderboard -------------------------------------------------------------------- INPUT SPECIFICATION Evaluation Results (.out format): hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1 | Task | Metric | Value | Stderr | |---------------|--------|--------|--------| | main-task | acc | 0.5605 | 0.0052 | | - sub-task | acc | 0.4640 | 0.0088 | | - prompt-1 | acc | 0.3720 | 0.0216 | Model Metadata (.json format): { "model": "model-org/model-name", "base_model": "ModelArchitecture", "revision": "git_commit_hash", "parameters": 8.03, "language": "en_it" } -------------------------------------------------------------------- OUTPUT SPECIFICATION Evaluation Report (.json format): { "summary_metrics": { "average_CPS": 41.74, "num_tasks": 12 }, "model_config": { "identifier": "model-org/model-name", "architecture": "ModelArchitecture", "parameters": 8.03, "evaluation_settings": { "fewshot": 5, "batch_size": 1 } }, "task_results": { "task-name": { "average_score": 52.60, "best_prompt": { "id": "prompt-6", "score": 66.57 }, "prompt_analysis": [ { "prompt_id": "prompt-1", "score": 37.20, "stderr": 0.0216 } ] } } } """ import json import os import re def safe_float(value): """Safely converts a value to float, returning None if the conversion fails.""" try: return float(value) except ValueError: return None def calculate_task_metrics(task_info): """Calculates average accuracy, best prompt accuracy, and CPS for a given task.""" accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None] if not accuracies: return None task_info['average_accuracy'] = sum(accuracies) / len(accuracies) best_prompt_data = max(task_info['prompts'], key=lambda x: x['value']) task_info['best_prompt'] = best_prompt_data['value'] task_info['prompt_id'] = best_prompt_data['prompt'] # Calculate CPS avg_acc = task_info['average_accuracy'] best_acc = task_info['best_prompt'] task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc def extract_data_from_file(file_path): """Extracts task and prompt data from a specified file.""" with open(file_path, 'r') as file: lines = file.readlines() tasks_data = {} current_task = None for line in lines: line = line.strip() # Skips empty lines if not line: continue # Skips header lines if line.startswith("| Tasks"): continue # Extracts model configuration details if line.startswith("hf (pretrained="): start = line.find("pretrained=") + len("pretrained=") end = line.find(",", start) pretrained_model = line[start:end] num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line) num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None batch_size_match = re.search(r"batch_size:\s*(\d+)", line) batch_size = int(batch_size_match.group(1)) if batch_size_match else None continue columns = line.split('|') if len(columns) != 11: continue task_name = columns[1] metric = columns[5].strip() value = safe_float(columns[7]) stderr = safe_float(columns[9]) # Skips normalized accuracy metrics if metric == "acc_norm": continue # Identifies task and prompt sections in the file if task_name.startswith(" - "): task_name = task_name[3:].strip() 
def extract_data_from_file(file_path):
    """Extracts task and prompt data from a specified file."""
    with open(file_path, 'r') as file:
        lines = file.readlines()

    tasks_data = {}
    current_task = None

    # Model configuration, filled in when the "hf (pretrained=..." header line is found
    pretrained_model = None
    num_fewshot = None
    batch_size = None

    for line in lines:
        line = line.strip()

        # Skips empty lines
        if not line:
            continue

        # Skips header lines
        if line.startswith("| Tasks"):
            continue

        # Extracts model configuration details
        if line.startswith("hf (pretrained="):
            start = line.find("pretrained=") + len("pretrained=")
            end = line.find(",", start)
            pretrained_model = line[start:end]

            num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
            num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None

            batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
            batch_size = int(batch_size_match.group(1)) if batch_size_match else None
            continue

        columns = line.split('|')
        if len(columns) != 11:
            continue

        task_name = columns[1]
        metric = columns[5].strip()
        value = safe_float(columns[7])
        stderr = safe_float(columns[9])

        # Skips normalized accuracy metrics
        if metric == "acc_norm":
            continue

        # Identifies task and prompt sections in the file:
        # task rows are indented with " - ", prompt rows with "  - "
        if task_name.startswith(" - "):
            task_name = task_name[3:].strip()
            current_task = task_name
            tasks_data.setdefault(current_task, {'prompts': [], 'average_accuracy': 0,
                                                 'best_prompt': None, 'prompt_id': None, 'CPS': None})
        elif task_name.startswith("  - ") and current_task:
            prompt_name = task_name[4:].strip()
            prompt_data = {'prompt': prompt_name, 'metric': metric,
                           'value': value * 100 if value is not None else None, 'stderr': stderr}
            tasks_data[current_task]['prompts'].append(prompt_data)

    # Special handling for the evalita NER task: its prompts are reported per split
    # (ADG, FIC, WN) and are combined into a weighted average per prompt
    if "evalita NER" in tasks_data:
        task_info = tasks_data["evalita NER"]
        weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521,
                      "FIC prompt-1": 1517, "FIC prompt-2": 1517,
                      "WN prompt-1": 2088, "WN prompt-2": 2088}

        weighted_values = {"prompt-1": 0, "prompt-2": 0}
        # Each prompt is averaged over the three splits only, so normalize by the weight of
        # those splits (the weight map lists every split twice, once per prompt)
        total_weights = sum(weight_map.values()) // 2

        for prompt in task_info['prompts']:
            if prompt['prompt'] in weight_map:
                if "prompt-1" in prompt['prompt']:
                    weighted_values["prompt-1"] += weight_map[prompt['prompt']] * prompt['value']
                elif "prompt-2" in prompt['prompt']:
                    weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value']

        task_info['prompts'] = [
            {'prompt': "prompt-1", 'metric': "acc", 'value': weighted_values["prompt-1"] / total_weights, 'stderr': None},
            {'prompt': "prompt-2", 'metric': "acc", 'value': weighted_values["prompt-2"] / total_weights, 'stderr': None}]

    # Calculates task metrics for each task
    for task_info in tasks_data.values():
        calculate_task_metrics(task_info)

    # Calculates the average CPS across all tasks
    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0

    config = {
        "model_name": pretrained_model,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size
    }

    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}


"""
MAIN PROCESSING PIPELINE

This script executes the complete evaluation data processing workflow:

1. Input Sources:
   - Raw evaluation results (.out files) from: ../evalita_llm_models_output/
   - Model metadata JSON files from: ../evalita_llm_requests/

2. Processing Steps:
   - Parses evaluation metrics from .out files
   - Combines them with model metadata
   - Calculates aggregated performance statistics

3. Output:
   - Structured JSON results saved to: ../evalita_llm_results/
   - Organized by model organization/name
   - Contains complete evaluation results with metadata
"""

directory_in_path = '../evalita_llm_models_output/'
directory_in_requests_path = '../evalita_llm_requests/'
directory_out_results_path = '../evalita_llm_results/'

for filename in os.listdir(directory_in_path):
    if filename.endswith('.out'):
        file_path = os.path.join(directory_in_path, filename)
        json_output = extract_data_from_file(file_path)

        model_org_name, model_name = json_output['config']['model_name'].split('/')

        # Merges pre-collected model metadata, if a matching request file exists
        config_file_path = os.path.join(directory_in_requests_path, model_org_name, f"{model_name}.json")
        if os.path.exists(config_file_path):
            with open(config_file_path, 'r', encoding='utf-8') as config_file:
                additional_config = json.load(config_file)
            json_output['config'].update(additional_config)

        org_folder_path = os.path.join(directory_out_results_path, model_org_name)
        os.makedirs(org_folder_path, exist_ok=True)

        file_suffix = f"{json_output['config']['num_fewshot']}"
        output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")

        with open(output_file_path, 'w', newline="\n") as outfile:
            json.dump(json_output, outfile, indent=4)

        print(f"File {filename} processed and saved to {output_file_path}")
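
# Illustrative end-to-end run, assuming the model and settings from the docstring example
# (the .out filename itself is hypothetical):
#
#   input   ../evalita_llm_models_output/model-name.out
#           header: "hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1"
#   merge   ../evalita_llm_requests/model-org/model-name.json   (if present)
#   output  ../evalita_llm_results/model-org/model-name_5.json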