Spaces:

evalitahf
/

evalita_llm_leaderboard

Running

App Files Files Community

evalita_llm_leaderboard / preprocess_models_output.py

rzanoli

Small changes

5a8f6c4 11 days ago

raw

history blame contribute delete

8.72 kB

	"""
	EVALITA LLM EVALUATION PROCESSOR

	Transforms raw model evaluation outputs into structured performance reports for leaderboard integration.

	DATA PIPELINE OVERVIEW:

	1. Inputs:
	- Evaluation Results: Raw .out files from lm-eval-harness
	- Model Metadata: Pre-collected .json files from HuggingFace

	2. Output:
	- Comprehensive evaluation reports in JSON format
	- Ready for ingestion into the evaluation leaderboard

	--------------------------------------------------------------------
	INPUT SPECIFICATION

	Evaluation Results (.out format):
	hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1
	| Task | Metric | Value | Stderr |
	|---------------|--------|--------|--------|
	| main-task | acc | 0.5605 | 0.0052 |
	| - sub-task | acc | 0.4640 | 0.0088 |
	|  - prompt-1 | acc | 0.3720 | 0.0216 |

	Model Metadata (.json format):
	{
	"model": "model-org/model-name",
	"base_model": "ModelArchitecture",
	"revision": "git_commit_hash",
	"parameters": 8.03,
	"language": "en_it"
	}

	--------------------------------------------------------------------
	OUTPUT SPECIFICATION

	Evaluation Report (.json format):
	{
	"summary_metrics": {
	"average_CPS": 41.74,
	"num_tasks": 12
	},
	"model_config": {
	"identifier": "model-org/model-name",
	"architecture": "ModelArchitecture",
	"parameters": 8.03,
	"evaluation_settings": {
	"fewshot": 5,
	"batch_size": 1
	}
	},
	"task_results": {
	"task-name": {
	"average_score": 52.60,
	"best_prompt": {
	"id": "prompt-6",
	"score": 66.57
	},
	"prompt_analysis": [
	{
	"prompt_id": "prompt-1",
	"score": 37.20,
	"stderr": 0.0216
	}
	]
	}
	}
	}
	"""

	import json
	import os
	import re

	def safe_float(value):
	    """Convert *value* to ``float``, returning ``None`` when that is impossible.

	    Args:
	        value: Any object; typically a table cell string such as ``"0.5605"``.

	    Returns:
	        The converted float, or ``None`` if ``float()`` rejects the value.
	    """
	    try:
	        return float(value)
	    except (TypeError, ValueError, OverflowError):
	        # ValueError: non-numeric text; TypeError: None / containers;
	        # OverflowError: integers too large for a float. All of these are
	        # "conversion failed" from the caller's point of view.
	        return None


	def calculate_task_metrics(task_info):
	    """Fill in average accuracy, best prompt and CPS for one task, in place.

	    Prompts whose ``'value'`` is ``None`` are ignored for every statistic.
	    When no prompt has a value the task dictionary is left untouched.

	    Args:
	        task_info: Task dictionary with a ``'prompts'`` list; each prompt is a
	            dictionary with at least the keys ``'prompt'`` and ``'value'``.

	    Returns:
	        None. The results are written to ``task_info['average_accuracy']``,
	        ``task_info['best_prompt']``, ``task_info['prompt_id']`` and
	        ``task_info['CPS']``.
	    """
	    # Filter once and reuse the filtered list everywhere: running max() over
	    # the unfiltered prompts would raise TypeError as soon as a None value is
	    # compared with a number.
	    scored_prompts = [prompt for prompt in task_info['prompts'] if prompt['value'] is not None]

	    if not scored_prompts:
	        return None

	    accuracies = [prompt['value'] for prompt in scored_prompts]
	    avg_acc = sum(accuracies) / len(accuracies)

	    best_prompt_data = max(scored_prompts, key=lambda prompt: prompt['value'])
	    best_acc = best_prompt_data['value']

	    task_info['average_accuracy'] = avg_acc
	    task_info['best_prompt'] = best_acc
	    task_info['prompt_id'] = best_prompt_data['prompt']

	    # CPS: the best prompt's score, scaled down by how far it sits above the
	    # average score of all prompts (scores are on a 0-100 scale).
	    task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc
	    return None


	def extract_data_from_file(file_path):
	    """Parse one lm-eval-harness ``.out`` file into per-task prompt metrics.

	    The file is expected to contain a header line starting with
	    ``hf (pretrained=`` followed by a pipe-delimited results table whose rows
	    split into 11 fields. Rows whose first cell starts with ``" - "`` open a
	    task; rows whose first cell starts with ``"  - "`` (one level deeper) are
	    prompts of the most recently opened task. Rows with the ``acc_norm``
	    metric and rows of any other shape are ignored.

	    Args:
	        file_path: Path of the ``.out`` file to read.

	    Returns:
	        A dictionary with three keys:

	        * ``'average_CPS'``: mean CPS over the tasks that have one, or 0.
	        * ``'config'``: ``model_name``, ``num_fewshot`` (string exactly as
	          written in the file) and ``batch_size`` (int); each is ``None``
	          when the header line is absent or lacks that field.
	        * ``'tasks'``: task name -> dictionary with ``prompts``,
	          ``average_accuracy``, ``best_prompt``, ``prompt_id`` and ``CPS``.
	    """
	    # Defaults so that a file without a header line yields None values in the
	    # config instead of an UnboundLocalError at the end of the function.
	    pretrained_model = None
	    num_fewshot = None
	    batch_size = None

	    tasks_data = {}
	    current_task = None

	    with open(file_path, 'r', encoding='utf-8') as file:
	        for raw_line in file:
	            # Accept both plain pipes and markdown-escaped pipes ("\|") so the
	            # table can be split on a single, valid delimiter.
	            line = raw_line.strip().replace("\\|", "|")

	            # Skips empty lines
	            if not line:
	                continue

	            # Skips header lines
	            if line.startswith("| Tasks"):
	                continue

	            # Extracts model configuration details
	            if line.startswith("hf (pretrained="):
	                # Stop at the first "," (further model args) or ")" (end of
	                # the args list) so the name never keeps a trailing ")".
	                model_match = re.search(r"pretrained=([^,)]+)", line)
	                if model_match:
	                    pretrained_model = model_match.group(1).strip()

	                num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
	                num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None

	                batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
	                batch_size = int(batch_size_match.group(1)) if batch_size_match else None
	                continue

	            columns = line.split('|')
	            if len(columns) != 11:
	                continue

	            task_name = columns[1]
	            metric = columns[5].strip()

	            # Skips normalized accuracy metrics
	            if metric == "acc_norm":
	                continue

	            # Identifies task and prompt sections in the file
	            if task_name.startswith(" - "):
	                current_task = task_name[3:].strip()
	                tasks_data.setdefault(current_task,
	                                      {'prompts': [], 'average_accuracy': 0, 'best_prompt': None,
	                                       'prompt_id': None, 'CPS': None})

	            elif task_name.startswith("  - ") and current_task:
	                # Prompt rows are nested one level deeper than task rows,
	                # hence the 4-character prefix that is sliced off below.
	                value = safe_float(columns[7])
	                if value is None:
	                    # A prompt without a numeric score cannot be scaled or
	                    # ranked, so it is left out of the task's prompt list.
	                    continue
	                prompt_data = {'prompt': task_name[4:].strip(), 'metric': metric,
	                               'value': value * 100, 'stderr': safe_float(columns[9])}
	                tasks_data[current_task]['prompts'].append(prompt_data)

	    # Special handling for evalita NER task to calculate weighted prompt averages
	    if "evalita NER" in tasks_data:
	        task_info = tasks_data["evalita NER"]
	        weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
	                      "WN prompt-1": 2088, "WN prompt-2": 2088}

	        weighted_values = {"prompt-1": 0, "prompt-2": 0}
	        # Each prompt is averaged over its own three subsets only, so the
	        # denominator must be that prompt's weights. Dividing by the sum of
	        # the whole map (both prompts) would halve every score.
	        prompt_weights = {
	            prompt_id: sum(weight for name, weight in weight_map.items() if name.endswith(prompt_id))
	            for prompt_id in weighted_values
	        }

	        for prompt in task_info['prompts']:
	            weight = weight_map.get(prompt['prompt'])
	            if weight is None or prompt['value'] is None:
	                continue
	            for prompt_id in weighted_values:
	                if prompt_id in prompt['prompt']:
	                    weighted_values[prompt_id] += weight * prompt['value']
	                    break

	        task_info['prompts'] = [
	            {"prompt": prompt_id, "metric": "acc",
	             "value": weighted_values[prompt_id] / prompt_weights[prompt_id], 'stderr': None}
	            for prompt_id in ("prompt-1", "prompt-2")]

	    # Calculates task metrics for each task
	    for task_info in tasks_data.values():
	        calculate_task_metrics(task_info)

	    # Calculates the average CPS across all tasks
	    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
	    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0

	    config = {
	        "model_name": pretrained_model,
	        "num_fewshot": num_fewshot,
	        "batch_size": batch_size
	    }

	    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}


	"""
	MAIN PROCESSING PIPELINE

	This script executes the complete evaluation data processing workflow:

	1. Input Sources:
	- Raw evaluation results (.out files) from: ../evalita_llm_models_output/
	- Model metadata JSON files from: ../evalita_llm_requests/

	2. Processing Steps:
	- Parses evaluation metrics from .out files
	- Combines with model metadata
	- Calculates aggregated performance statistics

	3. Output:
	- Structured JSON results saved to: ../evalita_llm_results/
	- Organized by model organization/name
	- Contains complete evaluation results with metadata
	"""
	# Input and output locations, relative to the directory the script runs from.
	directory_in_path = '../evalita_llm_models_output/'
	directory_in_requests_path = '../evalita_llm_requests/'
	directory_out_results_path = '../evalita_llm_results/'

	# Sorted so that the processing order (and the log) is the same on every run;
	# os.listdir() returns entries in arbitrary order.
	for filename in sorted(os.listdir(directory_in_path)):
	    if not filename.endswith('.out'):
	        continue

	    file_path = os.path.join(directory_in_path, filename)
	    json_output = extract_data_from_file(file_path)

	    # The identifier must be exactly "organization/name": it is used to find
	    # the metadata file and to build the output path. Skip anything else
	    # instead of letting one malformed file abort the whole batch.
	    model_id = json_output['config']['model_name']
	    if not model_id or model_id.count('/') != 1:
	        print(f"File {filename} skipped: model identifier {model_id!r} is not in 'organization/name' form")
	        continue
	    model_org_name, model_name = model_id.split('/')

	    # Merge the pre-collected model metadata, when available, into the config.
	    config_file_path = os.path.join(directory_in_requests_path, model_org_name, f"{model_name}.json")

	    if os.path.exists(config_file_path):
	        with open(config_file_path, 'r', encoding='utf-8') as config_file:
	            additional_config = json.load(config_file)
	            json_output['config'].update(additional_config)

	    # Results are grouped in one folder per model organization.
	    org_folder_path = os.path.join(directory_out_results_path, model_org_name)
	    os.makedirs(org_folder_path, exist_ok=True)

	    # The few-shot setting is part of the file name, so runs of the same model
	    # with different settings do not overwrite each other.
	    file_suffix = f"{json_output['config']['num_fewshot']}"
	    output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")

	    with open(output_file_path, 'w', newline="\n", encoding='utf-8') as outfile:
	        json.dump(json_output, outfile, indent=4)

	    print(f"File {filename} processed and saved to {output_file_path}")