# Source: Hugging Face Space "evalitahf/evalita_llm_leaderboard" (status: Running; file size: 8,834 bytes)

"""

EVALITA LLM EVALUATION PROCESSOR



Transforms raw model evaluation outputs into structured performance reports for leaderboard integration.



DATA PIPELINE OVERVIEW:



1. Inputs:

   - Evaluation Results: Raw .out files from lm-eval-harness

   - Model Metadata: Pre-collected .json files from HuggingFace



2. Output:

   - Comprehensive evaluation reports in JSON format

   - Ready for ingestion into the evaluation leaderboard



--------------------------------------------------------------------

INPUT SPECIFICATION



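Evaluation Results (.out format; the table below is simplified: the parser expects nine pipe-delimited columns per row and reads the metric, value and stderr from the 5th, 7th and 9th):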

   hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1

   | Task          | Metric | Value  | Stderr |

   |---------------|--------|--------|--------|

   | main-task     | acc    | 0.5605 | 0.0052 |

   | - sub-task    | acc    | 0.4640 | 0.0088 |

   |   - prompt-1  | acc    | 0.3720 | 0.0216 |



Model Metadata (.json format):

   {

     "model": "model-org/model-name",

     "base_model": "ModelArchitecture",

     "revision": "git_commit_hash",

     "parameters": 8.03,

     "language": "en_it"

   }



--------------------------------------------------------------------

OUTPUT SPECIFICATION



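Evaluation Report (.json format; illustrative structure: the file written by this script uses the top-level keys 'average_CPS', 'config' and 'tasks'):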

   {

     "summary_metrics": {

       "average_CPS": 41.74,

       "num_tasks": 12

     },

     "model_config": {

       "identifier": "model-org/model-name",

       "architecture": "ModelArchitecture",

       "parameters": 8.03,

       "evaluation_settings": {

         "fewshot": 5,

         "batch_size": 1

       }

     },

     "task_results": {

       "task-name": {

         "average_score": 52.60,

         "best_prompt": {

           "id": "prompt-6",

           "score": 66.57

         },

         "prompt_analysis": [

           {

             "prompt_id": "prompt-1",

             "score": 37.20,

             "stderr": 0.0216

           }

         ]

       }

     }

   }

"""

import json
import os
import re
import statistics

def safe_float(value):
    """Convert ``value`` to ``float``, returning ``None`` when that is not possible.

    Args:
        value: Anything ``float()`` accepts, e.g. a numeric string such as a
            table cell (surrounding whitespace is allowed).

    Returns:
        The converted float, or ``None`` when ``value`` is non-numeric text or
        a type ``float()`` rejects (for example ``None``).
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: non-numeric text; TypeError: unsupported type such as None.
        return None


def calculate_task_metrics(task_info):
    """Compute aggregate prompt statistics for one task, storing them in ``task_info``.

    Prompts whose ``'value'`` is ``None`` are ignored. When at least one prompt
    has a value, these keys are written to ``task_info``:

    - ``'average_accuracy'``: mean of the prompt values.
    - ``'std_accuracy'``: sample standard deviation (``0.0`` for a single value).
    - ``'best_prompt'``: the highest prompt value.
    - ``'prompt_id'``: the ``'prompt'`` name of the best-scoring prompt
      (the first one in list order when several tie).
    - ``'CPS'``: ``(1 - (best - average) / 100) * best``.

    Args:
        task_info: Dict with a ``'prompts'`` list; each entry is a dict holding
            at least ``'prompt'`` (name) and ``'value'`` (number or ``None``).
            The ``/ 100`` in the CPS formula assumes values on a 0-100 scale.

    Returns:
        Always ``None``. ``task_info`` is updated in place, and is left
        untouched when no prompt has a usable value.
    """
    # Rank only prompts that actually have a value: comparing None with a
    # float inside max() would raise TypeError.
    scored_prompts = [prompt for prompt in task_info['prompts'] if prompt['value'] is not None]

    if not scored_prompts:
        return None

    accuracies = [prompt['value'] for prompt in scored_prompts]
    average = sum(accuracies) / len(accuracies)
    best_prompt_data = max(scored_prompts, key=lambda prompt: prompt['value'])
    best = best_prompt_data['value']

    task_info['average_accuracy'] = average
    # statistics.stdev needs at least two data points.
    task_info['std_accuracy'] = statistics.stdev(accuracies) if len(accuracies) > 1 else 0.0
    task_info['best_prompt'] = best
    task_info['prompt_id'] = best_prompt_data['prompt']

    # CPS scales the best score down by its gap to the average score.
    task_info['CPS'] = (1 - (best - average) / 100) * best
    return None


def _parse_run_header(line):
    """Return ``(model_name, num_fewshot, batch_size)`` from an ``hf (pretrained=...`` line."""
    # Stop at the first ',' or ')' so the identifier is clean whether or not
    # further model arguments follow it inside the parentheses.
    model_match = re.search(r"pretrained=([^,)]*)", line)
    fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
    batch_match = re.search(r"batch_size:\s*(\d+)", line)
    return (
        model_match.group(1).strip() if model_match else None,
        # Kept as text on purpose: it is used verbatim in the output file name.
        fewshot_match.group(1) if fewshot_match else None,
        int(batch_match.group(1)) if batch_match else None,
    )


def _merge_ner_prompts(task_info):
    """Collapse the per-dataset NER rows into one weighted-average row per prompt."""
    weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
                  "WN prompt-1": 2088, "WN prompt-2": 2088}

    weighted_sums = {"prompt-1": 0.0, "prompt-2": 0.0}
    weight_totals = {"prompt-1": 0, "prompt-2": 0}

    for prompt in task_info['prompts']:
        weight = weight_map.get(prompt['prompt'])
        if weight is None or prompt['value'] is None:
            continue
        prompt_id = "prompt-1" if "prompt-1" in prompt['prompt'] else "prompt-2"
        weighted_sums[prompt_id] += weight * prompt['value']
        weight_totals[prompt_id] += weight

    # Normalise each prompt by the weights that actually contributed to it.
    # Dividing by the sum of *all* six weights would halve every average,
    # because only three of them belong to any one prompt.
    task_info['prompts'] = [
        {"prompt": prompt_id, "metric": "acc", "value": weighted_sums[prompt_id] / weight_totals[prompt_id],
         'stderr': None}
        for prompt_id in ("prompt-1", "prompt-2")
        if weight_totals[prompt_id]
    ]


def extract_data_from_file(file_path):
    """Parse one evaluation ``.out`` file into run configuration and per-task results.

    The file is scanned line by line:

    - A line starting with ``hf (pretrained=`` supplies the model identifier,
      ``num_fewshot`` and ``batch_size``.
    - Table rows are lines that split on ``|`` into exactly 11 fields; the task
      or prompt name, metric, value and stderr are read from fields 1, 5, 7
      and 9. Rows whose metric is ``acc_norm`` are ignored.
    - A name starting with ``" - "`` opens a task; a name starting with
      ``"  - "`` is a prompt of the most recently opened task. Prompt values
      are multiplied by 100.

    The ``"evalita NER"`` task is post-processed so that its per-dataset prompt
    rows become one weighted-average row per prompt. Finally the metrics of
    every task are computed and the CPS values averaged.

    Args:
        file_path: Path of the ``.out`` file to read (decoded as UTF-8;
            undecodable bytes are replaced rather than raising).

    Returns:
        A dict with:
        - ``'average_CPS'``: mean CPS over tasks that have one, or ``0``.
        - ``'config'``: ``{'model_name', 'num_fewshot', 'batch_size'}``; each
          is ``None`` when the file lacks the corresponding information.
        - ``'tasks'``: mapping of task name to its prompt list and metrics.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Defaults so that a file without a configuration line still yields a
    # result instead of failing on unbound names.
    pretrained_model = None
    num_fewshot = None
    batch_size = None

    tasks_data = {}
    current_task = None

    # The parsed cells are plain ASCII; errors='replace' keeps stray bytes in
    # the ignored cells from aborting the parse.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        for raw_line in file:
            line = raw_line.strip()

            # Skips empty lines and the table header
            if not line or line.startswith("|         Tasks"):
                continue

            # Extracts model configuration details
            if line.startswith("hf (pretrained="):
                pretrained_model, num_fewshot, batch_size = _parse_run_header(line)
                continue

            columns = line.split('|')
            if len(columns) != 11:
                continue

            task_name = columns[1]
            metric = columns[5].strip()
            value = safe_float(columns[7])
            stderr = safe_float(columns[9])

            # Skips normalized accuracy metrics
            if metric == "acc_norm":
                continue

            # The indentation of the name cell tells tasks from prompts.
            if task_name.startswith(" - "):
                current_task = task_name[3:].strip()
                tasks_data.setdefault(current_task,
                                      {'prompts': [], 'average_accuracy': 0, 'best_prompt': None,
                                       'prompt_id': None, 'CPS': None})

            elif task_name.startswith("  - ") and current_task:
                prompt_data = {'prompt': task_name[4:].strip(), 'metric': metric,
                               # An unparsable value stays None instead of failing on None * 100.
                               'value': value * 100 if value is not None else None,
                               'stderr': stderr}
                tasks_data[current_task]['prompts'].append(prompt_data)

    # Special handling for evalita NER task to calculate weighted prompt averages
    if "evalita NER" in tasks_data:
        _merge_ner_prompts(tasks_data["evalita NER"])

    # Calculates task metrics for each task
    for task_info in tasks_data.values():
        calculate_task_metrics(task_info)

    # Calculates the average CPS across all tasks that have one
    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0

    config = {
        "model_name": pretrained_model,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size
    }

    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}


"""

MAIN PROCESSING PIPELINE



This script executes the complete evaluation data processing workflow:



1. Input Sources:

   - Raw evaluation results (.out files) from: ../evalita_llm_models_output/

   - Model metadata JSON files from: ../evalita_llm_requests/



2. Processing Steps:

   - Parses evaluation metrics from .out files

   - Combines with model metadata

   - Calculates aggregated performance statistics



3. Output:

   - Structured JSON results saved to: ../evalita_llm_results/

   - Organized by model organization/name

   - Contains complete evaluation results with metadata

"""
# Input/output locations, relative to the current working directory.
directory_in_path = '../evalita_llm_models_output/'
directory_in_requests_path = '../evalita_llm_requests/'
directory_out_results_path = '../evalita_llm_results/'

# sorted() gives a deterministic processing order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(directory_in_path)):
    if not filename.endswith('.out'):
        continue

    file_path = os.path.join(directory_in_path, filename)
    json_output = extract_data_from_file(file_path)

    # The identifier must look like "org/name": both parts become path
    # components below. Skip a malformed file instead of aborting the batch.
    model_identifier = json_output['config']['model_name'] or ''
    model_org_name, separator, model_name = model_identifier.partition('/')
    if not (model_org_name and separator and model_name) or '/' in model_name:
        print(f"Skipping {filename}: expected a model identifier like 'org/name', got {model_identifier!r}")
        continue

    # Merge the pre-collected model metadata, when present, into the parsed config.
    config_file_path = os.path.join(directory_in_requests_path, model_org_name, f"{model_name}.json")

    if os.path.exists(config_file_path):
        with open(config_file_path, 'r', encoding='utf-8') as config_file:
            additional_config = json.load(config_file)
        json_output['config'].update(additional_config)

    org_folder_path = os.path.join(directory_out_results_path, model_org_name)
    os.makedirs(org_folder_path, exist_ok=True)

    # One report per model and few-shot setting.
    file_suffix = f"{json_output['config']['num_fewshot']}"
    output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")

    with open(output_file_path, 'w', encoding='utf-8', newline="\n") as outfile:
        json.dump(json_output, outfile, indent=4)

    print(f"File {filename} processed and saved to {output_file_path}")