Add new scripts for model processing and task management
- app2.py +0 -0
- get_model_info.py +74 -0
- preprocess_models_output.py +201 -0
- src/tasks.py +37 -0
app2.py
ADDED
File without changes
get_model_info.py
ADDED
@@ -0,0 +1,74 @@
import os
import re
import json
from huggingface_hub import HfApi

# Configure the Hugging Face token (if needed)
#TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
api = HfApi()

# Folder paths
input_folder = "../evalita_llm_results/models_output/"
output_folder = "../evalita_llm_requests2/"

# Create the output folder if it does not exist
os.makedirs(output_folder, exist_ok=True)

# Regular expression to find the model name
model_pattern = re.compile(r"pretrained=([\w\-./]+)")

# Scan the files in the input folder
for filename in os.listdir(input_folder):
    file_path = os.path.join(input_folder, filename)

    # Read the file content
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Extract the model name
    match = model_pattern.search(content)
    if match:
        model_name = match.group(1)
        print(f"Processing model: {model_name}")

        try:
            # Get the model information from Hugging Face
            model_info = api.model_info(model_name)

            # Build the dictionary with the required metadata
            model_data = {
                "model": model_name,
                "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
                "revision": model_info.sha,
                "precision": "bfloat16",  # Replace with the real value if available
                #"weight_type": "Original",
                #"status": "FINISHED",
                "submitted_time": str(model_info.created_at),
                "model_type": "pretrained",
                #"likes": model_info.likes,
                #"params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
                #"license": model_info.license,
                #"private": model_info.private,
            }

            # Split model_name into two parts: before and after "/"
            if "/" in model_name:
                dir_name, file_name = model_name.split("/", 1)
            else:
                dir_name, file_name = model_name, model_name  # If there is no "/", use the same name

            # Create the folder for the first part of the model name
            model_output_folder = os.path.join(output_folder, dir_name)
            os.makedirs(model_output_folder, exist_ok=True)

            # Save the JSON file in the appropriate folder
            output_file = os.path.join(model_output_folder, f"{file_name}.json")
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(model_data, f, indent=4)

            print(f"Saved metadata for {model_name} in {output_file}")

        except Exception as e:
            print(f"Error retrieving info for {model_name}: {e}")

print("Process completed.")
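If authenticated access is needed (for example, for gated models), the commented-out token can be passed directly to the client. A minimal sketch, assuming the token is exposed through an environment variable named HF_TOKEN (the variable name is an assumption, not part of this PR):

import os
from huggingface_hub import HfApi

# Assumption: the token is provided via the HF_TOKEN environment variable;
# HfApi accepts an explicit token instead of relying on a cached login.
api = HfApi(token=os.environ.get("HF_TOKEN"))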
preprocess_models_output.py
ADDED
@@ -0,0 +1,201 @@
import json
import os
import re

def safe_float(value):
    """Convert a value to float safely. Returns None if conversion fails."""
    try:
        return float(value)
    except ValueError:
        return None


def calculate_task_metrics(task_info):
    """Calculate average accuracy, best prompt, and CPS for a task."""
    accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None]

    if not accuracies:
        return None

    task_info['average_accuracy'] = sum(accuracies) / len(accuracies)
    best_prompt_data = max(task_info['prompts'], key=lambda x: x['value'])
    task_info['best_prompt'] = best_prompt_data['value']
    task_info['prompt_id'] = best_prompt_data['prompt']

    # Calculate CPS
    avg_acc = task_info['average_accuracy']
    best_acc = task_info['best_prompt']
    task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc


def extract_data_from_file(file_path):
    """Extract task and prompt data from the given file."""
    with open(file_path, 'r') as file:
        lines = file.readlines()

    tasks_data = {}
    current_task = None

    for line in lines:
        line = line.strip()

        # Skip irrelevant lines
        if not line:
            continue

        if line.startswith("| Tasks"):
            continue

        if line.startswith("hf (pretrained="):
            # Extract the part after "pretrained="
            start = line.find("pretrained=") + len("pretrained=")
            end = line.find(",", start)  # Find the next comma
            pretrained_model = line[start:end]

            # Extract num_fewshot
            num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
            num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None

            # Extract batch_size
            batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
            batch_size = int(batch_size_match.group(1)) if batch_size_match else None

            continue

        columns = line.split('|')
        if len(columns) != 11:
            continue

        task_name = columns[1]
        metric = columns[5].strip()
        value = safe_float(columns[7])
        stderr = safe_float(columns[9])

        if metric == "acc_norm":
            continue

        # Identify tasks and prompts
        if task_name.startswith(" - "):
            task_name = task_name[3:].strip()
            current_task = task_name
            tasks_data.setdefault(current_task,
                                  {'prompts': [], 'average_accuracy': 0, 'best_prompt': None, 'prompt_id': None,
                                   'CPS': None})

        elif task_name.startswith("  - ") and current_task:
            prompt_name = task_name[4:].strip()
            # Guard against unparsable values so downstream averaging can skip them
            prompt_data = {'prompt': prompt_name, 'metric': metric,
                           'value': value * 100 if value is not None else None,
                           'stderr': stderr}
            tasks_data[current_task]['prompts'].append(prompt_data)

    # Special handling for evalita NER
    if "evalita NER" in tasks_data:
        task_info = tasks_data["evalita NER"]
        weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
                      "WN prompt-1": 2088, "WN prompt-2": 2088}

        weighted_values = {"prompt-1": 0, "prompt-2": 0}
        total_weights = sum(weight_map.values())

        for prompt in task_info['prompts']:
            if prompt['prompt'] in weight_map:
                if "prompt-1" in prompt['prompt']:
                    weighted_values["prompt-1"] += weight_map[prompt['prompt']] * prompt['value']
                elif "prompt-2" in prompt['prompt']:
                    weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value']

        task_info['prompts'] = [
            {"prompt": "prompt-1", "metric": "acc", "value": weighted_values["prompt-1"] / total_weights,
             'stderr': None},
            {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights,
             'stderr': None}]

    # Calculate metrics for each task
    for task_info in tasks_data.values():
        calculate_task_metrics(task_info)

    # Calculate average CPS
    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0

    config = {
        "model_name": pretrained_model,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size
    }

    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}


# Example usage
#file_path = '../evalita_llm_results/models_output/slurm-7769.out'
#json_output = extract_data_from_file(file_path)
#print(json_output)


# Directories for input .out files and output JSON files
directory_in_path = '../evalita_llm_models_output/'
directory_out_results_path = '../evalita_llm_results/'
directory_out_requests_path = '../evalita_llm_requests/'

# Iterate over the files in the input directory
for filename in os.listdir(directory_in_path):
    if filename.endswith('.out'):
        # Build the full file path
        file_path = os.path.join(directory_in_path, filename)

        # Run extract_data_from_file
        json_output = extract_data_from_file(file_path)

        # Split model_name into model_org_name and model_name
        model_org_name, model_name = json_output['config']['model_name'].split('/')

        # Path of the JSON configuration file under the requests directory
        config_file_path = os.path.join(directory_out_requests_path, model_org_name, f"{model_name}.json")

        # If the file exists, load it and update the config dictionary
        if os.path.exists(config_file_path):
            with open(config_file_path, 'r', encoding='utf-8') as config_file:
                additional_config = json.load(config_file)

            # Update the configuration with the new data
            json_output['config'].update(additional_config)

        # Create the folder path for model_org_name
        org_folder_path = os.path.join(directory_out_results_path, model_org_name)
        os.makedirs(org_folder_path, exist_ok=True)  # Create the folder if it does not exist

        # Build the full path of the output JSON file
        file_suffix = f"{json_output['config']['num_fewshot']}"
        output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")

        # Save the JSON with Linux-compatible line endings
        with open(output_file_path, 'w', newline="\n") as outfile:
            json.dump(json_output, outfile, indent=4)

        # Print the result
        print(f"File {filename} processed and saved to {output_file_path}")
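For reference, the Combined Performance Score computed by calculate_task_metrics above can be checked by hand. A self-contained sketch with made-up prompt accuracies (the numbers are illustrative only):

# Illustrative only: invented prompt accuracies (in percent) for one task.
accuracies = [55.0, 60.0, 65.0]

prompt_average = sum(accuracies) / len(accuracies)   # 60.0
best_prompt = max(accuracies)                        # 65.0

# Same formula as in calculate_task_metrics:
cps = (1 - (best_prompt - prompt_average) / 100) * best_prompt
print(cps)  # (1 - 5/100) * 65.0 = 61.75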
src/tasks.py
ADDED
@@ -0,0 +1,37 @@
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    # metric: str
    accuracy: str
    col_name: str


NUM_FEWSHOT = 0  # Change with your few-shot setting
# ---------------------------------------------------

# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
Evalita-LLM is a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding issues of translation into Italian and potential cultural biases; (ii) in addition to well-established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, thus mitigating the model's sensitivity to specific prompts and allowing a fairer and more objective evaluation.
"""

# Which evaluations are you running? How can people reproduce what you have?
TE_DESCRIPTION = """### Textual Entailment (TE)
The input consists of two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text.

| # | Prompt | Answer Choices |
|-----|--------|----------------|
| 1 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
| 2 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
| 3 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
| 4 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
| 5 | Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
| 6 | Devi risolvere un compito di inferenza semantica. Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |

Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above).
"""
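The Task dataclass is presumably populated elsewhere in the Space with one entry per benchmark task. A hypothetical instance for the Textual Entailment task described above, assuming src/tasks.py is importable from the repository root; the benchmark and column identifiers are placeholders, not values from this PR:

# Hypothetical usage: the identifiers below are placeholders for illustration.
from src.tasks import Task

te_task = Task(benchmark="evalita-te", accuracy="acc", col_name="TE")
print(te_task)  # Task(benchmark='evalita-te', accuracy='acc', col_name='TE')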