rzanoli committed
Commit dbd3b18 · 1 Parent(s): 12c62aa

Small changes
app.py CHANGED
@@ -14,16 +14,16 @@ from src.submission.submit import add_new_eval
 
 # Define task metadata (icons, names, descriptions)
 TASK_METADATA = {
-    "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": "Identify logical relationships between two text segments."},
-    "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": "Classify the sentiment (positive, negative, neutral) of a text."},
-    "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": "Detect hate speech in a text."},
-    "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": "Classify whether a clinical statement pertains to an admission test."},
-    "WIC": {"icon": "🔀", "name": "Word in Context", "tooltip": "Identify words in context and their meaning."},
-    "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": "Answer frequently asked questions based on given text."},
-    "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": "Identify alternative words in a given context."},
-    "SU": {"icon": "📝", "name": "Summarization", "tooltip": "Summarize long text into a shorter version."},
-    "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": "Identify named entities (e.g., persons, locations, organizations) in text."},
-    "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": "Extract and link laboratory test results to the respective tests in clinical narratives."},
+    "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
+    "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""},
+    "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
+    "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": ""},
+    "WIC": {"icon": "🔀", "name": "Word in Context", "tooltip": ""},
+    "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""},
+    "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""},
+    "SU": {"icon": "📝", "name": "Summarization", "tooltip": ""},
+    "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
+    "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
 }
 
 def restart_space():
@@ -47,8 +47,8 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="Few-Shot Learning (FS)"),
-            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
+            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
+            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -82,6 +82,7 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # Main leaderboard tab
         with gr.TabItem("🏅 EVALITA-LLM Benchmark"):
+
            leaderboard = init_leaderboard(
                LEADERBOARD_DF,
                default_selection=['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
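As a side note, here is a minimal sketch of how TASK_METADATA entries can be turned into the short, icon-prefixed column labels the leaderboard displays; make_label is a hypothetical helper for illustration, not part of the repository:

```python
# Illustrative only: derive display labels such as "📊 TE" from TASK_METADATA.
TASK_METADATA = {
    "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
    "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""},
}

def make_label(code: str) -> str:
    """Hypothetical helper: prefix a task code with its icon."""
    return f"{TASK_METADATA[code]['icon']} {code}"

print([make_label(code) for code in TASK_METADATA])  # ['📊 TE', '😃 SA']
```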
get_model_info.py CHANGED
@@ -1,87 +1,92 @@
+# Reads model output files (including accuracy values) produced by lm-eval-harness,
+# extracts model names, downloads their characteristics from HuggingFace, and saves metadata
+# (such as parameter count and pre-training status) to model-specific JSON files.
 import os
 import re
 import json
 from huggingface_hub import HfApi
 
-# Configura il token di Hugging Face (se necessario)
-#TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
+# Configures the Hugging Face token (if needed)
+# TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
 api = HfApi()
 
-# Percorsi delle cartelle
+# Directory paths
+# input_folder: Directory containing the output files of the lm-eval-harness library, including model accuracy metrics.
 input_folder = "../evalita_llm_models_output/"
+# output_folder: Directory where JSON files with model characteristics will be saved.
 output_folder = "../evalita_llm_requests/"
 
-# Creazione della cartella di output se non esiste
+# Creates the output folder if it doesn't exist
 os.makedirs(output_folder, exist_ok=True)
 
-# Espressione regolare per trovare il nome del modello
+# Regular expression to find the model name
 model_pattern = re.compile(r"pretrained=([\w\-./]+)")
 
-# Scansiona i file nella cartella di input
+# Scans files in the input folder
 for filename in os.listdir(input_folder):
-    file_path = os.path.join(input_folder, filename)
-
-    # Leggi il contenuto del file
-    with open(file_path, "r", encoding="utf-8") as f:
-        content = f.read()
-
-    # Estrai il nome del modello
-    match = model_pattern.search(content)
-    if match:
-        model_name = match.group(1)
-        print(f"Processing model: {model_name}")
-
-        try:
-            # Ottieni le informazioni del modello da Hugging Face
-            model_info = api.model_info(model_name)
-
-            # Calcola il numero di parametri in miliardi, se disponibile
-            num_params = None
-            if model_info.safetensors and "BF16" in model_info.safetensors.parameters:
-                num_params = model_info.safetensors.parameters["BF16"] / 1e9  # Converti in miliardi
-
-            # Estrai la lingua (può essere una lista, quindi prendiamo la prima se esiste)
-            # Estrai e concatena i linguaggi
-            language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else ""
-
-            print(model_info)
-
-            # Costruisci il dizionario con i metadati richiesti
-            model_data = {
-                "model": model_name,
-                "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
-                "revision": model_info.sha,
-                #"precision": "bfloat16",  # Se disponibile, sostituire con un valore reale
-                #"weight_type": "Original",
-                #"status": "FINISHED",
-                "submitted_time": str(model_info.created_at),
-                #"model_type": "pretrained",
-                #"likes": model_info.likes,
-                #"params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
-                #"license": model_info.license,
-                #"private": model_info.private,
-                "num_params_billion": num_params,  # Numero di parametri in miliardi
-                "language": language,  # Lingua estratta
-            }
-
-            # Separare il model_name in due parti: prima e dopo "/"
-            if "/" in model_name:
-                dir_name, file_name = model_name.split("/", 1)
-            else:
-                dir_name, file_name = model_name, model_name  # Se non c'è "/", usa lo stesso nome
-
-            # Creare la cartella per la prima parte del nome del modello
-            model_output_folder = os.path.join(output_folder, dir_name)
-            os.makedirs(model_output_folder, exist_ok=True)
-
-            # Salvare il file JSON nella cartella appropriata
-            output_file = os.path.join(model_output_folder, f"{file_name}.json")
-            with open(output_file, "w", encoding="utf-8") as f:
-                json.dump(model_data, f, indent=4)
-
-            print(f"Saved metadata for {model_name} in {output_file}")
-
-        except Exception as e:
-            print(f"Error retrieving info for {model_name}: {e}")
-
-print("Process completed!")
+    if filename.endswith('.out'):
+        file_path = os.path.join(input_folder, filename)
+
+        # Reads the file content
+        with open(file_path, "r", encoding="utf-8") as f:
+            content = f.read()
+
+        # Extracts the model name
+        match = model_pattern.search(content)
+        if match:
+            model_name = match.group(1)
+            print(f"Processing model: {model_name}")
+
+            try:
+                # Retrieves model information from Hugging Face
+                model_info = api.model_info(model_name)
+
+                # Calculates the number of parameters in billions, if available
+                num_params = None
+                if model_info.safetensors and "BF16" in model_info.safetensors.parameters:
+                    num_params = model_info.safetensors.parameters["BF16"] / 1e9  # Convert to billions
+
+                # Extracts and concatenates languages
+                language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else ""
+
+                print(model_info)
+
+                # Builds the dictionary with required metadata
+                model_data = {
+                    "model": model_name,
+                    "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
+                    "revision": model_info.sha,
+                    # "precision": "bfloat16",  # If available, replace with real value
+                    # "weight_type": "Original",
+                    # "status": "FINISHED",
+                    "submitted_time": str(model_info.created_at),
+                    # "model_type": "pretrained",
+                    # "likes": model_info.likes,
+                    # "params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
+                    # "license": model_info.license,
+                    # "private": model_info.private,
+                    "num_params_billion": num_params,  # Number of parameters in billions
+                    "language": language,  # Extracted language
+                }
+
+                # Separates the model_name into two parts: directory name and file name
+                if "/" in model_name:
+                    dir_name, file_name = model_name.split("/", 1)
+                else:
+                    dir_name, file_name = model_name, model_name  # If no "/", use the same name
+
+                # Creates the folder for saving the produced json files
+                model_output_folder = os.path.join(output_folder, dir_name)
+                os.makedirs(model_output_folder, exist_ok=True)
+
+                # Saves the JSON file in the appropriate folder
+                output_file = os.path.join(model_output_folder, f"{file_name}.json")
+                with open(output_file, "w", encoding="utf-8") as f:
+                    json.dump(model_data, f, indent=4)
+
+                print(f"Saved metadata for {model_name} in {output_file}")
+
+            except Exception as e:
+                print(f"Error retrieving info for {model_name}: {e}")
+
+print("Process finished!")
preprocess_models_output.py CHANGED
@@ -3,7 +3,7 @@ import os
 import re
 
 def safe_float(value):
-    """Convert a value to float safely. Returns None if conversion fails."""
+    """Safely converts a value to float, returning None if the conversion fails."""
     try:
         return float(value)
     except ValueError:
@@ -11,7 +11,7 @@ def safe_float(value):
 
 
 def calculate_task_metrics(task_info):
-    """Calculate average accuracy, best prompt, and CPS for a task."""
+    """Calculates average accuracy, best prompt accuracy, and CPS for a given task."""
     accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None]
 
     if not accuracies:
@@ -29,7 +29,7 @@ def calculate_task_metrics(task_info):
 
 
 def extract_data_from_file(file_path):
-    """Extract task and prompt data from the given file."""
+    """Extracts task and prompt data from a specified file."""
     with open(file_path, 'r') as file:
         lines = file.readlines()
 
@@ -39,27 +39,23 @@ def extract_data_from_file(file_path):
     for line in lines:
         line = line.strip()
 
-        # Skip irrelevant lines
+        # Skips empty lines
         if not line:
             continue
 
-
+        # Skips header lines
        if line.startswith("| Tasks"):
            continue
 
+        # Extracts model configuration details
        if line.startswith("hf (pretrained="):
-
-            # Estrai la parte dopo "pretrained="
            start = line.find("pretrained=") + len("pretrained=")
-            end = line.find(",", start)  # Trova la virgola successiva
-            # Estrai la stringa desiderata
+            end = line.find(",", start)
            pretrained_model = line[start:end]
 
-            # Estrarre num_fewshot
            num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
            num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None
 
-            # Estrarre batch_size
            batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
            batch_size = int(batch_size_match.group(1)) if batch_size_match else None
 
@@ -74,10 +70,11 @@ def extract_data_from_file(file_path):
        value = safe_float(columns[7])
        stderr = safe_float(columns[9])
 
+        # Skips normalized accuracy metrics
        if metric == "acc_norm":
            continue
 
-        # Identify task and prompts
+        # Identifies task and prompt sections in the file
        if task_name.startswith(" - "):
            task_name = task_name[3:].strip()
            current_task = task_name
@@ -91,7 +88,7 @@ def extract_data_from_file(file_path):
                           'stderr': stderr}
            tasks_data[current_task]['prompts'].append(prompt_data)
 
-    # Special handling for evalita NER
+    # Special handling for evalita NER task to calculate weighted prompt averages
    if "evalita NER" in tasks_data:
        task_info = tasks_data["evalita NER"]
        weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
@@ -113,11 +110,11 @@ def extract_data_from_file(file_path):
            {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights,
             'stderr': None}]
 
-    # Calculate metrics for each task
+    # Calculates task metrics for each task
    for task_info in tasks_data.values():
        calculate_task_metrics(task_info)
 
-    # Calculate average CPS
+    # Calculates the average CPS across all tasks
    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0
 
@@ -129,73 +126,36 @@ def extract_data_from_file(file_path):
 
    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
 
-
-# Example usage
-#file_path = '../evalita_llm_results/models_output/slurm-7769.out'
-#json_output = extract_data_from_file(file_path)
-#print(json_output)
-
-
-# Directory da cui leggere i file .out
+# Main script: processes .out files, extracts data, and saves JSON results.
+# Reads .out files from directory_in_path, parses data including model config and task metrics,
+# and saves results as JSON files in directory_out_results_path, merging config from directory_out_requests_path if available.
 directory_in_path = '../evalita_llm_models_output/'
+directory_in_requests_path = '../evalita_llm_requests/'
 directory_out_results_path = '../evalita_llm_results/'
-directory_out_requests_path = '../evalita_llm_requests/'
 
-# Itera sui file nella directory
 for filename in os.listdir(directory_in_path):
    if filename.endswith('.out'):
-        # Costruisci il percorso completo del file
        file_path = os.path.join(directory_in_path, filename)
-
-        # Esegui la funzione extract_data_from_file
        json_output = extract_data_from_file(file_path)
 
-        # Estrai model_org_name e model_name da model_name
        model_org_name, model_name = json_output['config']['model_name'].split('/')
 
-        # Percorso del file JSON di configurazione in ../evalita_llm_requests2/
-        config_file_path = os.path.join(directory_out_requests_path, model_org_name, f"{model_name}.json")
-
-        # Se il file esiste, caricalo e aggiorna il dizionario config
+        config_file_path = os.path.join(directory_in_requests_path, model_org_name, f"{model_name}.json")
+
        if os.path.exists(config_file_path):
            with open(config_file_path, 'r', encoding='utf-8') as config_file:
                additional_config = json.load(config_file)
-
-            # Aggiorna la configurazione con i nuovi dati
            json_output['config'].update(additional_config)
 
-        # Crea il percorso della cartella per model_org_name
        org_folder_path = os.path.join(directory_out_results_path, model_org_name)
-        os.makedirs(org_folder_path, exist_ok=True)  # Crea la cartella se non esiste
+        os.makedirs(org_folder_path, exist_ok=True)
 
-        # Crea il percorso completo del file JSON
        file_suffix = f"{json_output['config']['num_fewshot']}"
        output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")
 
-        # Salva il JSON in un file con ritorni a capo compatibili con Linux
        with open(output_file_path, 'w', newline="\n") as outfile:
            json.dump(json_output, outfile, indent=4)
 
-        # Stampa il risultato
-        print(f"File {filename} elaborato e salvato in {output_file_path}")
+        print(f"File {filename} processed and saved to {output_file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
preprocess_models_output_old.py ADDED
@@ -0,0 +1,201 @@
+import json
+import os
+import re
+
+def safe_float(value):
+    """Convert a value to float safely. Returns None if conversion fails."""
+    try:
+        return float(value)
+    except ValueError:
+        return None
+
+
+def calculate_task_metrics(task_info):
+    """Calculate average accuracy, best prompt, and CPS for a task."""
+    accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None]
+
+    if not accuracies:
+        return None
+
+    task_info['average_accuracy'] = sum(accuracies) / len(accuracies)
+    best_prompt_data = max(task_info['prompts'], key=lambda x: x['value'])
+    task_info['best_prompt'] = best_prompt_data['value']
+    task_info['prompt_id'] = best_prompt_data['prompt']
+
+    # Calculate CPS
+    avg_acc = task_info['average_accuracy']
+    best_acc = task_info['best_prompt']
+    task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc
+
+
+def extract_data_from_file(file_path):
+    """Extract task and prompt data from the given file."""
+    with open(file_path, 'r') as file:
+        lines = file.readlines()
+
+    tasks_data = {}
+    current_task = None
+
+    for line in lines:
+        line = line.strip()
+
+        # Skip irrelevant lines
+        if not line:
+            continue
+
+        if line.startswith("| Tasks"):
+            continue
+
+        if line.startswith("hf (pretrained="):
+            # Estrai la parte dopo "pretrained="
+            start = line.find("pretrained=") + len("pretrained=")
+            end = line.find(",", start)  # Trova la virgola successiva
+            # Estrai la stringa desiderata
+            pretrained_model = line[start:end]
+
+            # Estrarre num_fewshot
+            num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
+            num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None
+
+            # Estrarre batch_size
+            batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
+            batch_size = int(batch_size_match.group(1)) if batch_size_match else None
+
+            continue
+
+        columns = line.split('|')
+        if len(columns) != 11:
+            continue
+
+        task_name = columns[1]
+        metric = columns[5].strip()
+        value = safe_float(columns[7])
+        stderr = safe_float(columns[9])
+
+        if metric == "acc_norm":
+            continue
+
+        # Identify task and prompts
+        if task_name.startswith(" - "):
+            task_name = task_name[3:].strip()
+            current_task = task_name
+            tasks_data.setdefault(current_task,
+                                  {'prompts': [], 'average_accuracy': 0, 'best_prompt': None, 'prompt_id': None,
+                                   'CPS': None})
+
+        elif task_name.startswith("  - ") and current_task:
+            prompt_name = task_name[4:].strip()
+            prompt_data = {'prompt': prompt_name, 'metric': metric, 'value': value * 100,
+                           'stderr': stderr}
+            tasks_data[current_task]['prompts'].append(prompt_data)
+
+    # Special handling for evalita NER
+    if "evalita NER" in tasks_data:
+        task_info = tasks_data["evalita NER"]
+        weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
+                      "WN prompt-1": 2088, "WN prompt-2": 2088}
+
+        weighted_values = {"prompt-1": 0, "prompt-2": 0}
+        total_weights = sum(weight_map.values())
+
+        for prompt in task_info['prompts']:
+            if prompt['prompt'] in weight_map:
+                if "prompt-1" in prompt['prompt']:
+                    weighted_values["prompt-1"] += weight_map[prompt['prompt']] * prompt['value']
+                elif "prompt-2" in prompt['prompt']:
+                    weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value']
+
+        task_info['prompts'] = [
+            {"prompt": "prompt-1", "metric": "acc", "value": weighted_values["prompt-1"] / total_weights,
+             'stderr': None},
+            {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights,
+             'stderr': None}]
+
+    # Calculate metrics for each task
+    for task_info in tasks_data.values():
+        calculate_task_metrics(task_info)
+
+    # Calculate average CPS
+    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
+    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0
+
+    config = {
+        "model_name": pretrained_model,
+        "num_fewshot": num_fewshot,
+        "batch_size": batch_size
+    }
+
+    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
+
+
+# Example usage
+#file_path = '../evalita_llm_results/models_output/slurm-7769.out'
+#json_output = extract_data_from_file(file_path)
+#print(json_output)
+
+
+# Directory da cui leggere i file .out
+directory_in_path = '../evalita_llm_models_output/'
+directory_out_results_path = '../evalita_llm_results/'
+directory_out_requests_path = '../evalita_llm_requests/'
+
+# Itera sui file nella directory
+for filename in os.listdir(directory_in_path):
+    if filename.endswith('.out'):
+        # Costruisci il percorso completo del file
+        file_path = os.path.join(directory_in_path, filename)
+
+        # Esegui la funzione extract_data_from_file
+        json_output = extract_data_from_file(file_path)
+
+        # Estrai model_org_name e model_name da model_name
+        model_org_name, model_name = json_output['config']['model_name'].split('/')
+
+        # Percorso del file JSON di configurazione in ../evalita_llm_requests2/
+        config_file_path = os.path.join(directory_out_requests_path, model_org_name, f"{model_name}.json")
+
+        # Se il file esiste, caricalo e aggiorna il dizionario config
+        if os.path.exists(config_file_path):
+            with open(config_file_path, 'r', encoding='utf-8') as config_file:
+                additional_config = json.load(config_file)
+
+            # Aggiorna la configurazione con i nuovi dati
+            json_output['config'].update(additional_config)
+
+        # Crea il percorso della cartella per model_org_name
+        org_folder_path = os.path.join(directory_out_results_path, model_org_name)
+        os.makedirs(org_folder_path, exist_ok=True)  # Crea la cartella se non esiste
+
+        # Crea il percorso completo del file JSON
+        file_suffix = f"{json_output['config']['num_fewshot']}"
+        output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")
+
+        # Salva il JSON in un file con ritorni a capo compatibili con Linux
+        with open(output_file_path, 'w', newline="\n") as outfile:
+            json.dump(json_output, outfile, indent=4)
+
+        # Stampa il risultato
+        print(f"File {filename} elaborato e salvato in {output_file_path}")
run_instructions.txt ADDED
@@ -0,0 +1,42 @@
+Model Evaluation and Leaderboard
+
+1) Model Evaluation
+Before integrating a model into the leaderboard, it must first be evaluated using the lm-eval-harness library in both zero-shot and 5-shot configurations.
+
+This can be done with the following command:
+
+lm_eval --model hf --model_args pretrained=google/gemma-3-12b-it \
+  --tasks evalita-mp --device cuda:0 --batch_size 1 --trust_remote_code \
+  --output_path model_output --num_fewshot 5
+
+The output generated by the library will include the model's accuracy scores on the benchmark tasks.
+This output is written to standard output and should be saved in a .txt file (e.g., slurm-8368.out), which then needs to be placed in the
+evalita_llm_models_output directory for further processing.
+
+2) Extracting Model Metadata
+To display model details on the leaderboard (e.g., organization/group, model name, and parameter count), metadata must be retrieved from Hugging Face.
+
+This can be done by running:
+
+python get_model_info.py
+
+This script processes the evaluation files from Step 1 and saves each model's metadata in a JSON file within the evalita_llm_requests directory.
+
+3) Generating the Leaderboard Submission File
+The leaderboard requires a structured file containing each model's metadata along with its benchmark accuracy scores.
+
+To generate this file, run:
+
+python preprocess_models_output.py
+
+This script combines the accuracy results from Step 1 with the metadata from Step 2 and outputs a JSON file in the evalita_llm_results directory.
+
+4) Updating the Hugging Face Repository
+The evalita_llm_results repository on Hugging Face must be updated with the newly generated files from Step 3.
+
+5) Running the Leaderboard Application
+Finally, execute the leaderboard application by running:
+
+python app.py
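To make the hand-off between Step 1 and Step 3 concrete, here is a minimal sketch of the parsing contract that extract_data_from_file in preprocess_models_output.py relies on; the .out fragment and its values are invented for illustration:

```python
# A hypothetical .out fragment: a "hf (pretrained=..." header plus result rows
# that split on '|' into 11 fields (cols[1]=task, cols[5]=metric, cols[7]=value, cols[9]=stderr).
sample = """hf (pretrained=google/gemma-3-12b-it), ... (num_fewshot: 5, batch_size: 1)
| Tasks |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
| - sentiment_analysis|1|none|5|acc|   |0.7100|   |0.0123|"""

for line in sample.splitlines():
    cols = line.split('|')
    if len(cols) == 11 and cols[5].strip() == "acc":
        print(cols[1].strip(), float(cols[7]))  # "- sentiment_analysis 0.71"
```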
src/about.py CHANGED
@@ -8,7 +8,6 @@ class Task:
     metric_type: str
     col_name: str
 
-
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
@@ -122,9 +121,10 @@ The following Evalita-LLM tasks can also be evaluated in isolation:
 
 ```bash
 
-lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks evalita-mp --device cuda:0 --batch_size auto
+lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks evalita-mp --device cuda:0 --batch_size 1
 ```
 
+<!--
 ### Checklist
 
 * [x] Is the task an existing benchmark in the literature?
@@ -136,6 +136,8 @@ If other tasks on this dataset are already supported:
 * [x] Is the "Main" variant of this task clearly denoted?
 * [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
 * [x] Have you noted which, if any, published evaluation setups are matched by this variant?
+-->
+
 
 """
src/display/utils.py CHANGED
@@ -30,9 +30,6 @@ auto_eval_column_dict.append(["fewshot_type", ColumnContent, ColumnContent("FS",
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #auto_eval_column_dict.append(["fewshot", ColumnContent, ColumnContent("Few-Shot", "str", True)])
 
-
-
-
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg. Combined Performance ⬆️", "number", True)])
 for task in Tasks:
src/envs.py CHANGED
@@ -15,7 +15,7 @@ OWNER = "evalitahf"
 #RESULTS_REPO = f"{OWNER}/evalita-results"
 
 REPO_ID = f"{OWNER}/evalita_llm_leaderboard"
-QUEUE_REPO = f"{OWNER}/evalita_llm_requests"
+#QUEUE_REPO = f"{OWNER}/evalita_llm_requests"
 RESULTS_REPO = f"{OWNER}/evalita_llm_results"
 
 # If you setup a cache later, just change HF_HOME
src/tasks.py CHANGED
@@ -56,7 +56,7 @@ SA_DESCRIPTION = """### Sentiment Analysis (SA)
 """
 
 HS_DESCRIPTION = """### Hate Speech (HS)
-The input is a tweet. The model has to determine whether the text contains hateful content directed at specific target groups: immigrants, Muslims, or Roma. The output is a binary classification: hateful or not hateful.
+The input is a tweet. The model has to determine whether the text contains hateful content directed towards marginalized or minority groups. The output is a binary classification: hateful or not hateful.
 
 | # | Prompt | Answer Choices |
 |-----|--------------------------------------------------------------------------------|-------------------------------------------------|
@@ -104,7 +104,7 @@ WIC_DESCRIPTION = """### Word in Context (WIC)
 """
 
 FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ)
-The input is a user query made by customers to the Acquedotto Pugliese service. The model must determine which of the 4 possible answers is the correct response to the question.
+The input is a user query regarding the water supply service. The model must identify the correct answer from the 4 available options.
 
 | # | Prompt | Answer Choices |
 |-----|--------------------------------------------------------------------------------|-----------------------------|