rzanoli committed on
Commit
5a8f6c4
·
1 Parent(s): 5b04d4e

Small changes

Browse files
Files changed (3) hide show
  1. get_model_info.py +41 -5
  2. preprocess_models_output.py +92 -3
  3. src/tasks.py +1 -1
get_model_info.py CHANGED
@@ -1,6 +1,36 @@
1
- # Reads model output files (including accuracy values) produced by lm-eval-harness,
2
- # extracts model names, downloads their characteristics from HuggingFace, and saves metadata
3
- # (such as parameter count and pre-training status) to model-specific JSON files.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import os
5
  import re
6
  import json
@@ -38,7 +68,7 @@ for filename in os.listdir(input_folder):
38
  print(f"Processing model: {model_name}")
39
 
40
  try:
41
- # Retrieves model information from Hugging Face
42
  model_info = api.model_info(model_name)
43
 
44
  # Calculates the number of parameters in billions, if available
@@ -49,7 +79,7 @@ for filename in os.listdir(input_folder):
49
  # Extracts and concatenates languages
50
  language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else ""
51
 
52
- print(model_info)
53
 
54
  # Builds the dictionary with required metadata
55
  model_data = {
@@ -81,6 +111,12 @@ for filename in os.listdir(input_folder):
81
 
82
  # Saves the JSON file in the appropriate folder
83
  output_file = os.path.join(model_output_folder, f"{file_name}.json")
 
 
 
 
 
 
84
  with open(output_file, "w", encoding="utf-8") as f:
85
  json.dump(model_data, f, indent=4)
86
 
 
1
+ """
2
+ MODEL METADATA EXTRACTOR
3
+
4
+ This script processes model evaluation output files (input_folder) from the lm-eval-harness library,
5
+ extracts model identifiers, retrieves detailed metadata from HuggingFace
6
+ and saves the information as structured JSON files (output_folder).
7
+
8
+ Input: Directory containing .out files from lm-eval-harness
9
+ Output: Directory with JSON files containing model metadata
10
+ """
11
+
12
+ # Example input file format (lm-eval-harness output):
13
+ '''
14
+ hf (pretrained=swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1
15
+ | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
16
+ |------------------------|------:|------|-----:|--------|---|-----:|---|------|
17
+ |evalita-mp | 1|none | |acc |↑ |0.5605|± |0.0052|
18
+ ...
19
+ Job completed
20
+ '''
21
+
22
+ # Example output JSON format:
23
+ '''
24
+ {
25
+ "model": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA",
26
+ "base_model": "LlamaForCausalLM",
27
+ "revision": "2b6e46e4c9d341dc8bf8350a167492c880116b66",
28
+ "submitted_time": "2024-04-29 09:34:12+00:00",
29
+ "num_params_billion": 8.030261248,
30
+ "language": "en_it"
31
+ }
32
+ '''
33
+
34
  import os
35
  import re
36
  import json
 
68
  print(f"Processing model: {model_name}")
69
 
70
  try:
71
+ # Retrieves model information from HuggingFace
72
  model_info = api.model_info(model_name)
73
 
74
  # Calculates the number of parameters in billions, if available
 
79
  # Extracts and concatenates languages
80
  language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else ""
81
 
82
+ #print(model_info)
83
 
84
  # Builds the dictionary with required metadata
85
  model_data = {
 
111
 
112
  # Saves the JSON file in the appropriate folder
113
  output_file = os.path.join(model_output_folder, f"{file_name}.json")
114
+
115
+ # Check if the file already exists
116
+ if os.path.exists(output_file):
117
+ print(f"File {output_file} already exists. Skipping...")
118
+ continue
119
+
120
  with open(output_file, "w", encoding="utf-8") as f:
121
  json.dump(model_data, f, indent=4)
122
 
preprocess_models_output.py CHANGED
@@ -1,3 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
  import os
3
  import re
@@ -126,9 +198,26 @@ def extract_data_from_file(file_path):
126
 
127
  return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
128
 
129
- # Main script: processes .out files, extracts data, and saves JSON results.
130
- # Reads .out files from directory_in_path, parses data including model config and task metrics,
131
- # and saves results as JSON files in directory_out_results_path, merging config from directory_out_requests_path if available.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  directory_in_path = '../evalita_llm_models_output/'
133
  directory_in_requests_path = '../evalita_llm_requests/'
134
  directory_out_results_path = '../evalita_llm_results/'
 
1
+ """
2
+ EVALITA LLM EVALUATION PROCESSOR
3
+
4
+ Transforms raw model evaluation outputs into structured performance reports for leaderboard integration.
5
+
6
+ DATA PIPELINE OVERVIEW:
7
+
8
+ 1. Inputs:
9
+ - Evaluation Results: Raw .out files from lm-eval-harness
10
+ - Model Metadata: Pre-collected .json files from HuggingFace
11
+
12
+ 2. Output:
13
+ - Comprehensive evaluation reports in JSON format
14
+ - Ready for ingestion into the evaluation leaderboard
15
+
16
+ --------------------------------------------------------------------
17
+ INPUT SPECIFICATION
18
+
19
+ Evaluation Results (.out format):
20
+ hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1
21
+ | Task | Metric | Value | Stderr |
22
+ |---------------|--------|--------|--------|
23
+ | main-task | acc | 0.5605 | 0.0052 |
24
+ | - sub-task | acc | 0.4640 | 0.0088 |
25
+ | - prompt-1 | acc | 0.3720 | 0.0216 |
26
+
27
+ Model Metadata (.json format):
28
+ {
29
+ "model": "model-org/model-name",
30
+ "base_model": "ModelArchitecture",
31
+ "revision": "git_commit_hash",
32
+ "parameters": 8.03,
33
+ "language": "en_it"
34
+ }
35
+
36
+ --------------------------------------------------------------------
37
+ OUTPUT SPECIFICATION
38
+
39
+ Evaluation Report (.json format):
40
+ {
41
+ "summary_metrics": {
42
+ "average_CPS": 41.74,
43
+ "num_tasks": 12
44
+ },
45
+ "model_config": {
46
+ "identifier": "model-org/model-name",
47
+ "architecture": "ModelArchitecture",
48
+ "parameters": 8.03,
49
+ "evaluation_settings": {
50
+ "fewshot": 5,
51
+ "batch_size": 1
52
+ }
53
+ },
54
+ "task_results": {
55
+ "task-name": {
56
+ "average_score": 52.60,
57
+ "best_prompt": {
58
+ "id": "prompt-6",
59
+ "score": 66.57
60
+ },
61
+ "prompt_analysis": [
62
+ {
63
+ "prompt_id": "prompt-1",
64
+ "score": 37.20,
65
+ "stderr": 0.0216
66
+ }
67
+ ]
68
+ }
69
+ }
70
+ }
71
+ """
72
+
73
  import json
74
  import os
75
  import re
 
198
 
199
  return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
200
 
201
+
202
+ """
203
+ MAIN PROCESSING PIPELINE
204
+
205
+ This script executes the complete evaluation data processing workflow:
206
+
207
+ 1. Input Sources:
208
+ - Raw evaluation results (.out files) from: ../evalita_llm_models_output/
209
+ - Model metadata JSON files from: ../evalita_llm_requests/
210
+
211
+ 2. Processing Steps:
212
+ - Parses evaluation metrics from .out files
213
+ - Combines with model metadata
214
+ - Calculates aggregated performance statistics
215
+
216
+ 3. Output:
217
+ - Structured JSON results saved to: ../evalita_llm_results/
218
+ - Organized by model organization/name
219
+ - Contains complete evaluation results with metadata
220
+ """
221
  directory_in_path = '../evalita_llm_models_output/'
222
  directory_in_requests_path = '../evalita_llm_requests/'
223
  directory_out_results_path = '../evalita_llm_results/'
src/tasks.py CHANGED
@@ -156,7 +156,7 @@ NER_DESCRIPTION = """### Named Entity Recognition (NER)
156
  """
157
 
158
  REL_DESCRIPTION = """### Relation Extraction (REL)
159
- The task involves analyzing clinical text to extract relationships between laboratory test results (e.g., blood pressure) and the tests or procedures that produced them (e.g., blood pressure test).
160
 
161
  | # | Prompt |
162
  |-----|--------------------------------------------------------------------------------|
 
156
  """
157
 
158
  REL_DESCRIPTION = """### Relation Extraction (REL)
159
+ The input is a sentence of a clinical text. The model must identify and extract relationships between laboratory test results (e.g., blood pressure) and the corresponding tests or procedures that generated them (e.g., blood pressure test).
160
 
161
  | # | Prompt |
162
  |-----|--------------------------------------------------------------------------------|