""" MODEL METADATA EXTRACTOR This script processes model evaluation output files (input_folder) from the lm-eval-harness library, extracts model identifiers, retrieves detailed metadata from HuggingFace and saves the information as structured JSON files (output_folder). Input: Directory containing .out files from lm-eval-harness Output: Directory with JSON files containing model metadata """ # Example input file format (lm-eval-harness output): ''' hf (pretrained=swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1 | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr| |------------------------|------:|------|-----:|--------|---|-----:|---|------| |evalita-mp | 1|none | |acc |↑ |0.5605|± |0.0052| ... Job completed ''' # Example output JSON format: ''' { "model": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA", "base_model": "LlamaForCausalLM", "revision": "2b6e46e4c9d341dc8bf8350a167492c880116b66", "submitted_time": "2024-04-29 09:34:12+00:00", "num_params_billion": 8.030261248, "language": "en_it" } ''' import os import re import json from huggingface_hub import HfApi # Configures the Hugging Face token (if needed) # TOKEN = "YOUR_HUGGINGFACE_API_TOKEN" api = HfApi() # Directory paths # input_folder: Directory containing the output files of the lm-eval-harness library, including model accuracy metrics. input_folder = "../evalita_llm_models_output/" # output_folder: Directory where JSON files with model characteristics will be saved. output_folder = "../evalita_llm_requests/" # Creates the output folder if it doesn't exist os.makedirs(output_folder, exist_ok=True) # Regular expression to find the model name model_pattern = re.compile(r"pretrained=([\w\-./]+)") # Scans files in the input folder for filename in os.listdir(input_folder): if filename.endswith('.out'): file_path = os.path.join(input_folder, filename) # Reads the file content with open(file_path, "r", encoding="utf-8") as f: content = f.read() # Extracts the model name match = model_pattern.search(content) if match: model_name = match.group(1) print(f"Processing model: {model_name}") try: # Retrieves model information from HuggingFace model_info = api.model_info(model_name) # Calculates the number of parameters in billions, if available num_params = None if model_info.safetensors and "BF16" in model_info.safetensors.parameters: num_params = model_info.safetensors.parameters["BF16"] / 1e9 # Convert to billions # Extracts and concatenates languages language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else "" #print(model_info) # Builds the dictionary with required metadata model_data = { "model": model_name, "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "", "revision": model_info.sha, # "precision": "bfloat16", # If available, replace with real value # "weight_type": "Original", # "status": "FINISHED", "submitted_time": str(model_info.created_at), # "model_type": "pretrained", # "likes": model_info.likes, # "params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None, # "license": model_info.license, # "private": model_info.private, "num_params_billion": num_params, # Number of parameters in billions "language": language, # Extracted language } # Separates the model_name into two parts: directory name and file name if "/" in model_name: dir_name, file_name = model_name.split("/", 1) else: dir_name, file_name = model_name, model_name # If no "/", use the same name # Creates the folder for saving the produced json files model_output_folder = os.path.join(output_folder, dir_name) os.makedirs(model_output_folder, exist_ok=True) # Saves the JSON file in the appropriate folder output_file = os.path.join(model_output_folder, f"{file_name}.json") # Check if the file already exists if os.path.exists(output_file): print(f"File {output_file} already exists. Skipping...") continue with open(output_file, "w", encoding="utf-8") as f: json.dump(model_data, f, indent=4) print(f"Saved metadata for {model_name} in {output_file}") except Exception as e: print(f"Error retrieving info for {model_name}: {e}") print("Process finished!")