# evalita_llm_leaderboard/get_model_info.py
# Reads model output files (including accuracy values) produced by lm-eval-harness,
# extracts model names, downloads their characteristics from Hugging Face, and saves
# metadata (such as parameter count and supported languages) to model-specific JSON files.
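#
# Illustrative layout (a sketch only; the actual .out file names depend on your lm-eval-harness runs):
#   ../evalita_llm_models_output/some-org__some-model-7b.out   <- input: one lm-eval-harness output file per model
#   ../evalita_llm_requests/some-org/some-model-7b.json        <- output: metadata file written by this script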
import os
import re
import json
from huggingface_hub import HfApi
# Configures the Hugging Face token (if needed)
# TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
api = HfApi()
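# If a token is needed (e.g., for gated or private models), it can be passed explicitly,
# for example: api = HfApi(token=TOKEN)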
# Directory paths
# input_folder: Directory containing the output files of the lm-eval-harness library, including model accuracy metrics.
input_folder = "../evalita_llm_models_output/"
# output_folder: Directory where JSON files with model characteristics will be saved.
output_folder = "../evalita_llm_requests/"
# Creates the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)
# Regular expression to find the model name
model_pattern = re.compile(r"pretrained=([\w\-./]+)")
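# Illustrative example (hypothetical .out content; the exact format depends on the lm-eval-harness run):
#   "hf (pretrained=some-org/some-model-7b,dtype=bfloat16), ..."
# On such a line, model_pattern.search(...) captures "some-org/some-model-7b" in group(1).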
# Scans files in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith('.out'):
        file_path = os.path.join(input_folder, filename)

        # Reads the file content
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Extracts the model name
        match = model_pattern.search(content)
        if match:
            model_name = match.group(1)
            print(f"Processing model: {model_name}")

            try:
                # Retrieves model information from Hugging Face
                model_info = api.model_info(model_name)

                # Calculates the number of parameters in billions, if available
                num_params = None
                if model_info.safetensors and "BF16" in model_info.safetensors.parameters:
                    num_params = model_info.safetensors.parameters["BF16"] / 1e9  # Convert to billions
                # Extracts and concatenates languages (the card "language" field may be a string or a list)
                card_language = model_info.card_data.get("language", []) if model_info.card_data else []
                if isinstance(card_language, str):
                    card_language = [card_language]
                language = "_".join(card_language or [])

                print(model_info)
                # Builds the dictionary with the required metadata
                model_data = {
                    "model": model_name,
                    "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
                    "revision": model_info.sha,
                    # "precision": "bfloat16",  # If available, replace with real value
                    # "weight_type": "Original",
                    # "status": "FINISHED",
                    "submitted_time": str(model_info.created_at),
                    # "model_type": "pretrained",
                    # "likes": model_info.likes,
                    # "params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
                    # "license": model_info.license,
                    # "private": model_info.private,
                    "num_params_billion": num_params,  # Number of parameters in billions
                    "language": language,  # Extracted language(s)
                }
                # Splits the model name into two parts: directory name and file name
                if "/" in model_name:
                    dir_name, file_name = model_name.split("/", 1)
                else:
                    dir_name, file_name = model_name, model_name  # If no "/", use the same name

                # Creates the folder for saving the produced JSON files
                model_output_folder = os.path.join(output_folder, dir_name)
                os.makedirs(model_output_folder, exist_ok=True)

                # Saves the JSON file in the appropriate folder
                output_file = os.path.join(model_output_folder, f"{file_name}.json")
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(model_data, f, indent=4)

                print(f"Saved metadata for {model_name} in {output_file}")

            except Exception as e:
                print(f"Error retrieving info for {model_name}: {e}")
print("Process finished!")