"""
MODEL METADATA EXTRACTOR

This script processes model evaluation output files (input_folder) produced by
the lm-eval-harness library, extracts the model identifiers, retrieves detailed
metadata from HuggingFace, and saves the information as structured JSON files
(output_folder).

Input:  Directory containing .out files from lm-eval-harness
Output: Directory with JSON files containing model metadata
"""
|
|
|
|
|
|
'''
Example input (.out file produced by lm-eval-harness):

hf (pretrained=swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1
|         Tasks          |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
|------------------------|------:|------|-----:|--------|---|-----:|---|------|
|evalita-mp              |      1|none  |      |acc     |↑  |0.5605|±  |0.0052|
...
Job completed
'''
|
|
|
|
|
|
'''
Example output (JSON metadata file written by this script):

{
    "model": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA",
    "base_model": "LlamaForCausalLM",
    "revision": "2b6e46e4c9d341dc8bf8350a167492c880116b66",
    "submitted_time": "2024-04-29 09:34:12+00:00",
    "num_params_billion": 8.030261248,
    "language": "en_it"
}
'''
|
|
|
|
import os
|
|
import re
|
|
import json
|
|
from huggingface_hub import HfApi
|
|
|
|
|
|
|
|
api = HfApi()
|
|
|
|
|
|
|
|
input_folder = "../evalita_llm_models_output/"
|
|
|
|
output_folder = "../evalita_llm_requests/"
|
|
|
|
|
|
os.makedirs(output_folder, exist_ok=True)
|
|
|
|
|
|
model_pattern = re.compile(r"pretrained=([\w\-./]+)")
|
|
|
|
|
|
for filename in os.listdir(input_folder):
|
|
if filename.endswith('.out'):
|
|
file_path = os.path.join(input_folder, filename)
|
|
|
|
|
|
with open(file_path, "r", encoding="utf-8") as f:
|
|
content = f.read()
|
|
|
|
|
|
match = model_pattern.search(content)
|
|
if match:
|
|
model_name = match.group(1)
|
|
print(f"Processing model: {model_name}")
|
|
|
|
try:
|
|
|
|
model_info = api.model_info(model_name)
|
|
|
|
|
|
num_params = None
|
|
if model_info.safetensors and "BF16" in model_info.safetensors.parameters:
|
|
num_params = model_info.safetensors.parameters["BF16"] / 1e9
|
|
|
|
|
|
language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else ""
|
|
|
|
|
|
|
|
|
|
model_data = {
|
|
"model": model_name,
|
|
"base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
|
|
"revision": model_info.sha,
|
|
|
|
|
|
|
|
"submitted_time": str(model_info.created_at),
|
|
|
|
|
|
|
|
|
|
|
|
"num_params_billion": num_params,
|
|
"language": language,
|
|
}
|
|
|
|
|
|
if "/" in model_name:
|
|
dir_name, file_name = model_name.split("/", 1)
|
|
else:
|
|
dir_name, file_name = model_name, model_name
|
|
|
|
|
|
model_output_folder = os.path.join(output_folder, dir_name)
|
|
os.makedirs(model_output_folder, exist_ok=True)
|
|
|
|
|
|
output_file = os.path.join(model_output_folder, f"{file_name}.json")
|
|
|
|
|
|
if os.path.exists(output_file):
|
|
print(f"File {output_file} already exists. Skipping...")
|
|
continue
|
|
|
|
with open(output_file, "w", encoding="utf-8") as f:
|
|
json.dump(model_data, f, indent=4)
|
|
|
|
print(f"Saved metadata for {model_name} in {output_file}")
|
|
|
|
except Exception as e:
|
|
print(f"Error retrieving info for {model_name}: {e}")
|
|
|
|
print("Process finished!") |