"""

MODEL METADATA EXTRACTOR



This script processes model evaluation output files (input_folder) from the lm-eval-harness library,

extracts model identifiers, retrieves detailed metadata from HuggingFace

and saves the information as structured JSON files (output_folder).



Input: Directory containing .out files from lm-eval-harness

Output: Directory with JSON files containing model metadata

"""

# Example input file format (lm-eval-harness output):
'''
hf (pretrained=swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1

|         Tasks          |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
|------------------------|------:|------|-----:|--------|---|-----:|---|------|
|evalita-mp              |      1|none  |      |acc     |↑  |0.5605|±  |0.0052|
...

Job completed
'''

# Example output JSON format:
'''
{
    "model": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA",
    "base_model": "LlamaForCausalLM",
    "revision": "2b6e46e4c9d341dc8bf8350a167492c880116b66",
    "submitted_time": "2024-04-29 09:34:12+00:00",
    "num_params_billion": 8.030261248,
    "language": "en_it"
}
'''

import os
import re
import json
from huggingface_hub import HfApi

# Configures the Hugging Face token (if needed)
# TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
api = HfApi()
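# A token can also be passed explicitly if gated or private models must be read,
# e.g. api = HfApi(token=TOKEN)  (sketch; the TOKEN constant above is commented out)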

# Directory paths
# input_folder: Directory containing the output files of the lm-eval-harness library, including model accuracy metrics.
input_folder = "../evalita_llm_models_output/"
# output_folder: Directory where JSON files with model characteristics will be saved.
output_folder = "../evalita_llm_requests/"

# Creates the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Regular expression to find the model name
model_pattern = re.compile(r"pretrained=([\w\-./]+)")
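# e.g. for the sample .out line in the docstring above, group(1) yields
# "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA" (the match stops at the comma,
# which is not in the character class)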

# Scans files in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith('.out'):
        file_path = os.path.join(input_folder, filename)

        # Reads the file content
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Extracts the model name
        match = model_pattern.search(content)
        if match:
            model_name = match.group(1)
            print(f"Processing model: {model_name}")

            try:
                # Retrieves model information from HuggingFace
                model_info = api.model_info(model_name)

                # Calculates the number of parameters in billions, if available
                num_params = None
                if model_info.safetensors and "BF16" in model_info.safetensors.parameters:
                    num_params = model_info.safetensors.parameters["BF16"] / 1e9  # Convert to billions
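                # NOTE: safetensors.parameters maps dtype names (e.g. "BF16", "F16", "F32") to
                # parameter counts; models stored in other precisions keep num_params_billion = None here.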

                # Extracts and concatenates languages (the card "language" field may be a string or a list)
                languages = model_info.card_data.get("language", []) if model_info.card_data else []
                language = "_".join([languages] if isinstance(languages, str) else (languages or []))

                #print(model_info)

                # Builds the dictionary with required metadata
                model_data = {
                    "model": model_name,
                    "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
                    "revision": model_info.sha,
                    # "precision": "bfloat16",  # If available, replace with real value
                    # "weight_type": "Original",
                    # "status": "FINISHED",
                    "submitted_time": str(model_info.created_at),
                    # "model_type": "pretrained",
                    # "likes": model_info.likes,
                    # "params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
                    # "license": model_info.license,
                    # "private": model_info.private,
                    "num_params_billion": num_params,  # Number of parameters in billions
                    "language": language,  # Extracted language
                }

                # Separates the model_name into two parts: directory name and file name
                if "/" in model_name:
                    dir_name, file_name = model_name.split("/", 1)
                else:
                    dir_name, file_name = model_name, model_name  # If no "/", use the same name
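                # e.g. "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA" -> dir_name = "swap-uniba",
                # file_name = "LLaMAntino-3-ANITA-8B-Inst-DPO-ITA"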

                # Creates the folder for saving the produced json files
                model_output_folder = os.path.join(output_folder, dir_name)
                os.makedirs(model_output_folder, exist_ok=True)

                # Saves the JSON file in the appropriate folder
                output_file = os.path.join(model_output_folder, f"{file_name}.json")

                # Check if the file already exists
                if os.path.exists(output_file):
                    print(f"File {output_file} already exists. Skipping...")
                    continue

                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(model_data, f, indent=4)

                print(f"Saved metadata for {model_name} in {output_file}")

            except Exception as e:
                print(f"Error retrieving info for {model_name}: {e}")

            print("Process finished!")