Commit 9674655
Parent(s): 66ac6d1

switch to new results file format, code formatting, efficiency optimizations

Files changed:
- backend/app/about.py +6 -6
- backend/app/config/hf_config.py +1 -1
- backend/app/leaderboard/read_evals.py +77 -66
backend/app/about.py CHANGED

@@ -10,9 +10,9 @@ class Task:
 
 
 class Tasks(Enum):
-    task0 = Task(benchmark="icelandic_winogrande_stringmatch", metric="exact_match…
-    task1 = Task(benchmark="icelandic_sentences_ged_stringmatch", metric="exact_match…
-    task2 = Task(benchmark="icelandic_inflection_all", metric="exact_match…
-    task5 = Task(benchmark="icelandic_belebele", metric="exact_match…
-    task6 = Task(benchmark="icelandic_arc_challenge", metric="exact_match…
-    task7 = Task(benchmark="icelandic_wiki_qa", metric="…
+    task0 = Task(benchmark="icelandic_winogrande_stringmatch", metric="exact_match", col_name="WinoGrande-IS (3-shot)")
+    task1 = Task(benchmark="icelandic_sentences_ged_stringmatch", metric="exact_match", col_name="GED")
+    task2 = Task(benchmark="icelandic_inflection_all", metric="exact_match", col_name="Inflection (1-shot)")
+    task5 = Task(benchmark="icelandic_belebele", metric="exact_match", col_name="Belebele (IS)")
+    task6 = Task(benchmark="icelandic_arc_challenge", metric="exact_match", col_name="ARC-Challenge-IS")
+    task7 = Task(benchmark="icelandic_wiki_qa", metric="llm_judge_score", col_name="WikiQA-IS")
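For context, the Task container these enum members instantiate pairs a results-file benchmark id with the metric to read and the leaderboard column header. A minimal sketch of the assumed shape (the field names come from the diff; the dataclass body itself is reconstructed from usage, not copied from the repo):

    from dataclasses import dataclass
    from enum import Enum


    @dataclass
    class Task:
        benchmark: str  # benchmark id as it appears in result files
        metric: str     # metric key to read for this benchmark
        col_name: str   # column header shown on the leaderboard


    class Tasks(Enum):
        # First entry from the diff above, for illustration
        task0 = Task(
            benchmark="icelandic_winogrande_stringmatch",
            metric="exact_match",
            col_name="WinoGrande-IS (3-shot)",
        )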
backend/app/config/hf_config.py CHANGED

@@ -30,7 +30,7 @@ else:
 # Repository configuration
 REPO_ID = f"{HF_ORGANIZATION}/icelandic-llm-leaderboard"
 QUEUE_REPO = f"{HF_ORGANIZATION}/icelandic-llm-leaderboard-requests"
-RESULTS_REPO = f"{HF_ORGANIZATION}/…
+RESULTS_REPO = f"{HF_ORGANIZATION}/llm-leaderboard-results"
 
 # Local cache paths
 HF_HOME = os.getenv("HF_HOME", ".")
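RESULTS_REPO now points at the renamed results dataset, which get_raw_eval_results later walks on disk. A hedged sketch of how such a repo is commonly mirrored locally with huggingface_hub (whether this backend syncs via snapshot_download, and the local_dir used, are assumptions):

    from huggingface_hub import snapshot_download

    RESULTS_REPO = "my-org/llm-leaderboard-results"  # stands in for the f-string above

    # Hypothetical sync step: mirror the results dataset locally before
    # the leaderboard code walks it with os.walk(results_path).
    results_path = snapshot_download(
        repo_id=RESULTS_REPO,
        repo_type="dataset",      # assumption: results live in a dataset repo
        local_dir="./data/results",
    )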
backend/app/leaderboard/read_evals.py CHANGED

@@ -1,95 +1,92 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
+from functools import lru_cache
 
-import dateutil
 import numpy as np
 
 from app.display.formatting import make_clickable_model
-from app.display.utils import AutoEvalColumn, ModelType, …
+from app.display.utils import AutoEvalColumn, ModelType, Precision, Tasks
 from app.submission.check_validity import is_model_on_hub
 
 
+# Add caching for hub checks to avoid repeated network calls
+@lru_cache(maxsize=256)
+def cached_is_model_on_hub(full_model, revision):
+    """Cached version of is_model_on_hub to avoid repeated network calls"""
+    return is_model_on_hub(full_model, revision, trust_remote_code=True, test_tokenizer=False)
+
+
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.…
-
-    eval_name: str
-    full_model: str
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
-    reasoning: bool = False
-    note: str = ""
+    reasoning: bool = False  # Whether reasoning is enabled for this model
+    note: str = ""  # Extra information about the model (e.g., thinking budget, warnings)
 
     @classmethod
-    def …
-        """Inits the result from the …
+    def init_from_new_format_json_file(self, json_filepath):
+        """Inits the result from the new format model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
 
-        …
+        results = data.get("results")
 
-        …
-        …
+        full_model = data.get("config_general", {}).get("model_name", "").strip()
+        result_key = full_model.replace("/", "_")
 
-        …
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        …
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
+        org, model = full_model.split("/", 1) if "/" in full_model else ("", full_model)
 
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
+        still_on_hub, _, model_config = cached_is_model_on_hub(full_model, "main")
+
         architecture = "?"
         if model_config is not None:
             architectures = getattr(model_config, "architectures", None)
             if architectures:
                 architecture = ";".join(architectures)
 
-        # Extract results available in this file
-        …
+        # Extract results available in this file
+        score_results = {}
         for task in Tasks:
             task = task.value
-            …
+            benchmark_id = task.benchmark
+            metric = task.metric
+
+            scores = [
+                results[key][metric]
+                for key in results
+                if "|" in key and benchmark_id.startswith(key.split("|")[1].removeprefix("icelandic_evals:"))
+            ]
+            if len(scores) == 0:
                 continue
 
-        mean_acc = np.mean(…
-        …
+            mean_acc = np.mean(scores) * 100.0
+            score_results[benchmark_id] = mean_acc
 
         return self(
             eval_name=result_key,
             full_model=full_model,
             org=org,
             model=model,
-            results=…
-            …
-            revision= config.get("model_sha", ""),
+            results=score_results,
+            revision="",
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
         )
 
     def update_with_request_file(self, requests_path):
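The new cached_is_model_on_hub wrapper memoizes on its (full_model, revision) argument pair, so many result files for the same model cost one network round-trip instead of one per file. A self-contained sketch of the same pattern, with a hypothetical slow_lookup standing in for is_model_on_hub:

    from functools import lru_cache


    @lru_cache(maxsize=256)
    def slow_lookup(full_model: str, revision: str) -> bool:
        # Stand-in for a network call like is_model_on_hub
        print(f"fetching {full_model}@{revision}")
        return True


    slow_lookup("org/model", "main")   # miss: prints and performs the "network call"
    slow_lookup("org/model", "main")   # hit: returns the cached value silently
    print(slow_lookup.cache_info())    # CacheInfo(hits=1, misses=1, maxsize=256, currsize=1)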
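The score extraction in init_from_new_format_json_file aggregates every results key whose task name (the middle segment of the "|"-delimited key, minus the "icelandic_evals:" prefix) is a prefix of the benchmark id. A worked illustration with made-up keys and scores (the key shape is inferred from the parsing code, not from a real results file):

    import numpy as np

    # Hypothetical new-format results dict
    results = {
        "lighteval|icelandic_evals:icelandic_inflection|1": {"exact_match": 0.40},
        "lighteval|icelandic_evals:icelandic_inflection_all|1": {"exact_match": 0.50},
        "lighteval|icelandic_evals:icelandic_belebele|0": {"exact_match": 0.81},
    }

    benchmark_id, metric = "icelandic_inflection_all", "exact_match"
    scores = [
        results[key][metric]
        for key in results
        if "|" in key and benchmark_id.startswith(key.split("|")[1].removeprefix("icelandic_evals:"))
    ]
    print(np.mean(scores) * 100.0)  # ~45.0: both inflection keys match, belebele does not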
@@ -104,10 +101,14 @@ class EvalResult:
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
-            self.reasoning = request.get("reasoning", False)
+            self.reasoning = request.get("reasoning", False) or request.get("gen_kwargs", {}).get(
+                "reasoning_effort", None
+            )
             self.note = request.get("note", "")  # Default to empty string if missing
         except FileNotFoundError:
-            print(…
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
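One subtlety in the reasoning fallback above: `x or y` returns the second operand when the first is falsy, so self.reasoning can end up holding the raw reasoning_effort value (for example a string) rather than a bool. A quick illustration:

    request = {"gen_kwargs": {"reasoning_effort": "high"}}  # hypothetical request file contents

    reasoning = request.get("reasoning", False) or request.get("gen_kwargs", {}).get(
        "reasoning_effort", None
    )
    print(reasoning)        # "high" -- the effort string, not True
    print(bool(reasoning))  # True, so truthiness checks still behave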
@@ -145,6 +146,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
         f"{model_name}_eval_request_*.json",
     )
     request_files = glob.glob(request_files)
+    if len(request_files) == 1:
+        return request_files[0]
 
     # Select correct request file (precision)
     request_file = ""
@@ -161,38 +164,46 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
+    # Collect all JSON files first
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
-        …
+        json_files = [f for f in files if f.endswith(".json")]
+        if len(json_files) == 0:
             continue
 
-        # Sort …
+        # Sort JSON files by date (newer later)
         try:
-            …
-        except …
-            …
+            json_files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except (ValueError, IndexError):
+            # If sorting fails, just use the files as-is or take the last one
+            json_files = [json_files[-1]] if json_files else []
 
-        for file in …
+        for file in json_files:
             model_result_filepaths.append(os.path.join(root, file))
 
-    …
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
-        …
+        try:
+            # Creation of result
+            eval_result = EvalResult.init_from_new_format_json_file(model_result_filepath)
+            eval_result.update_with_request_file(requests_path)
+
+            # Store results of same eval together
+            eval_name = eval_result.eval_name
+            if eval_name in eval_results:
+                # Update with newer scores
+                eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            else:
+                eval_results[eval_name] = eval_result
+        except Exception as e:
+            # Log error but continue processing other files
+            print(f"Error processing {model_result_filepath}: {e}")
+            continue
 
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict()
+            v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
             continue
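The sort key above strips "results_" and ".json" and drops the last seven characters before comparing, which lines up with timestamped names like results_<ISO timestamp>.<microseconds>.json where the final seven characters are the ".<microseconds>" part (the exact filename scheme is an assumption). A quick check of the key on such names:

    filenames = [
        "results_2024-05-02T10-15-30.123456.json",  # hypothetical timestamped names
        "results_2023-11-20T08-00-00.654321.json",
    ]

    key = lambda x: x.removesuffix(".json").removeprefix("results_")[:-7]
    print(key(filenames[0]))   # "2024-05-02T10-15-30"
    filenames.sort(key=key)    # lexicographic on the timestamp => chronological
    print(filenames[-1])       # newest file last, matching "newer later"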