gardarjuto committed
Commit 9674655 · Parent: 66ac6d1

switch to new results file format, code formatting, efficiency optimizations

backend/app/about.py CHANGED
@@ -10,9 +10,9 @@ class Task:
 
 
 class Tasks(Enum):
-    task0 = Task(benchmark="icelandic_winogrande_stringmatch", metric="exact_match,get-answer", col_name="WinoGrande-IS (3-shot)")
-    task1 = Task(benchmark="icelandic_sentences_ged_stringmatch", metric="exact_match,get-answer", col_name="GED")
-    task2 = Task(benchmark="icelandic_inflection_all", metric="exact_match,get-answer", col_name="Inflection (1-shot)")
-    task5 = Task(benchmark="icelandic_belebele", metric="exact_match,get-answer", col_name="Belebele (IS)")
-    task6 = Task(benchmark="icelandic_arc_challenge", metric="exact_match,get-answer", col_name="ARC-Challenge-IS")
-    task7 = Task(benchmark="icelandic_wiki_qa", metric="lm_judge_score,get-answer", col_name="WikiQA-IS")
+    task0 = Task(benchmark="icelandic_winogrande_stringmatch", metric="exact_match", col_name="WinoGrande-IS (3-shot)")
+    task1 = Task(benchmark="icelandic_sentences_ged_stringmatch", metric="exact_match", col_name="GED")
+    task2 = Task(benchmark="icelandic_inflection_all", metric="exact_match", col_name="Inflection (1-shot)")
+    task5 = Task(benchmark="icelandic_belebele", metric="exact_match", col_name="Belebele (IS)")
+    task6 = Task(benchmark="icelandic_arc_challenge", metric="exact_match", col_name="ARC-Challenge-IS")
+    task7 = Task(benchmark="icelandic_wiki_qa", metric="llm_judge_score", col_name="WikiQA-IS")
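For orientation, a minimal sketch of how these Tasks entries are typically consumed. The Task dataclass itself is not shown in this hunk, so its three fields below are inferred from the keyword arguments used above, and the column-listing usage at the end is a hypothetical illustration rather than code from this repo.

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    # Assumed field layout, inferred from the keyword arguments above
    benchmark: str  # key identifying the benchmark in the results files, e.g. "icelandic_belebele"
    metric: str     # metric name read from the results, e.g. "exact_match"
    col_name: str   # column header shown on the leaderboard


class Tasks(Enum):
    # One representative entry; the real enum lists all the benchmarks above
    task0 = Task(benchmark="icelandic_winogrande_stringmatch", metric="exact_match", col_name="WinoGrande-IS (3-shot)")


# Display columns are derived from col_name (hypothetical usage)
print([t.value.col_name for t in Tasks])  # ['WinoGrande-IS (3-shot)']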
backend/app/config/hf_config.py CHANGED
@@ -30,7 +30,7 @@ else:
 # Repository configuration
 REPO_ID = f"{HF_ORGANIZATION}/icelandic-llm-leaderboard"
 QUEUE_REPO = f"{HF_ORGANIZATION}/icelandic-llm-leaderboard-requests"
-RESULTS_REPO = f"{HF_ORGANIZATION}/icelandic-llm-leaderboard-results"
+RESULTS_REPO = f"{HF_ORGANIZATION}/llm-leaderboard-results"
 
 # Local cache paths
 HF_HOME = os.getenv("HF_HOME", ".")
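This diff does not show how RESULTS_REPO is consumed. As a rough sketch under that caveat, leaderboard backends usually sync the results repository into the local cache with huggingface_hub before parsing it; the organization name, repo type, and target directory below are placeholders, not values confirmed by this commit.

import os

from huggingface_hub import snapshot_download

HF_ORGANIZATION = "example-org"  # placeholder for the real organization
RESULTS_REPO = f"{HF_ORGANIZATION}/llm-leaderboard-results"
HF_HOME = os.getenv("HF_HOME", ".")

# Download (or refresh) a local snapshot of the results repo before reading eval files
local_results_path = snapshot_download(
    repo_id=RESULTS_REPO,
    repo_type="dataset",  # assumed; leaderboard results repos are commonly datasets
    local_dir=os.path.join(HF_HOME, "results"),
)
print(local_results_path)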
backend/app/leaderboard/read_evals.py CHANGED
@@ -1,95 +1,92 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
+from functools import lru_cache
 
-import dateutil
 import numpy as np
 
 from app.display.formatting import make_clickable_model
-from app.display.utils import AutoEvalColumn, ModelType, Tasks, Precision
+from app.display.utils import AutoEvalColumn, ModelType, Precision, Tasks
 from app.submission.check_validity import is_model_on_hub
 
 
+# Add caching for hub checks to avoid repeated network calls
+@lru_cache(maxsize=256)
+def cached_is_model_on_hub(full_model, revision):
+    """Cached version of is_model_on_hub to avoid repeated network calls"""
+    return is_model_on_hub(full_model, revision, trust_remote_code=True, test_tokenizer=False)
+
+
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str  # commit hash, "" if main
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""  # submission date of request file
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
-    reasoning: bool = False  # Whether reasoning is enabled for this model
-    note: str = ""  # Extra information about the model (e.g., thinking budget, warnings)
+    reasoning: bool = False  # Whether reasoning is enabled for this model
+    note: str = ""  # Extra information about the model (e.g., thinking budget, warnings)
 
     @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
+    def init_from_new_format_json_file(self, json_filepath):
+        """Inits the result from the new format model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
 
-        config = data.get("config")
+        results = data.get("results")
 
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
+        full_model = data.get("config_general", {}).get("model_name", "").strip()
+        result_key = full_model.replace("/", "_")
 
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
+        org, model = full_model.split("/", 1) if "/" in full_model else ("", full_model)
 
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
+        still_on_hub, _, model_config = cached_is_model_on_hub(full_model, "main")
 
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
         architecture = "?"
         if model_config is not None:
             architectures = getattr(model_config, "architectures", None)
             if architectures:
                 architecture = ";".join(architectures)
 
-        # Extract results available in this file (some results are split in several files)
-        results = {}
+        # Extract results available in this file
+        score_results = {}
         for task in Tasks:
             task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
+            benchmark_id = task.benchmark
+            metric = task.metric
+
+            scores = [
+                results[key][metric]
+                for key in results
+                if "|" in key and benchmark_id.startswith(key.split("|")[1].removeprefix("icelandic_evals:"))
+            ]
+            if len(scores) == 0:
                 continue
 
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
+            mean_acc = np.mean(scores) * 100.0
+            score_results[benchmark_id] = mean_acc
 
         return self(
             eval_name=result_key,
             full_model=full_model,
             org=org,
             model=model,
-            results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
+            results=score_results,
+            revision="",
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
         )
 
     def update_with_request_file(self, requests_path):
@@ -104,10 +101,14 @@ class EvalResult:
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
-            self.reasoning = request.get("reasoning", False)  # Default to False if missing
+            self.reasoning = request.get("reasoning", False) or request.get("gen_kwargs", {}).get(
+                "reasoning_effort", None
+            )
             self.note = request.get("note", "")  # Default to empty string if missing
         except FileNotFoundError:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -145,6 +146,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
         f"{model_name}_eval_request_*.json",
     )
     request_files = glob.glob(request_files)
+    if len(request_files) == 1:
+        return request_files[0]
 
     # Select correct request file (precision)
     request_file = ""
@@ -161,38 +164,46 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
+    # Collect all JSON files first
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+        json_files = [f for f in files if f.endswith(".json")]
+        if len(json_files) == 0:
             continue
 
-        # Sort the files by date
+        # Sort JSON files by date (newer later)
         try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
+            json_files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except (ValueError, IndexError):
+            # If sorting fails, just use the files as-is or take the last one
+            json_files = [json_files[-1]] if json_files else []
 
-        for file in files:
+        for file in json_files:
             model_result_filepaths.append(os.path.join(root, file))
 
-
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
+        try:
+            # Creation of result
+            eval_result = EvalResult.init_from_new_format_json_file(model_result_filepath)
+            eval_result.update_with_request_file(requests_path)
+
+            # Store results of same eval together
+            eval_name = eval_result.eval_name
+            if eval_name in eval_results:
+                # Update with newer scores
+                eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            else:
+                eval_results[eval_name] = eval_result
+        except Exception as e:
+            # Log error but continue processing other files
+            print(f"Error processing {model_result_filepath}: {e}")
+            continue
 
     results = []
     for v in eval_results.values():
        try:
-            v.to_dict()  # we test if the dict version is complete
+            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError:  # not all eval values present
            continue
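The shape of the new results files is only visible here indirectly, through the parsing code above. As an illustration of the matching rule in init_from_new_format_json_file (the key layout, prefix, and numbers below are made up for the example, not the documented schema), scores are collected from every results key whose middle "|" segment, after stripping "icelandic_evals:", is a prefix of the task's benchmark id, and the mean is scaled to a percentage:

import numpy as np

# Hypothetical new-format results payload (illustrative only)
data = {
    "config_general": {"model_name": "example-org/example-model "},
    "results": {
        "leaderboard|icelandic_evals:icelandic_belebele|0": {"exact_match": 0.812},
        "leaderboard|icelandic_evals:icelandic_wiki_qa|0": {"llm_judge_score": 0.57},
    },
}

results = data["results"]
benchmark_id, metric = "icelandic_belebele", "exact_match"

# Same filter as init_from_new_format_json_file: keep keys whose middle "|" segment,
# minus the "icelandic_evals:" prefix, is a prefix of the benchmark id
scores = [
    results[key][metric]
    for key in results
    if "|" in key and benchmark_id.startswith(key.split("|")[1].removeprefix("icelandic_evals:"))
]
print(np.mean(scores) * 100.0)  # ~81.2, the value stored under the Belebele column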