Commit 9674655
Parent(s): 66ac6d1

switch to new results file format, code formatting, efficiency optimizations

Files changed:
- backend/app/about.py +6 -6
- backend/app/config/hf_config.py +1 -1
- backend/app/leaderboard/read_evals.py +77 -66
backend/app/about.py CHANGED

@@ -10,9 +10,9 @@ class Task:
 
 
 class Tasks(Enum):
-    task0 = Task(benchmark="icelandic_winogrande_stringmatch", metric="exact_match…
-    task1 = Task(benchmark="icelandic_sentences_ged_stringmatch", metric="exact_match…
-    task2 = Task(benchmark="icelandic_inflection_all", metric="exact_match…
-    task5 = Task(benchmark="icelandic_belebele", metric="exact_match…
-    task6 = Task(benchmark="icelandic_arc_challenge", metric="exact_match…
-    task7 = Task(benchmark="icelandic_wiki_qa", metric="…
+    task0 = Task(benchmark="icelandic_winogrande_stringmatch", metric="exact_match", col_name="WinoGrande-IS (3-shot)")
+    task1 = Task(benchmark="icelandic_sentences_ged_stringmatch", metric="exact_match", col_name="GED")
+    task2 = Task(benchmark="icelandic_inflection_all", metric="exact_match", col_name="Inflection (1-shot)")
+    task5 = Task(benchmark="icelandic_belebele", metric="exact_match", col_name="Belebele (IS)")
+    task6 = Task(benchmark="icelandic_arc_challenge", metric="exact_match", col_name="ARC-Challenge-IS")
+    task7 = Task(benchmark="icelandic_wiki_qa", metric="llm_judge_score", col_name="WikiQA-IS")
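For context, the Task container these enum members instantiate pairs a results-file benchmark id with the metric to read and the leaderboard column header. A minimal sketch of the assumed shape (the field names come from the diff; the dataclass body itself is reconstructed from usage, not copied from the repo):

    from dataclasses import dataclass
    from enum import Enum


    @dataclass
    class Task:
        benchmark: str  # benchmark id as it appears in result files
        metric: str     # metric key to read for this benchmark
        col_name: str   # column header shown on the leaderboard


    class Tasks(Enum):
        # First entry from the diff above, for illustration
        task0 = Task(
            benchmark="icelandic_winogrande_stringmatch",
            metric="exact_match",
            col_name="WinoGrande-IS (3-shot)",
        )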
backend/app/config/hf_config.py CHANGED

@@ -30,7 +30,7 @@ else:
 # Repository configuration
 REPO_ID = f"{HF_ORGANIZATION}/icelandic-llm-leaderboard"
 QUEUE_REPO = f"{HF_ORGANIZATION}/icelandic-llm-leaderboard-requests"
-RESULTS_REPO = f"{HF_ORGANIZATION}/…
+RESULTS_REPO = f"{HF_ORGANIZATION}/llm-leaderboard-results"
 
 # Local cache paths
 HF_HOME = os.getenv("HF_HOME", ".")
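RESULTS_REPO now points at the renamed results dataset, which get_raw_eval_results later walks on disk. A hedged sketch of how such a repo is commonly mirrored locally with huggingface_hub (whether this backend syncs via snapshot_download, and the local_dir used, are assumptions):

    from huggingface_hub import snapshot_download

    RESULTS_REPO = "my-org/llm-leaderboard-results"  # stands in for the f-string above

    # Hypothetical sync step: mirror the results dataset locally before
    # the leaderboard code walks it with os.walk(results_path).
    results_path = snapshot_download(
        repo_id=RESULTS_REPO,
        repo_type="dataset",      # assumption: results live in a dataset repo
        local_dir="./data/results",
    )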
backend/app/leaderboard/read_evals.py CHANGED

@@ -1,95 +1,92 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
+from functools import lru_cache
 
-import dateutil
 import numpy as np
 
 from app.display.formatting import make_clickable_model
-from app.display.utils import AutoEvalColumn, ModelType, …
+from app.display.utils import AutoEvalColumn, ModelType, Precision, Tasks
 from app.submission.check_validity import is_model_on_hub
 
 
+# Add caching for hub checks to avoid repeated network calls
+@lru_cache(maxsize=256)
+def cached_is_model_on_hub(full_model, revision):
+    """Cached version of is_model_on_hub to avoid repeated network calls"""
+    return is_model_on_hub(full_model, revision, trust_remote_code=True, test_tokenizer=False)
+
+
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.…
-
-    eval_name: str
-    full_model: str
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
-    reasoning: bool = False
-    note: str = ""
+    reasoning: bool = False  # Whether reasoning is enabled for this model
+    note: str = ""  # Extra information about the model (e.g., thinking budget, warnings)
 
     @classmethod
-    def …
-        """Inits the result from the …
+    def init_from_new_format_json_file(self, json_filepath):
+        """Inits the result from the new format model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
 
-        …
+        results = data.get("results")
 
-        …
-        …
+        full_model = data.get("config_general", {}).get("model_name", "").strip()
+        result_key = full_model.replace("/", "_")
 
-        …
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        …
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
+        org, model = full_model.split("/", 1) if "/" in full_model else ("", full_model)
 
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
+        still_on_hub, _, model_config = cached_is_model_on_hub(full_model, "main")
+
         architecture = "?"
         if model_config is not None:
             architectures = getattr(model_config, "architectures", None)
             if architectures:
                 architecture = ";".join(architectures)
 
-        # Extract results available in this file
-        …
+        # Extract results available in this file
+        score_results = {}
         for task in Tasks:
             task = task.value
-            …
+            benchmark_id = task.benchmark
+            metric = task.metric
+
+            scores = [
+                results[key][metric]
+                for key in results
+                if "|" in key and benchmark_id.startswith(key.split("|")[1].removeprefix("icelandic_evals:"))
+            ]
+            if len(scores) == 0:
                 continue
 
-        mean_acc = np.mean(…
-        …
+            mean_acc = np.mean(scores) * 100.0
+            score_results[benchmark_id] = mean_acc
 
         return self(
             eval_name=result_key,
             full_model=full_model,
             org=org,
             model=model,
-            results=…
-            …
-            revision= config.get("model_sha", ""),
+            results=score_results,
+            revision="",
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
         )
 
     def update_with_request_file(self, requests_path):
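The new cached_is_model_on_hub wrapper memoizes on its (full_model, revision) argument pair, so many result files for the same model cost one network round-trip instead of one per file. A self-contained sketch of the same pattern, with a hypothetical slow_lookup standing in for is_model_on_hub:

    from functools import lru_cache


    @lru_cache(maxsize=256)
    def slow_lookup(full_model: str, revision: str) -> bool:
        # Stand-in for a network call like is_model_on_hub
        print(f"fetching {full_model}@{revision}")
        return True


    slow_lookup("org/model", "main")   # miss: prints and performs the "network call"
    slow_lookup("org/model", "main")   # hit: returns the cached value silently
    print(slow_lookup.cache_info())    # CacheInfo(hits=1, misses=1, maxsize=256, currsize=1)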
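The score extraction in init_from_new_format_json_file aggregates every results key whose task name (the middle segment of the "|"-delimited key, minus the "icelandic_evals:" prefix) is a prefix of the benchmark id. A worked illustration with made-up keys and scores (the key shape is inferred from the parsing code, not from a real results file):

    import numpy as np

    # Hypothetical new-format results dict
    results = {
        "lighteval|icelandic_evals:icelandic_inflection|1": {"exact_match": 0.40},
        "lighteval|icelandic_evals:icelandic_inflection_all|1": {"exact_match": 0.50},
        "lighteval|icelandic_evals:icelandic_belebele|0": {"exact_match": 0.81},
    }

    benchmark_id, metric = "icelandic_inflection_all", "exact_match"
    scores = [
        results[key][metric]
        for key in results
        if "|" in key and benchmark_id.startswith(key.split("|")[1].removeprefix("icelandic_evals:"))
    ]
    print(np.mean(scores) * 100.0)  # ~45.0: both inflection keys match, belebele does not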
@@ -104,10 +101,14 @@ class EvalResult:
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
-            self.reasoning = request.get("reasoning", False)
+            self.reasoning = request.get("reasoning", False) or request.get("gen_kwargs", {}).get(
+                "reasoning_effort", None
+            )
             self.note = request.get("note", "")  # Default to empty string if missing
         except FileNotFoundError:
-            print(…
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
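One subtlety in the reasoning fallback above: `x or y` returns the second operand when the first is falsy, so self.reasoning can end up holding the raw reasoning_effort value (for example a string) rather than a bool. A quick illustration:

    request = {"gen_kwargs": {"reasoning_effort": "high"}}  # hypothetical request file contents

    reasoning = request.get("reasoning", False) or request.get("gen_kwargs", {}).get(
        "reasoning_effort", None
    )
    print(reasoning)        # "high" -- the effort string, not True
    print(bool(reasoning))  # True, so truthiness checks still behave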
@@ -145,6 +146,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
         f"{model_name}_eval_request_*.json",
     )
     request_files = glob.glob(request_files)
+    if len(request_files) == 1:
+        return request_files[0]
 
     # Select correct request file (precision)
     request_file = ""
@@ -161,38 +164,46 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
+    # Collect all JSON files first
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
-        …
+        json_files = [f for f in files if f.endswith(".json")]
+        if len(json_files) == 0:
             continue
 
-        # Sort …
+        # Sort JSON files by date (newer later)
         try:
-            …
-        except …
-            …
+            json_files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except (ValueError, IndexError):
+            # If sorting fails, just use the files as-is or take the last one
+            json_files = [json_files[-1]] if json_files else []
 
-        for file in …
+        for file in json_files:
             model_result_filepaths.append(os.path.join(root, file))
 
-    …
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
-        …
+        try:
+            # Creation of result
+            eval_result = EvalResult.init_from_new_format_json_file(model_result_filepath)
+            eval_result.update_with_request_file(requests_path)
+
+            # Store results of same eval together
+            eval_name = eval_result.eval_name
+            if eval_name in eval_results:
+                # Update with newer scores
+                eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            else:
+                eval_results[eval_name] = eval_result
+        except Exception as e:
+            # Log error but continue processing other files
+            print(f"Error processing {model_result_filepath}: {e}")
+            continue
 
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict()
+            v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
             continue
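The sort key above strips "results_" and ".json" and drops the last seven characters before comparing, which lines up with timestamped names like results_<ISO timestamp>.<microseconds>.json where the final seven characters are the ".<microseconds>" part (the exact filename scheme is an assumption). A quick check of the key on such names:

    filenames = [
        "results_2024-05-02T10-15-30.123456.json",  # hypothetical timestamped names
        "results_2023-11-20T08-00-00.654321.json",
    ]

    key = lambda x: x.removesuffix(".json").removeprefix("results_")[:-7]
    print(key(filenames[0]))   # "2024-05-02T10-15-30"
    filenames.sort(key=key)    # lexicographic on the timestamp => chronological
    print(filenames[-1])       # newest file last, matching "newer later"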