Clémentine committed · Commit 943f952
Parent(s): 314f91a

update read
Files changed:
- README.md +24 -3
- src/display/about.py +5 -3
- src/leaderboard/read_evals.py +5 -9
README.md CHANGED

````diff
@@ -1,6 +1,6 @@
 ---
-title: 
-emoji: 
+title: Demo Leaderboard
+emoji: 🥇
 colorFrom: green
 colorTo: indigo
 sdk: gradio
@@ -12,4 +12,25 @@ license: apache-2.0
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
-Most of the variables to change for a default leaderboard are in env (replace the path for your leaderboard) and src/display/about.
+Most of the variables to change for a default leaderboard are in env (replace the path for your leaderboard) and src/display/about.
+
+Results files should have the following format:
+```
+{
+    "config": {
+        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
+        "model_name": "path of the model on the hub: org/model",
+        "model_sha": "revision on the hub",
+    },
+    "results": {
+        "task_name": {
+            "metric_name": score,
+        },
+        "task_name2": {
+            "metric_name": score,
+        }
+    }
+}
+```
+
+Request files are created automatically by this tool.
````
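For concreteness, a results file following the format documented above could be produced like this; this is a minimal sketch, and the org/model name, task keys, scores, and output filename are placeholders rather than values from this repository:

```python
import json

# Hypothetical results file matching the format documented in README.md above.
# "demo-org/demo-model", the task keys, and the scores are placeholders.
results = {
    "config": {
        "model_dtype": "torch.float16",  # or torch.bfloat16 or 8bit or 4bit
        "model_name": "demo-org/demo-model",
        "model_sha": "main",
    },
    "results": {
        "task_name1": {"metric_name": 0.61},
        "task_name2": {"metric_name": 0.74},
    },
}

with open("results_demo-org_demo-model.json", "w") as fp:
    json.dump(results, fp, indent=4)
```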
src/display/about.py CHANGED

```diff
@@ -10,15 +10,17 @@ class Task:
 
 # Init: to update with your specific keys
 class Tasks(Enum):
-    
-    
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("task_name1", "metric_name", "First task")
+    task1 = Task("task_name2", "metric_name", "Second task")
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
+Intro text
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
```
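To see how these new enum entries tie result files to leaderboard columns, here is a self-contained sketch; the `Task` dataclass is reconstructed from the comment in the hunk, with `benchmark` and `metric` matching the attribute names used in src/leaderboard/read_evals.py below, and `col_name` an assumed name for the display-name field:

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # task_key in the results json (read as task.benchmark in read_evals.py)
    metric: str     # metric_key in the results json (read as task.metric in read_evals.py)
    col_name: str   # name to display in the leaderboard (field name assumed here)

class Tasks(Enum):
    task0 = Task("task_name1", "metric_name", "First task")
    task1 = Task("task_name2", "metric_name", "Second task")

# Each entry maps a key in a results file's data["results"] to a leaderboard column.
for task in Tasks:
    print(task.value.benchmark, "->", task.value.col_name)
```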
src/leaderboard/read_evals.py CHANGED

```diff
@@ -5,8 +5,6 @@ import os
 from dataclasses import dataclass
 
 import dateutil
-from datetime import datetime
-from transformers import AutoConfig
 import numpy as np
 
 from src.display.formatting import make_clickable_model
@@ -16,7 +14,6 @@ from src.submission.check_validity import is_model_on_hub
 
 @dataclass
 class EvalResult:
-    # Also see src.display.utils.AutoEvalColumn for what will be displayed.
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
     org: str 
@@ -26,7 +23,7 @@ class EvalResult:
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown" 
+    architecture: str = "Unknown" 
     license: str = "?"
     likes: int = 0
     num_params: int = 0
@@ -39,8 +36,7 @@ class EvalResult:
         with open(json_filepath) as fp:
             data = json.load(fp)
 
-        
-        config = data.get("config", data.get("config_general", None))
+        config = data.get("config")
 
         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
@@ -59,7 +55,7 @@ class EvalResult:
             result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
 
-        still_on_hub, 
+        still_on_hub, _, model_config = is_model_on_hub(
             full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
@@ -73,8 +69,8 @@ class EvalResult:
         for task in Tasks:
             task = task.value
 
-            # We average all scores of a given metric
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark 
+            # We average all scores of a given metric (not all metrics are present in all files)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
```
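The behavioral change in the last hunk is that a results entry now has to match `task.benchmark` exactly (`task.benchmark == k`) to be counted. Below is a minimal sketch of that extraction loop, using a stripped-down `Task` and a placeholder `data` dict in place of a parsed results file; the final print with the mean is illustrative, not taken from this diff:

```python
from dataclasses import dataclass
from enum import Enum

import numpy as np

@dataclass
class Task:
    benchmark: str
    metric: str

class Tasks(Enum):
    task0 = Task("task_name1", "metric_name")
    task1 = Task("task_name2", "metric_name")

# Placeholder for the parsed results file (what json.load would return).
data = {
    "results": {
        "task_name1": {"metric_name": 0.61},
        "task_name2": {"other_metric": 0.5},  # lacks "metric_name", so it is skipped
    }
}

for task in Tasks:
    task = task.value
    # Keep only entries whose key equals task.benchmark, as in the new line above.
    accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
    if accs.size == 0 or any(acc is None for acc in accs):
        continue
    print(task.benchmark, np.mean(accs))
```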