Spaces:

evalitahf
/

evalita_llm_leaderboard

Running

App Files Files Community

rzanoli committed 18 days ago

Commit

c996d40

1 Parent(s): 8886020

Small changes

Browse files

Files changed (2) hide show

src/leaderboard/read_evals.py +203 -95
src/leaderboard/read_evals_old.py +0 -296

src/leaderboard/read_evals.py CHANGED Viewed

@@ -1,95 +1,203 @@
-import glob
-import json
-import math
-import os
-from dataclasses import dataclass
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, FewShotType
-from src.submission.check_validity import is_model_on_hub
-@dataclass
-class EvalResult:
-    eval_name: str
-    full_model: str
-    org: str
-    model: str
-    revision: str
-    results: dict
-    average_CPS: str
-    fewshot: int
-    fewshot_type: FewShotType = FewShotType.Unknown
-    weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""
-    still_on_hub: bool = False
-    @classmethod
-    def init_from_json_file(cls, json_filepath):
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-        config = data.get("config")
-        average_CPS = f"{data.get('average_CPS'):.2f}"
-        num_fewshot = int(config.get("num_fewshot", 0))
-        fewshot_type = FewShotType.from_num_fewshot(num_fewshot)
-        model_type = ModelType.from_str(config.get("model_type")) if config.get("model_type") else None
-        num_params = math.ceil(config.get("num_params_billion", 0)) if config.get("num_params_billion") else 0
-        org_and_model = config.get("model_name", "").split("/", 1)
-        org, model = (org_and_model if len(org_and_model) == 2 else (None, org_and_model[0]))
-        full_model = "/".join([org, model] if org else [model])
-        still_on_hub, _, model_config = is_model_on_hub(full_model, config.get("model_sha", "main"))
-        architecture = ";".join(getattr(model_config, "architectures", [])) if model_config else "?"
-        results = {
-            task.value.benchmark: f"{data['tasks'].get(task.value.benchmark, {}).get(task.metric_type, 0):.2f}"
-            for task in Tasks
-        }
-        return cls(
-            eval_name=f"{model}_{num_fewshot}",
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            average_CPS=average_CPS,
-            fewshot=fewshot_type,
-            fewshot_type=fewshot_type,
-            revision=config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture,
-            num_params=num_params
-        )
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    model_result_filepaths = [
-        os.path.join(root, file)
-        for root, _, files in os.walk(results_path)
-        for file in sorted(files, key=lambda x: x.split("_")[-1], reverse=True) if file.endswith(".json")
-    ]
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_name = eval_result.eval_name
-        if eval_name not in eval_results:
-            eval_results[eval_name] = eval_result
-        else:
-            eval_results[eval_name].results.update(eval_result.results)
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict()  # Test if the dict version is complete
-            results.append(v)
-        except KeyError:
-            continue
-    return results

+import glob
+import json
+import math
+import os
+from dataclasses import dataclass
+import dateutil
+import numpy as np
+#from get_model_info import num_params
+from src.display.formatting import make_clickable_model
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, FewShotType
+from src.submission.check_validity import is_model_on_hub
+@dataclass
+class EvalResult:
+    """One evaluation run of a model, as read from a results JSON file.
+
+    Instances are created with `init_from_json_file` and converted to a
+    leaderboard row with `to_dict`. Request files are not consulted here, so
+    license, likes, weight type and date keep their default values.
+    """
+    eval_name: str  # "<org>_<model>_<num_fewshot>" (or "<model>_<num_fewshot>"), used as merge key
+    full_model: str  # org/model (path on hub)
+    org: str  # None when the model name has no "org/" prefix
+    model: str
+    revision: str  # config["model_sha"], "" when absent
+    results: dict  # task benchmark -> "xx.xx" score string, or int for "Best Prompt Id" columns
+    average_CPS: str  # top-level "average_CPS" formatted with two decimals
+    fewshot: int  # number of few-shot examples, 0 when missing or invalid
+    fewshot_type: FewShotType = FewShotType.Unknown
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
+    license: str = "?"
+    likes: int = 0
+    num_params: int = 0  # "num_params_billion" rounded up to an integer
+    date: str = ""  # not set by init_from_json_file
+    still_on_hub: bool = False
+    @classmethod
+    def init_from_json_file(cls, json_filepath):
+        """Build an EvalResult from one model result file.
+
+        Args:
+            json_filepath: Path to a results JSON file holding a "config"
+                object, an "average_CPS" number and a "tasks" mapping.
+
+        Returns:
+            A new EvalResult containing only the scores found in this file.
+
+        Raises:
+            KeyError: If the file has no "tasks" entry.
+            TypeError: If "average_CPS" is missing (None cannot be formatted
+                as a float).
+        """
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+        config = data.get("config")
+        average_CPS = f"{data.get('average_CPS'):.2f}"
+        # A missing, null or non-numeric "num_fewshot" is treated as 0.
+        try:
+            num_fewshot = int(config.get("num_fewshot", 0))
+        except (TypeError, ValueError):
+            num_fewshot = 0
+        # Determine the few-shot type from the number of shots
+        fewshot_type = FewShotType.from_num_fewshot(num_fewshot)
+        num_params_billion = config.get("num_params_billion")
+        num_params = math.ceil(num_params_billion) if num_params_billion is not None else 0
+        # Get model and org; "model_args" is the fallback when "model_name" is absent
+        org_and_model = config.get("model_name", config.get("model_args", None)).split("/", 1)
+        if len(org_and_model) == 1:
+            org, model = None, org_and_model[0]
+            result_key = f"{model}_{num_fewshot}"
+        else:
+            org, model = org_and_model
+            result_key = f"{org}_{model}_{num_fewshot}"
+        full_model = "/".join(org_and_model)
+        still_on_hub, _, model_config = is_model_on_hub(
+            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+        )
+        architecture = "?"
+        if model_config is not None:
+            architectures = getattr(model_config, "architectures", None)
+            if architectures:
+                architecture = ";".join(architectures)
+        # Extract results available in this file (some results are split in several files)
+        tasks_data = data["tasks"]
+        results = {}
+        for task in Tasks:
+            task = task.value
+            # File keys are the benchmark name without its last two characters
+            # (presumably a per-column suffix -- TODO confirm against Tasks).
+            key = task.benchmark[:-2]
+            if key not in tasks_data:
+                continue
+            task_scores = tasks_data[key]
+            if "Best Prompt Id" in task.col_name:
+                # Only the trailing character of the stored value is kept as the id.
+                results[task.benchmark] = int(task_scores[task.metric_type][-1:])
+            else:
+                results[task.benchmark] = f"{task_scores[task.metric_type]:.2f}"  # Ensure two decimals for display
+        return cls(
+            eval_name=result_key,
+            full_model=full_model,
+            org=org,
+            model=model,
+            results=results,
+            average_CPS=average_CPS,
+            fewshot_type=fewshot_type,
+            fewshot=num_fewshot,
+            revision=config.get("model_sha", ""),
+            still_on_hub=still_on_hub,
+            architecture=architecture,
+            num_params=num_params,
+        )
+    def to_dict(self):
+        """Convert the result to a dict compatible with the dataframe display.
+
+        Returns:
+            A dict keyed by AutoEvalColumn names plus one entry per task
+            column, and "eval_name" as a non-column save name.
+
+        Raises:
+            KeyError: If `results` lacks the benchmark of any task in Tasks.
+        """
+        fewshot_type_symbol = (
+            self.fewshot_type.value.symbol if isinstance(self.fewshot_type, FewShotType) else "❓"
+        )
+        data_dict = {
+            "eval_name": self.eval_name,  # not a column, just a save name
+            AutoEvalColumn.fewshot_type.name: fewshot_type_symbol,
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            AutoEvalColumn.architecture.name: self.architecture,
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.revision.name: self.revision,
+            AutoEvalColumn.average.name: self.average_CPS,
+            AutoEvalColumn.license.name: self.license,
+            AutoEvalColumn.likes.name: self.likes,
+            AutoEvalColumn.params.name: self.num_params,
+            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+        }
+        for task in Tasks:
+            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+        return data_dict
+def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+    """Collect all evaluation results found under the results folder root.
+
+    Args:
+        results_path: Root folder that is walked recursively for result files.
+        requests_path: Unused; kept so existing callers keep working.
+
+    Returns:
+        One EvalResult per distinct `eval_name`, restricted to those whose
+        `to_dict()` succeeds (i.e. every task has a value).
+    """
+    model_result_filepaths = []
+    for root, _, files in os.walk(results_path):
+        # We should only have json files in model results
+        if not files or any(not f.endswith(".json") for f in files):
+            continue
+        # Sort by the name with "results_" / ".json" stripped and the last seven
+        # characters dropped (presumably a timestamp without microseconds -- TODO confirm).
+        files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        model_result_filepaths.extend(os.path.join(root, file) for file in files)
+    eval_results = {}
+    for model_result_filepath in model_result_filepaths:
+        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        # Store results of the same eval together; later files overwrite earlier values
+        existing = eval_results.get(eval_result.eval_name)
+        if existing is None:
+            eval_results[eval_result.eval_name] = eval_result
+        else:
+            existing.results.update({k: v for k, v in eval_result.results.items() if v is not None})
+    results = []
+    for v in eval_results.values():
+        try:
+            v.to_dict()  # we test if the dict version is complete
+        except KeyError:  # not all eval values present
+            continue
+        results.append(v)
+    return results

src/leaderboard/read_evals_old.py DELETED Viewed

@@ -1,296 +0,0 @@
-import glob
-import json
-import math
-import os
-from dataclasses import dataclass
-import dateutil
-import numpy as np
-#from get_model_info import num_params
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, FewShotType
-from src.submission.check_validity import is_model_on_hub
-@dataclass
-class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
-    model: str
-    revision: str # commit hash, "" if main
-    results: dict
-    average_CPS: str
-    fewshot: int
-    #fewshot_type: str
-    #precision: Precision = Precision.Unknown
-    #model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    fewshot_type: FewShotType = FewShotType.Unknown
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = "" # submission date of request file
-    still_on_hub: bool = False
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-        config = data.get("config")
-        average_CPS = f"{data.get('average_CPS'):.2f}"
-        num_fewshot = config.get("num_fewshot", 0)  # Imposta il valore predefinito a 0
-        try:
-            num_fewshot = int(num_fewshot)  # Converte in intero se possibile
-        except ValueError:
-            num_fewshot = 0  # Se la conversione fallisce, assegna 0
-        # Determine the few-shot type (ZS or FS) based on num_fewshot
-        fewshot_type = FewShotType.from_num_fewshot(num_fewshot)  # Use the new
-        #precision = config.get("precision")
-        #print(precision)
-        #print(config, num_fewshot)
-        # Precision
-        #precision = Precision.from_str(config.get("model_dtype"))
-        model_type = config.get("model_type")
-        # Modifica: Convertire model_type in un oggetto Enum (se è un Enum)
-        model_type = ModelType.from_str(model_type) if model_type else None
-        #print("=====================", model_type, config.get("model_name"))
-        # Initialize num_params with a default value (e.g., 0)
-        num_params = int(0)
-        # Controlla se "num_params_billion" esiste in config e non è null
-        num_params_billion = config.get("num_params_billion")
-        if num_params_billion is not None:
-            num_params = math.ceil(num_params_billion)
-        print("^^^^^^^^^^^^^^^^^^^^^^^^^", num_params, config.get("num_params_billion"))
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-        #print(precision.value.name)
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            #result_key = f"{model}_{precision.value.name}"
-            result_key = f"{model}_{num_fewshot}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            #result_key = f"{org}_{model}_{precision.value.name}"
-            result_key = f"{org}_{model}_{num_fewshot}"
-        full_model = "/".join(org_and_model)
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-            '''
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-            '''
-            for k, v in data["tasks"].items():
-                #if task.benchmark == k:
-                if task.benchmark[:-2] == k:
-                    # print(k, "==================", v)
-                    # results[task.benchmark] = v[task.cps]
-                    #print(task.benchmark, v[task.metric])
-                    if "Best Prompt Id" in task.col_name:
-                        results[task.benchmark] = int(v[task.metric_type][-1:])
-                        #print(results[task.benchmark],v[task.metric_type][-1:])
-                    else:
-                        #results[task.benchmark] = round(v[task.metric_type], 2)
-                        # Format the value to 2 decimal places (ensure it's always shown as xx.xx)
-                        results[task.benchmark] = f"{v[task.metric_type]:.2f}"  # Ensure two decimals for display
-                    #results[task.benchmark + "_" + task.metric] = 1.0
-                    #results[task.benchmark] = v[task.accuracy]
-                    # print("======", results[task.benchmark])
-                    #results[task.benchmark] = 1.0
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            average_CPS=average_CPS,
-            fewshot_type=fewshot_type,  # Set the fewshot type (ZS or FS)
-            fewshot=num_fewshot,
-            #model_type=model_type,
-            #precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture,
-            num_params=num_params
-        )
-    '''
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision
-    '''
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        #average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        average = self.average_CPS
-        fewshot = self.fewshot
-        # Ottiene il simbolo di FewShotType in modo simile a ModelType
-        fewshot_type_symbol = (
-            self.fewshot_type.value.symbol if isinstance(self.fewshot_type, FewShotType) else "❓"
-        )
-        #("?????", fewshot)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            #AutoEvalColumn.precision.name: self.precision.value.name,
-            #AutoEvalColumn.model_type.name: self.model_type.value.name,
-            #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            #AutoEvalColumn.model_type.name: self.model_type.value.name if self.model_type else "Unknown",
-            #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol if self.model_type else "Unknown",
-            AutoEvalColumn.fewshot_type.name: fewshot_type_symbol,  # Simbolo corretto per fewshot type
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            #AutoEvalColumn.fewshot.name: fewshot,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-        return data_dict
-'''
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
-'''
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        #eval_result.update_with_request_file(requests_path)
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict() # we test if the dict version is complete
-            results.append(v)
-        except KeyError:  # not all eval values present
-            continue
-    return results