Clémentine
Cleaned and refactored the code, improved filtering, added selection of deleted models
8c49cb6
import json
import os
from dataclasses import dataclass
from typing import Dict, List, Tuple

import dateutil.parser
import numpy as np

from src.display_models.utils import AutoEvalColumn, make_clickable_model

# One metric per benchmark, paired positionally via zip(BENCHMARKS, METRICS).
METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
BENCH_TO_NAME = {
    "arc:challenge": AutoEvalColumn.arc.name,
    "hellaswag": AutoEvalColumn.hellaswag.name,
    "hendrycksTest": AutoEvalColumn.mmlu.name,
    "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
}
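# Illustrative sketch (not part of the original file): the positional zip()
# pairing above is equivalent to this mapping. BENCHMARK_TO_METRIC is a
# hypothetical name, shown only to make the pairing explicit.
#
#   BENCHMARK_TO_METRIC = dict(zip(BENCHMARKS, METRICS))
#   # {"arc:challenge": "acc_norm", "hellaswag": "acc_norm",
#   #  "hendrycksTest": "acc", "truthfulqa:mc": "mc2"}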
@dataclass
class EvalResult:
    """One model's evaluation results, keyed by benchmark."""

    eval_name: str
    org: str
    model: str
    revision: str
    results: dict
    precision: str = ""
    model_type: str = ""
    weight_type: str = ""

    def to_dict(self):
        """Flatten this result into a single leaderboard row."""
        from src.load_from_hub import is_model_on_hub

        if self.org is not None:
            base_model = f"{self.org}/{self.model}"
        else:
            base_model = f"{self.model}"

        data_dict = {}
        data_dict["eval_name"] = self.eval_name  # not a column, just a save name
        data_dict["weight_type"] = self.weight_type  # not a column, just a save name
        data_dict[AutoEvalColumn.precision.name] = self.precision
        data_dict[AutoEvalColumn.model_type.name] = self.model_type
        data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
        data_dict[AutoEvalColumn.dummy.name] = base_model
        data_dict[AutoEvalColumn.revision.name] = self.revision
        # Average over the four benchmarks; a missing benchmark counts as 0.
        data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
        data_dict[AutoEvalColumn.still_on_hub.name] = (
            is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
        )

        # Fill in any benchmark missing from the results.
        for benchmark in BENCHMARKS:
            if benchmark not in self.results.keys():
                self.results[benchmark] = None

        for k, v in BENCH_TO_NAME.items():
            data_dict[v] = self.results[k]
        return data_dict
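# Illustrative sketch (not part of the original file): building one EvalResult
# by hand and flattening it. All field values here are invented; to_dict() also
# needs the repo's src package importable for is_model_on_hub and AutoEvalColumn.
#
#   result = EvalResult(
#       eval_name="org_model_abc123_torch.float16",
#       org="org",
#       model="model",
#       revision="abc123",
#       results={"arc:challenge": 56.0, "hellaswag": 79.1,
#                "hendrycksTest": 45.2, "truthfulqa:mc": 38.9},
#       precision="torch.float16",
#   )
#   row = result.to_dict()  # one leaderboard row, keyed by column names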
def parse_eval_result(json_filepath: str) -> Tuple[str, List[EvalResult]]:
    """Parse one results JSON file into a result key and a list of EvalResults."""
    with open(json_filepath) as fp:
        data = json.load(fp)

    for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
        if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
            return None, []  # we skip models with the wrong version

    try:
        config = data["config"]
    except KeyError:
        config = data["config_general"]
    model = config.get("model_name", None)
    if model is None:
        model = config.get("model_args", None)
    model_sha = config.get("model_sha", "")
    model_split = model.split("/", 1)
    precision = config.get("model_dtype")

    model = model_split[-1]

    if len(model_split) == 1:
        org = None
        model = model_split[0]
        result_key = f"{model}_{model_sha}_{precision}"
    else:
        org = model_split[0]
        model = model_split[1]
        result_key = f"{org}_{model}_{model_sha}_{precision}"

    eval_results = []
    for benchmark, metric in zip(BENCHMARKS, METRICS):
        # Average the metric over every sub-task belonging to this benchmark
        # (MMLU, for instance, is split across many hendrycksTest-* tasks).
        accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
        if accs.size == 0 or any([acc is None for acc in accs]):
            continue
        mean_acc = np.mean(accs) * 100.0
        eval_results.append(
            EvalResult(
                eval_name=result_key,
                org=org,
                model=model,
                revision=model_sha,
                results={benchmark: mean_acc},
                precision=precision,  # todo model_type=, weight_type=
            )
        )

    return result_key, eval_results
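# Illustrative sketch (not part of the original file): the minimal JSON shape
# parse_eval_result() handles. Keys follow the harness "suite|task|n_shot"
# pattern; the values here are invented.
#
#   {
#     "config": {"model_name": "org/model", "model_sha": "abc123",
#                "model_dtype": "torch.float16"},
#     "versions": {"harness|hendrycksTest-abstract_algebra|5": 1},
#     "results": {"harness|arc:challenge|25": {"acc_norm": 0.56},
#                 "harness|hellaswag|10": {"acc_norm": 0.79}}
#   }
#
# For that file, the function returns ("org_model_abc123_torch.float16", [...])
# with one EvalResult per benchmark found (arc:challenge at 56.0, hellaswag at
# 79.0); get_eval_results() later merges them under the shared result key.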
def get_eval_results() -> List[EvalResult]:
    json_filepaths = []

    for root, _dirnames, files in os.walk("eval-results"):
        # We should only have json files in model results
        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
            continue

        # Sort the files by date
        # store results by precision maybe?
        try:
            files.sort(key=lambda x: dateutil.parser.parse(x.split("_", 1)[-1][:-5]))
        except dateutil.parser.ParserError:
            files = [files[-1]]

        # up_to_date = files[-1]
        for file in files:
            json_filepaths.append(os.path.join(root, file))

    eval_results = {}
    for json_filepath in json_filepaths:
        result_key, results = parse_eval_result(json_filepath)
        # Merge per-benchmark results that share the same model key.
        for eval_result in results:
            if result_key in eval_results.keys():
                eval_results[result_key].results.update(eval_result.results)
            else:
                eval_results[result_key] = eval_result

    return list(eval_results.values())
def get_eval_results_dicts() -> List[Dict]:
    eval_results = get_eval_results()
    return [e.to_dict() for e in eval_results]
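
if __name__ == "__main__":
    # Minimal usage sketch (not part of the original file): walk the local
    # eval-results/ directory and print the flattened leaderboard rows.
    # Assumes the script runs from the repo root with src/ importable and
    # eval-results/ populated with harness output files.
    for row in get_eval_results_dicts():
        print(row["eval_name"], row)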