sort results by date
- src/about.py +23 -22
- src/leaderboard/read_evals.py +68 -37
src/about.py
CHANGED
@@ -7,35 +7,36 @@ class Task:
     metric: str
     col_name: str
     type: str
+    baseline: float = 0.0
 
 
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    # task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn", "multiple_choice")
-    task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g", "generate_until")
-    task4 = Task("polemo2_in_multiple_choice", "acc,none", "polemo2-in_mc", "multiple_choice")
-    task5 = Task("polemo2_out", "exact_match,score-first", "polemo2-out_g", "generate_until")
-    task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice")
-    task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice")
-    task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until")
-    task9a = Task("polish_belebele_mc", "acc,none", "belebele_mc", "multiple_choice")
-    task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until")
-    task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice")
-    task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until")
-    task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc", "multiple_choice")
-    task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g", "generate_until")
-    task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc", "multiple_choice")
-    task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until")
-    task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc", "multiple_choice")
-    task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until")
-    task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice")
-    task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until")
+    # task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn", "multiple_choice", 0.279)
+    task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g", "generate_until", 0.416)
+    task4 = Task("polemo2_in_multiple_choice", "acc,none", "polemo2-in_mc", "multiple_choice", 0.416)
+    task5 = Task("polemo2_out", "exact_match,score-first", "polemo2-out_g", "generate_until", 0.368)
+    task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice", 0.368)
+    task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice", 0.143)
+    task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until", 0.143)
+    task9a = Task("polish_belebele_mc", "acc,none", "belebele_mc", "multiple_choice", 0.279)
+    task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until", 0.279)
+    task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice", 0.289)
+    task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until", 0.289)
+    task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc", "multiple_choice", 0.419)
+    task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g", "generate_until", 0.419)
+    task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc", "multiple_choice", 0.466)
+    task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until", 0.466)
+    task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc", "multiple_choice", 0.149)
+    task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149)
+    task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343)
+    task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until", 0.343)
     task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
-    task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "
-    task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "
-    task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "
+    # task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677) # multiple_choice
+    # task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0) # generate_until
+    # task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0) # generate_until
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
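The new baseline field on Task stores each benchmark's random or majority-class score as a fraction. A minimal sketch of how such a baseline could rescale a leaderboard score, mirroring the commented-out normalization formula added in read_evals.py below; the normalize helper and the sample numbers are illustrative, not part of the Space:

# Hedged sketch: rescale a 0-100 task score against its baseline so that
# baseline performance maps to 0 and a perfect score to 100.
# The helper name is hypothetical; the formula mirrors the one in read_evals.py.
def normalize(score: float, baseline_fraction: float) -> float:
    b = baseline_fraction * 100   # read_evals.py also multiplies task.value.baseline by 100
    return (score - b) / (100 - b) * 100

# Example with the polemo2-in baseline (0.416) from about.py:
print(normalize(70.0, 0.416))  # ~48.6, well above baseline
print(normalize(41.6, 0.416))  # 0.0, exactly at baseline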
src/leaderboard/read_evals.py
CHANGED
@@ -14,26 +14,28 @@ from src.submission.check_validity import is_model_on_hub
 
 NUM_FEWSHOT = 0
 
+
 @dataclass
 class EvalResult:
-    eval_name: str
-    full_model: str
-    org: str
+    eval_name: str # org_model_precision (uid)
+    full_model: str # org/model (path on hub)
+    org: str
     model: str
-    revision: str
+    revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown
-    weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     lang: str = "?"
    likes: int = 0
     num_params: int = 0
-    date: str = ""
+    date: str = "" # submission date of request file
     still_on_hub: bool = False
     n_shot: NShotType = NShotType.n0
     org_and_model: str = ""
+    start_date: float = 0
 
     @classmethod
     def init_from_json_file(self, json_filepath, n_shot_num):
@@ -43,6 +45,7 @@ class EvalResult:
 
         config = data.get("config")
         n_shot = data.get("n-shot")
+        start_date = data.get("date", 0)
 
         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
@@ -54,14 +57,17 @@ class EvalResult:
 
         if re.match(r"^pretrained=(.*/(plgkwrobel|plggspkl)/)(models/)?", org_and_model):
             org_and_model = re.sub(r"^pretrained=(.*/(plgkwrobel|plggspkl)/)(models/)?", SPICHLERZ_ORG, org_and_model)
-        org_and_model = org_and_model.replace(",dtype=bfloat16", "")
 
-        org_and_model=org_and_model.replace("
-        org_and_model=org_and_model.replace("
+        org_and_model = org_and_model.replace(",dtype=bfloat16", "")
+        org_and_model = org_and_model.replace(",dtype=float16", "")
+
+        org_and_model = org_and_model.replace("models/hf_v7_e1", "APT3-1B-Instruct-e1")
+        org_and_model = org_and_model.replace("models/hf_v7_e2", "APT3-1B-Instruct-e2")
 
         org_and_model = re.sub(r"^pretrained=", "", org_and_model)
         org_and_model = org_and_model.replace(",trust_remote_code=True", "")
         org_and_model = re.sub(",prefix_token_id=\d+", "", org_and_model)
+        org_and_model = re.sub("/$", "", org_and_model)
 
         org_and_model = org_and_model.split("/", 1)
 
@@ -90,7 +96,8 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if
+                             task.benchmark == k and n_shot.get(k, -1) == n_shot_num])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
@@ -98,7 +105,8 @@ class EvalResult:
                 mean_acc = np.mean(accs)
             else:
                 mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
+            results[task.benchmark] = (mean_acc, start_date)
+            # results[task.benchmark] = mean_acc
 
         return self(
             eval_name=result_key,
@@ -106,27 +114,27 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision=
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture,
             n_shot=NShotType.from_str(n_shot_num),
-            org_and_model=orig_org_and_model
+            org_and_model=orig_org_and_model,
+            start_date=start_date
         )
 
     def update_with_metadata(self, metadata):
-        #print('UPDATE', self.full_model, self.model, self.eval_name)
+        # print('UPDATE', self.full_model, self.model, self.eval_name)
         try:
-            meta=metadata[self.full_model]
+            meta = metadata[self.full_model]
             self.model_type = ModelType.from_str(meta.get("type", "?"))
             self.num_params = meta.get("params", 0)
             self.license = meta.get("license", "?")
             self.lang = meta.get("lang", "?")
-            #TODO desc name
+            # TODO desc name
         except KeyError:
             print(f"Could not find metadata for {self.full_model}")
 
-
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
         return
@@ -149,12 +157,18 @@ class EvalResult:
         g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
         mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
         all_tasks = g_tasks + mc_tasks
-        average = sum([v for task,v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
-        average_g = sum([v for task,v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
-        average_mc = sum([v for task,v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
 
+        baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
 
-
+        average = sum([v for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
+        average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
+        average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
+
+        # average = sum([(v-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
+        # average_g = sum([(v-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
+        # average_mc = sum([(v-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
+
+        data_dict = {}
         # data_dict = {
         #     "eval_name": self.eval_name, # not a column, just a save name,
         #     AutoEvalColumn.precision.name: self.precision.value.name,
@@ -188,7 +202,6 @@ class EvalResult:
         except KeyError:
             print(f"Could not find model type")
 
-
         try:
             data_dict[AutoEvalColumn.model_type_symbol.name] = self.model_type.value.symbol
         except KeyError:
@@ -209,7 +222,8 @@ class EvalResult:
             print(f"AttributeError architecture")
 
         try:
-            data_dict[AutoEvalColumn.model.name] = make_clickable_model(
+            data_dict[AutoEvalColumn.model.name] = make_clickable_model(
+                self.full_model) if self.still_on_hub else self.full_model
         except KeyError:
             print(f"Could not find model")
 
@@ -305,8 +319,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
        with open(tmp_request_file, "r") as f:
            req_content = json.load(f)
        if (
-
-
+            req_content["status"] in ["FINISHED"]
+            and req_content["precision"] == precision.split(".")[-1]
        ):
            request_file = tmp_request_file
    return request_file
@@ -330,30 +344,48 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
        for file in files:
            model_result_filepaths.append(os.path.join(root, file))
 
+    # print('PATHS:', model_result_filepaths)
+
    eval_results = {}
-    for n_shot in [0,5]:
+    for n_shot in [0, 5]:
        for model_result_filepath in model_result_filepaths:
            # Creation of result
            eval_result = EvalResult.init_from_json_file(model_result_filepath, n_shot_num=n_shot)
            eval_result.update_with_request_file(requests_path)
-            #update with metadata
+            # update with metadata
            eval_result.update_with_metadata(metadata)
 
-
            # Store results of same eval together
            eval_name = f"{eval_result.eval_name}_{n_shot}-shot"
            if eval_name in eval_results.keys():
-
-
+
+                for k, (v, start_date) in eval_result.results.items():
+                    if v is not None:
+                        if k in eval_results[eval_name].results:
+                            if start_date > eval_results[eval_name].results[k][1]:
+                                print(
+                                    f"Overwriting {eval_name}.results {k} {eval_results[eval_name].results[k]} with {v}: {model_result_filepath} {n_shot} {eval_result.start_date} {eval_results[eval_name].start_date}")
+                                eval_results[eval_name].results[k] = (v, start_date)
+                            else:
+                                print(
+                                    f"Skipping {eval_name} {eval_result.start_date} {eval_results[eval_name].start_date}: {model_result_filepath} {n_shot}")
+                        else:
+                            eval_results[eval_name].results[k] = (v, start_date)
+                # eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+                # TODO: log updated
+
            else:
                eval_results[eval_name] = eval_result
 
+    for k,v in eval_results.items():
+        v.results = {k: v for k, (v, start_date) in v.results.items()}
+
    results = []
    for v in eval_results.values():
        try:
            print(v)
-            v.to_dict()
-            #if v.results:
+            v.to_dict() # we test if the dict version is complete
+            # if v.results:
            results.append(v)
        except KeyError: # not all eval values present
            print(f"not all eval values present {v.eval_name} {v.full_model}")
@@ -370,7 +402,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
                missing_results_for_task[task_name].append(f"{v.full_model}|{v.org_and_model}")
            else:
                missing_results_for_task[task_name] = [f"{v.full_model}|{v.org_and_model}"]
-        if r[AutoEvalColumn.lang.name] is None or r[AutoEvalColumn.lang.name]=="?":
+        if r[AutoEvalColumn.lang.name] is None or r[AutoEvalColumn.lang.name] == "?":
            missing_metadata.append(f"{v.full_model}")
 
    # print('missing_results_for_task', missing_results_for_task)
@@ -386,5 +418,4 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
        print(model)
    print()
 
-
    return results
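Taken together, the read_evals.py changes make each task score carry the start date of the run it came from, so that when the same model, precision and n-shot combination appears in several result files only the most recent score is kept, which is what the commit message "sort results by date" refers to. A self-contained sketch of that merge step, with invented task keys and Unix timestamps:

# Hedged sketch of the keep-the-newest-score merge added in get_raw_eval_results.
# Task keys, scores and dates below are made up for illustration.
existing = {"polemo2-in_g": (61.0, 1700000000.0)}      # (score, start_date)
incoming = {"polemo2-in_g": (64.5, 1710000000.0),
            "8tags_mc":     (35.2, 1710000000.0)}

for task, (score, date) in incoming.items():
    if score is None:
        continue
    if task not in existing or date > existing[task][1]:
        existing[task] = (score, date)                  # newer run overwrites older one

# Before building the leaderboard table the dates are stripped again, mirroring
# `v.results = {k: v for k, (v, start_date) in v.results.items()}`:
final_scores = {task: score for task, (score, _) in existing.items()}
print(final_scores)   # {'polemo2-in_g': 64.5, '8tags_mc': 35.2}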