Update src/leaderboard/read_evals.py
src/leaderboard/read_evals.py CHANGED
```diff
@@ -109,6 +109,7 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
+        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
```
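The `average` computed in this hunk (and stored as a column in the next one) skips `None` scores in the numerator but divides by the full `len(Tasks)`, so a benchmark with no score counts as zero rather than being excluded. A minimal runnable sketch of that behavior; the `Tasks` payloads and `results` values below are illustrative stand-ins, not the repo's actual tasks:

```python
from enum import Enum

# Hypothetical stand-in for the leaderboard's Tasks enum; the
# (benchmark, metric, col_name) tuples are made up for illustration.
class Tasks(Enum):
    task0 = ("anli_r1", "acc", "ANLI")
    task1 = ("logiqa", "acc_norm", "LogiQA")

results = {"anli_r1": 35.0, "logiqa": None}  # logiqa has no score yet

# None scores are dropped from the sum, but the denominator is the
# full task count, so the missing benchmark counts as zero.
average = sum([v for v in results.values() if v is not None]) / len(Tasks)
print(average)  # 17.5, not 35.0
```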
```diff
@@ -118,15 +119,16 @@ class EvalResult:
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,
+            AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
-
+
+        print("Tasks", Tasks)
         for task in Tasks:
-
-            data_dict[task_value.col_name] = self.results[task_value.benchmark]
+            data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
         return data_dict
 
```
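The loop rewrite is the substantive fix: `task_value` was never defined, so the old body raised a `NameError` as soon as `to_dict()` ran. Iterating an `Enum` yields its members, and the wrapped record lives on `.value`. A self-contained sketch, assuming a `Task` dataclass shaped like the usual leaderboard template:

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str

# Illustrative members; the real enum is defined elsewhere in the repo.
class Tasks(Enum):
    task0 = Task("anli_r1", "acc", "ANLI")
    task1 = Task("logiqa", "acc_norm", "LogiQA")

results = {"anli_r1": 35.0, "logiqa": 42.1}

data_dict = {}
for task in Tasks:  # `task` is a Tasks member, not a Task
    data_dict[task.value.col_name] = results[task.value.benchmark]

print(data_dict)  # {'ANLI': 35.0, 'LogiQA': 42.1}
```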
```diff
@@ -185,7 +187,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
     results = []
     for v in eval_results.values():
-        print("v", v)
         try:
             v.to_dict()  # we test if the dict version is complete
             results.append(v)
```
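The removed `print("v", v)` was leftover debug output; the pattern around it carries the actual logic: `to_dict()` doubles as a completeness probe, and any result that raises is kept off the board. The diff is cut off before the `except` clause, so the `except KeyError` below is an assumption, as is the `FakeResult` stand-in for `EvalResult`:

```python
# FakeResult is a hypothetical stand-in for EvalResult, just enough to
# make the filtering pattern runnable in isolation.
class FakeResult:
    def __init__(self, scores):
        self.scores = scores

    def to_dict(self):
        # Raises KeyError when a required benchmark score is absent.
        return {"ANLI": self.scores["anli_r1"], "LogiQA": self.scores["logiqa"]}

eval_results = {
    "model-a": FakeResult({"anli_r1": 35.0, "logiqa": 42.1}),
    "model-b": FakeResult({"anli_r1": 33.2}),  # incomplete result
}

results = []
for v in eval_results.values():
    try:
        v.to_dict()  # we test if the dict version is complete
        results.append(v)
    except KeyError:
        continue  # skip models whose results are still partial

print(len(results))  # 1 -> only model-a survives the probe
```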