saattrupdan
committed on
Commit
·
637c71d
1
Parent(s):
4a4e1c1
fix: Use all datasets from a task, use ranks instead of log_ranks
Browse files
app.py
CHANGED
@@ -647,8 +647,8 @@ def produce_radial_plot(
|
|
647 |
best_scores = scores
|
648 |
ranks.append(rank)
|
649 |
|
650 |
-
|
651 |
-
scores = 1 - (
|
652 |
for model_id, score in zip(model_ids_sorted, scores):
|
653 |
all_rank_scores[task][language][model_id] = score
|
654 |
logger.info("Successfully computed rank scores.")
|
@@ -786,15 +786,22 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
|
|
786 |
for test_score_dict in raw_results
|
787 |
]
|
788 |
if dataset.task in data_dict[model_name]:
|
789 |
-
data_dict[model_name][dataset.task]
|
790 |
else:
|
791 |
-
data_dict[model_name][dataset.task] =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
792 |
results_df = pd.DataFrame(data_dict).T.map(
|
793 |
lambda lists_or_nan:
|
794 |
-
list(it.chain(lists_or_nan))
|
795 |
-
if lists_or_nan
|
796 |
else lists_or_nan
|
797 |
-
).dropna()
|
798 |
results_dfs[language] = results_df
|
799 |
|
800 |
logger.info("Successfully fetched results from ScandEval benchmark.")
|
|
|
647 |
best_scores = scores
|
648 |
ranks.append(rank)
|
649 |
|
650 |
+
ranks = np.asarray(ranks)
|
651 |
+
scores = 1 - (ranks / ranks.max())
|
652 |
for model_id, score in zip(model_ids_sorted, scores):
|
653 |
all_rank_scores[task][language][model_id] = score
|
654 |
logger.info("Successfully computed rank scores.")
|
|
|
786 |
for test_score_dict in raw_results
|
787 |
]
|
788 |
if dataset.task in data_dict[model_name]:
|
789 |
+
data_dict[model_name][dataset.task][dataset] = scores
|
790 |
else:
|
791 |
+
data_dict[model_name][dataset.task] = {dataset: scores}
|
792 |
+
|
793 |
+
# Compute the task scores as the mean of the scores for each dataset
|
794 |
+
for model_name, task_dict in data_dict.items():
|
795 |
+
for task, dataset_dict in task_dict.items():
|
796 |
+
values = np.asarray(list(dataset_dict.values())).mean(axis=0)
|
797 |
+
data_dict[model_name][task] = values
|
798 |
+
|
799 |
results_df = pd.DataFrame(data_dict).T.map(
|
800 |
lambda lists_or_nan:
|
801 |
+
list(it.chain(*lists_or_nan))
|
802 |
+
if isinstance(lists_or_nan, list)
|
803 |
else lists_or_nan
|
804 |
+
).dropna()
|
805 |
results_dfs[language] = results_df
|
806 |
|
807 |
logger.info("Successfully fetched results from ScandEval benchmark.")
|