saattrupdan committed on
Commit
637c71d
·
1 Parent(s): 4a4e1c1

fix: Use all datasets from a task, use ranks instead of log_ranks

Browse files
Files changed (1) hide show
  1. app.py +14 -7
app.py CHANGED
@@ -647,8 +647,8 @@ def produce_radial_plot(
647
  best_scores = scores
648
  ranks.append(rank)
649
 
650
- log_ranks = np.log(ranks)
651
- scores = 1 - (log_ranks / log_ranks.max())
652
  for model_id, score in zip(model_ids_sorted, scores):
653
  all_rank_scores[task][language][model_id] = score
654
  logger.info("Successfully computed rank scores.")
@@ -786,15 +786,22 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
786
  for test_score_dict in raw_results
787
  ]
788
  if dataset.task in data_dict[model_name]:
789
- data_dict[model_name][dataset.task].append(scores)
790
  else:
791
- data_dict[model_name][dataset.task] = [scores]
 
 
 
 
 
 
 
792
  results_df = pd.DataFrame(data_dict).T.map(
793
  lambda lists_or_nan:
794
- list(it.chain(lists_or_nan))
795
- if lists_or_nan == lists_or_nan
796
  else lists_or_nan
797
- ).dropna().map(lambda lst: lst[0])
798
  results_dfs[language] = results_df
799
 
800
  logger.info("Successfully fetched results from ScandEval benchmark.")
 
647
  best_scores = scores
648
  ranks.append(rank)
649
 
650
+ ranks = np.asarray(ranks)
651
+ scores = 1 - (ranks / ranks.max())
652
  for model_id, score in zip(model_ids_sorted, scores):
653
  all_rank_scores[task][language][model_id] = score
654
  logger.info("Successfully computed rank scores.")
 
786
  for test_score_dict in raw_results
787
  ]
788
  if dataset.task in data_dict[model_name]:
789
+ data_dict[model_name][dataset.task][dataset] = scores
790
  else:
791
+ data_dict[model_name][dataset.task] = {dataset: scores}
792
+
793
+ # Compute the task scores as the mean of the scores for each dataset
794
+ for model_name, task_dict in data_dict.items():
795
+ for task, dataset_dict in task_dict.items():
796
+ values = np.asarray(list(dataset_dict.values())).mean(axis=0)
797
+ data_dict[model_name][task] = values
798
+
799
  results_df = pd.DataFrame(data_dict).T.map(
800
  lambda lists_or_nan:
801
+ list(it.chain(*lists_or_nan))
802
+ if isinstance(lists_or_nan, list)
803
  else lists_or_nan
804
+ ).dropna()
805
  results_dfs[language] = results_df
806
 
807
  logger.info("Successfully fetched results from ScandEval benchmark.")