Commit
·
9a46da5
1
Parent(s):
65f7993
feat: Use t-tests to determine win ratios
Browse files
- app.py +26 -13
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -12,6 +12,8 @@ import requests
|
|
| 12 |
import random
|
| 13 |
import logging
|
| 14 |
import datetime as dt
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
fmt = "%(asctime)s [%(levelname)s] <%(name)s> %(message)s"
|
|
@@ -408,13 +410,19 @@ def produce_radial_plot(
|
|
| 408 |
for language in languages:
|
| 409 |
if model_id not in results_dfs_filtered[language].index:
|
| 410 |
continue
|
| 411 |
-
|
| 412 |
win_ratio = 100 * np.mean([
|
| 413 |
-
|
| 414 |
-
|
|
|
|
|
|
|
| 415 |
])
|
| 416 |
win_ratios.append(win_ratio)
|
| 417 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
if use_win_ratio:
|
| 419 |
result_list.append(np.mean(win_ratios))
|
| 420 |
else:
|
|
@@ -515,18 +523,23 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
|
|
| 515 |
dataset = next(
|
| 516 |
dataset for dataset in DATASETS if dataset.name == dataset_name
|
| 517 |
)
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
|
|
|
|
|
|
|
|
|
| 522 |
if dataset.task in data_dict[model_name]:
|
| 523 |
-
data_dict[model_name][dataset.task].append(
|
| 524 |
else:
|
| 525 |
-
data_dict[model_name][dataset.task] = [
|
| 526 |
results_df = pd.DataFrame(data_dict).T.map(
|
| 527 |
-
lambda
|
| 528 |
-
|
| 529 |
-
|
|
|
|
|
|
|
| 530 |
results_dfs[language] = results_df
|
| 531 |
|
| 532 |
logger.info("Successfully fetched results from ScandEval benchmark.")
|
|
|
|
| 12 |
import random
|
| 13 |
import logging
|
| 14 |
import datetime as dt
|
| 15 |
+
import scipy.stats as stats
|
| 16 |
+
import itertools as it
|
| 17 |
|
| 18 |
|
| 19 |
fmt = "%(asctime)s [%(levelname)s] <%(name)s> %(message)s"
|
|
|
|
| 410 |
for language in languages:
|
| 411 |
if model_id not in results_dfs_filtered[language].index:
|
| 412 |
continue
|
| 413 |
+
score_list = results_dfs_filtered[language].loc[model_id][task]
|
| 414 |
win_ratio = 100 * np.mean([
|
| 415 |
+
stats.ttest_rel(
|
| 416 |
+
a=score_list, b=other_scores, alternative="greater"
|
| 417 |
+
).pvalue < 0.05
|
| 418 |
+
for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id)
|
| 419 |
])
|
| 420 |
win_ratios.append(win_ratio)
|
| 421 |
+
|
| 422 |
+
if all(score < 1 for score in score_list):
|
| 423 |
+
score_list = [100 * score for score in score_list]
|
| 424 |
+
|
| 425 |
+
scores.append(np.mean(score_list))
|
| 426 |
if use_win_ratio:
|
| 427 |
result_list.append(np.mean(win_ratios))
|
| 428 |
else:
|
|
|
|
| 523 |
dataset = next(
|
| 524 |
dataset for dataset in DATASETS if dataset.name == dataset_name
|
| 525 |
)
|
| 526 |
+
scores = [
|
| 527 |
+
test_score_dict.get(
|
| 528 |
+
f"test_{dataset.task.metric}",
|
| 529 |
+
test_score_dict.get(dataset.task.metric)
|
| 530 |
+
)
|
| 531 |
+
for test_score_dict in record["results"]["raw"]["test"]
|
| 532 |
+
]
|
| 533 |
if dataset.task in data_dict[model_name]:
|
| 534 |
+
data_dict[model_name][dataset.task].append(scores)
|
| 535 |
else:
|
| 536 |
+
data_dict[model_name][dataset.task] = [scores]
|
| 537 |
results_df = pd.DataFrame(data_dict).T.map(
|
| 538 |
+
lambda lists_or_nan:
|
| 539 |
+
list(it.chain(lists_or_nan))
|
| 540 |
+
if lists_or_nan == lists_or_nan
|
| 541 |
+
else lists_or_nan
|
| 542 |
+
).dropna().map(lambda lst: lst[0])
|
| 543 |
results_dfs[language] = results_df
|
| 544 |
|
| 545 |
logger.info("Successfully fetched results from ScandEval benchmark.")
|
requirements.txt
CHANGED
|
@@ -52,6 +52,7 @@ requests==2.31.0
|
|
| 52 |
rich==13.7.0
|
| 53 |
rpds-py==0.17.1
|
| 54 |
ruff==0.1.14
|
|
|
|
| 55 |
semantic-version==2.10.0
|
| 56 |
shellingham==1.5.4
|
| 57 |
six==1.16.0
|
|
|
|
| 52 |
rich==13.7.0
|
| 53 |
rpds-py==0.17.1
|
| 54 |
ruff==0.1.14
|
| 55 |
+
scipy==1.12.0
|
| 56 |
semantic-version==2.10.0
|
| 57 |
shellingham==1.5.4
|
| 58 |
six==1.16.0
|