SocialAISchool / stester.py
grg's picture
Cleaned old git history
be5548b
raw
history blame
3 kB
import os
import numpy as np
import re
from pathlib import Path
from collections import defaultdict
from scipy import stats
# Directory whose entries are the experiment output files to analyse.
experiments = Path("./results_1000/")
# Maps each experiment's display label to its list of per-seed success rates.
results_dict = {}
def label_parser(label):
    """Translate a raw experiment identifier into its short display name.

    ``label`` (in this script, the string form of a result-file path) must
    contain exactly one of the known experiment keys as a substring; the
    display name mapped to that key is returned.

    Raises:
        SystemExit: with exit status 1, after printing ``ERROR`` and the
            offending label, when no key or more than one key occurs in
            ``label``.
    """
    label_parser_dict = {
        "VIGIL4_WizardGuide_lang64_no_explo": "ABL_MH-BabyAI",
        "VIGIL4_WizardTwoGuides_lang64_no_explo": "FULL_MH-BabyAI",
        "VIGIL4_WizardGuide_lang64_mm": "ABL_MH-BabyAI-ExpBonus",
        "VIGIL4_WizardTwoGuides_lang64_mm": "FULL_MH-BabyAI-ExpBonus",
        "VIGIL4_WizardGuide_lang64_deaf_no_explo": "ABL_Deaf-MH-BabyAI",
        "VIGIL4_WizardTwoGuides_lang64_deaf_no_explo": "FULL_Deaf-MH-BabyAI",
        "VIGIL4_WizardGuide_lang64_bow": "ABL_MH-BabyAI-BOW",
        "VIGIL4_WizardTwoGuides_lang64_bow": "FULL_MH-BabyAI-BOW",
        "VIGIL4_WizardGuide_lang64_no_mem": "ABL_MH-BabyAI-no-mem",
        "VIGIL4_WizardTwoGuides_lang64_no_mem": "FULL_MH-BabyAI-no-mem",
        "VIGIL5_WizardGuide_lang64_bigru": "ABL_MH-BabyAI-bigru",
        "VIGIL5_WizardTwoGuides_lang64_bigru": "FULL_MH-BabyAI-bigru",
        "VIGIL5_WizardGuide_lang64_attgru": "ABL_MH-BabyAI-attgru",
        "VIGIL5_WizardTwoGuides_lang64_attgru": "FULL_MH-BabyAI-attgru",
        "VIGIL4_WizardGuide_lang64_curr_dial": "ABL_MH-BabyAI-current-dialogue",
        "VIGIL4_WizardTwoGuides_lang64_curr_dial": "FULL_MH-BabyAI-current-dialogue",
        "random_WizardGuide": "ABL_Random-agent",
        "random_WizardTwoGuides": "FULL_Random-agent",
    }

    # One scan gives both the match count (for validation) and the result.
    matches = [name for key, name in label_parser_dict.items() if key in label]

    if len(matches) != 1:
        print("ERROR")
        print(label)
        # Built-in SystemExit needs no site-provided `exit` helper, and the
        # non-zero status lets a calling shell detect the failure.
        raise SystemExit(1)

    return matches[0]
# Matches the first "<digit>.<digits>" token on a line. Compiled once, and
# written as a raw string so that "\." is a regex escape rather than an
# (invalid) string escape.
_SEED_SUCCESS_RATE_RE = re.compile(r"[0-9]\.[0-9]*")

# Collect the per-seed success rates reported in every experiment output file.
for experiment_out_file in experiments.iterdir():
    # Resolve the display label once per file rather than once per matching line.
    experiment_label = label_parser(str(experiment_out_file))
    seed_success_rates = results_dict[experiment_label] = []

    with open(experiment_out_file) as f:
        for line in f:
            if "seed success rate" not in line:
                continue

            rate_match = _SEED_SUCCESS_RATE_RE.search(line)
            if rate_match is None:
                # Fail with the offending file and line instead of an opaque
                # AttributeError on None.
                raise ValueError(
                    "No success rate found in {}: {!r}".format(experiment_out_file, line)
                )
            seed_success_rates.append(float(rate_match.group()))

# Every experiment must have reported exactly 16 seed success rates.
assert {len(v) for v in results_dict.values()} == {16}, (
    "Expected 16 seed success rates per experiment, got: {}".format(
        {label: len(rates) for label, rates in results_dict.items()}
    )
)
# Significance threshold each p-value is compared against.
test_p = 0.05

# Pairs to test: the seed results of each key are compared with those of its value.
compare = {
    "ABL_MH-BabyAI-ExpBonus": "ABL_MH-BabyAI",
    "ABL_MH-BabyAI": "ABL_Deaf-MH-BabyAI",
    "ABL_Deaf-MH-BabyAI": "ABL_Random-agent",
    "FULL_MH-BabyAI-ExpBonus": "FULL_MH-BabyAI",
    "FULL_MH-BabyAI": "FULL_Deaf-MH-BabyAI",
    "FULL_Deaf-MH-BabyAI": "FULL_Random-agent",
}

for k, v in compare.items():
    # equal_var=False selects Welch's t-test, which does not assume that the
    # two samples share a variance.
    p = stats.ttest_ind(
        results_dict[k],
        results_dict[v],
        equal_var=False,
    ).pvalue

    if np.isnan(p):
        # A NaN p-value compares False against any threshold, so it must be
        # reported explicitly instead of falling through to "same".
        result = "Undefined (p-value is NaN)"
    elif p < test_p:
        result = "Distributions different(p={})".format(test_p)
    else:
        result = "Distributions same(p={})".format(test_p)

    print("{} (m:{}) <---> {} (m:{}) = p: {} result: {}".format(
        k, np.mean(results_dict[k]), v, np.mean(results_dict[v]), p, result
    ))
    # Blank line between consecutive comparisons.
    print()