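"""Aggregate per-seed success rates from the experiment logs in ./results_1000/
and compare agent variants pairwise with Welch's t-tests."""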
import re
import sys
from pathlib import Path

import numpy as np
from scipy import stats
experiments = Path("./results_1000/")
results_dict = {}
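# Map raw experiment directory names to human-readable agent labels
# ("ABL_*" = WizardGuide environment, "FULL_*" = WizardTwoGuides environment).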
def label_parser(label):
    label_parser_dict = {
        "VIGIL4_WizardGuide_lang64_no_explo": "ABL_MH-BabyAI",
        "VIGIL4_WizardTwoGuides_lang64_no_explo": "FULL_MH-BabyAI",
        "VIGIL4_WizardGuide_lang64_mm": "ABL_MH-BabyAI-ExpBonus",
        "VIGIL4_WizardTwoGuides_lang64_mm": "FULL_MH-BabyAI-ExpBonus",
        "VIGIL4_WizardGuide_lang64_deaf_no_explo": "ABL_Deaf-MH-BabyAI",
        "VIGIL4_WizardTwoGuides_lang64_deaf_no_explo": "FULL_Deaf-MH-BabyAI",
        "VIGIL4_WizardGuide_lang64_bow": "ABL_MH-BabyAI-BOW",
        "VIGIL4_WizardTwoGuides_lang64_bow": "FULL_MH-BabyAI-BOW",
        "VIGIL4_WizardGuide_lang64_no_mem": "ABL_MH-BabyAI-no-mem",
        "VIGIL4_WizardTwoGuides_lang64_no_mem": "FULL_MH-BabyAI-no-mem",
        "VIGIL5_WizardGuide_lang64_bigru": "ABL_MH-BabyAI-bigru",
        "VIGIL5_WizardTwoGuides_lang64_bigru": "FULL_MH-BabyAI-bigru",
        "VIGIL5_WizardGuide_lang64_attgru": "ABL_MH-BabyAI-attgru",
        "VIGIL5_WizardTwoGuides_lang64_attgru": "FULL_MH-BabyAI-attgru",
        "VIGIL4_WizardGuide_lang64_curr_dial": "ABL_MH-BabyAI-current-dialogue",
        "VIGIL4_WizardTwoGuides_lang64_curr_dial": "FULL_MH-BabyAI-current-dialogue",
        "random_WizardGuide": "ABL_Random-agent",
        "random_WizardTwoGuides": "FULL_Random-agent",
    }
    # Exactly one known experiment name must match, otherwise the label is ambiguous.
    if sum(1 for k in label_parser_dict if k in label) != 1:
        print("ERROR: could not parse experiment label unambiguously")
        print(label)
        sys.exit(1)
    for k, v in label_parser_dict.items():
        if k in label:
            return v
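# Collect per-seed success rates: each experiment output file contains one
# "seed success rate" line per evaluation seed.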
for experiment_out_file in experiments.iterdir():
    label = label_parser(str(experiment_out_file))
    results_dict[label] = []
    with open(experiment_out_file) as f:
        for line in f:
            if "seed success rate" in line:
                # The rate is printed as a decimal number, e.g. "0.85".
                seed_success_rate = float(re.search(r"[0-9]\.[0-9]*", line).group())
                results_dict[label].append(seed_success_rate)
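# Sanity check: every agent variant should have exactly 16 seed runs.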
assert set(len(v) for v in results_dict.values()) == {16}
test_p = 0.05  # significance threshold for all pairwise tests
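# Pairwise comparisons along the ablation chain:
# ExpBonus vs. plain agent, plain vs. deaf agent, deaf vs. random agent,
# in both the ablated (ABL) and full (FULL) environments.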
compare = {
    "ABL_MH-BabyAI-ExpBonus": "ABL_MH-BabyAI",
    "ABL_MH-BabyAI": "ABL_Deaf-MH-BabyAI",
    "ABL_Deaf-MH-BabyAI": "ABL_Random-agent",
    "FULL_MH-BabyAI-ExpBonus": "FULL_MH-BabyAI",
    "FULL_MH-BabyAI": "FULL_Deaf-MH-BabyAI",
    "FULL_Deaf-MH-BabyAI": "FULL_Random-agent",
}
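# Welch's t-test (equal_var=False) on the per-seed success rates of each pair;
# p < test_p is read as evidence that the two distributions differ.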
for k, v in compare.items():
    p = stats.ttest_ind(
        results_dict[k],
        results_dict[v],
        equal_var=False,  # Welch's t-test: do not assume equal variances
    ).pvalue
    if np.isnan(p):
        # Degenerate case (e.g. zero variance in both samples): drop into a shell to inspect.
        from IPython import embed; embed()
    print("{} (m:{}) <---> {} (m:{}) = p: {} result: {}".format(
        k, np.mean(results_dict[k]), v, np.mean(results_dict[v]), p,
        "Distributions different (p < {})".format(test_p) if p < test_p
        else "Distributions same (p >= {})".format(test_p)
    ))
    print()