Upload nemotron_wrap_1T_exp/metrics.eval.jsonl with huggingface_hub
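The commit message above is the default message emitted by the huggingface_hub client. A minimal sketch of how such an upload is typically performed with HfApi.upload_file; the target repo_id and repo_type are not visible in this commit, so the values below are placeholders/assumptions:

```python
from huggingface_hub import HfApi

api = HfApi()  # authenticates via a cached login or the HF_TOKEN environment variable

api.upload_file(
    path_or_fileobj="nemotron_wrap_1T_exp/metrics.eval.jsonl",  # local file to push
    path_in_repo="nemotron_wrap_1T_exp/metrics.eval.jsonl",     # destination path inside the repo
    repo_id="<namespace>/<repo>",  # placeholder: the repo id is not shown in this commit
    repo_type="model",             # assumption: eval metrics tracked alongside a model repo
    commit_message="Upload nemotron_wrap_1T_exp/metrics.eval.jsonl with huggingface_hub",
)
```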
nemotron_wrap_1T_exp/metrics.eval.jsonl
ADDED
@@ -0,0 +1,10 @@
{"created_at": "2025-09-04T14:45:45.641974", "global_step": 30000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20475020475020475, "acc_stderr,none": 0.01155271447787667}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4213304122684724, "acc_stderr,none": 0.004927631806477565, "acc_norm,none": 0.54690300736905, "acc_norm_stderr,none": 0.004967778940011933}, "mmlu": {"acc,none": 0.24597635664435266, "acc_stderr,none": 0.0036299089114184245, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25844845908607866, "acc_stderr,none": 0.006383894624004596, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.04073524322147125}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.035243908445117836}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.029771775228145624}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.028458820991460305}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.30578512396694213, "acc_stderr,none": 0.04205953933884125}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094631}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.03351953879521269}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.023532925431044276}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.28268156424581004, "acc_stderr,none": 0.0150603817300181}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21864951768488747, "acc_stderr,none": 0.023475581417861113}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22839506172839505, "acc_stderr,none": 0.023358211840626267}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25554106910039115, "acc_stderr,none": 0.011139857833598516}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.25146198830409355, "acc_stderr,none": 0.033275044238468436}, "mmlu_other": {"acc,none": 0.24106855487608625, "acc_stderr,none": 0.007657728751263028, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899105}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.0339175032232166}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322674}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.23766816143497757, "acc_stderr,none": 0.028568079464714267}, "mmlu_management": {"alias": " - management", "acc,none": 0.1650485436893204, "acc_stderr,none": 0.036756688322331886}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674057}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23627075351213284, "acc_stderr,none": 0.0151904737170375}, 
"mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.024288619466046085}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.02564555362226673}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.02388688192244034}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.0317555478662992}, "mmlu_social_sciences": {"acc,none": 0.24666883327916803, "acc_stderr,none": 0.007768283305304792, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.041424397194893624}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2676767676767677, "acc_stderr,none": 0.03154449888270285}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.029519282616817247}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21794871794871795, "acc_stderr,none": 0.020932445774463185}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23109243697478993, "acc_stderr,none": 0.027381406927868952}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23302752293577983, "acc_stderr,none": 0.018125669180861486}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.03880848301082395}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.29248366013071897, "acc_stderr,none": 0.018403415710109783}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721376}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20816326530612245, "acc_stderr,none": 0.025991117672813296}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_stem": {"acc,none": 0.23152553124008882, "acc_stderr,none": 0.007494300440637687, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325435}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.25, "acc_stderr,none": 0.03523807393012047}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3194444444444444, "acc_stderr,none": 0.03899073687357336}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.14, "acc_stderr,none": 0.03487350880197774}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.03873958714149352}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_conceptual_physics": {"alias": " - 
conceptual_physics", "acc,none": 0.22127659574468084, "acc_stderr,none": 0.027136349602424052}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2896551724137931, "acc_stderr,none": 0.037800192304380135}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21693121693121692, "acc_stderr,none": 0.02122708244944506}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24193548387096775, "acc_stderr,none": 0.024362599693031096}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1477832512315271, "acc_stderr,none": 0.024969621333521257}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22592592592592592, "acc_stderr,none": 0.025497532639609553}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.24503311258278146, "acc_stderr,none": 0.03511807571804725}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2175925925925926, "acc_stderr,none": 0.028139689444859676}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340455}, "sciq": {"alias": "sciq", "acc,none": 0.871, "acc_stderr,none": 0.010605256784796587, "acc_norm,none": 0.822, "acc_norm_stderr,none": 0.012102167676183568}}
{"created_at": "2025-09-05T18:51:03.955667", "global_step": 60000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21048321048321048, "acc_stderr,none": 0.011671038436522901}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.43457478589922327, "acc_stderr,none": 0.0049468798744226805, "acc_norm,none": 0.566620195180243, "acc_norm_stderr,none": 0.004945291270072434}, "mmlu": {"acc,none": 0.252385700042729, "acc_stderr,none": 0.0036635684617208467, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2641870350690754, "acc_stderr,none": 0.006428124263774489, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.03455071019102147}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.029771775228145628}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.028458820991460295}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.33884297520661155, "acc_stderr,none": 0.04320767807536669}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094633}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2822085889570552, "acc_stderr,none": 0.03536117886664743}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2446927374301676, "acc_stderr,none": 0.014378169884098424}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.28938906752411575, "acc_stderr,none": 0.02575586592263294}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.02465968518596729}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27053455019556716, "acc_stderr,none": 0.011345996743539258}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.26900584795321636, "acc_stderr,none": 0.0340105262010409}, "mmlu_other": {"acc,none": 0.24879304795622786, "acc_stderr,none": 0.007735907481916668, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173042}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.25112107623318386, "acc_stderr,none": 0.02910522083322462}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623102}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2950191570881226, "acc_stderr,none": 0.016308363772932728}, "mmlu_nutrition": 
{"alias": " - nutrition", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.02463004897982477}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.026469036818590634}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.15441176470588236, "acc_stderr,none": 0.021950024722922033}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.03175554786629921}, "mmlu_social_sciences": {"acc,none": 0.2333441663958401, "acc_stderr,none": 0.007627771466453176, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518754}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.030532892233932026}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22279792746113988, "acc_stderr,none": 0.03003114797764154}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2076923076923077, "acc_stderr,none": 0.020567539567246797}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.027025433498882374}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21100917431192662, "acc_stderr,none": 0.01749392240411265}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.03880848301082396}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2630718954248366, "acc_stderr,none": 0.017812676542320653}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884601}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22857142857142856, "acc_stderr,none": 0.026882144922307748}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.03076944496729601}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.25689819219790677, "acc_stderr,none": 0.007789177443064565, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.03785714465066652}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03459777606810537}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", 
"acc,none": 0.2425531914893617, "acc_stderr,none": 0.028020226271200214}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.038061426873099935}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02256989707491842}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25483870967741934, "acc_stderr,none": 0.024790118459332208}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.33004926108374383, "acc_stderr,none": 0.033085304262282574}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.026067159222275788}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.03543304234389985}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.028963702570791026}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.04246624336697626}, "sciq": {"alias": "sciq", "acc,none": 0.889, "acc_stderr,none": 0.009938701010583726, "acc_norm,none": 0.831, "acc_norm_stderr,none": 0.011856625977890115}}
{"created_at": "2025-09-05T18:56:13.112015", "global_step": 90000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.18591318591318592, "acc_stderr,none": 0.011138085349810704}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4446325433180641, "acc_stderr,none": 0.0049590941464715274, "acc_norm,none": 0.5875323640709023, "acc_norm_stderr,none": 0.004912723848944802}, "mmlu": {"acc,none": 0.2568010254949437, "acc_stderr,none": 0.003685280461597825, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2631243358129649, "acc_stderr,none": 0.006416361158279365, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.19047619047619047, "acc_stderr,none": 0.035122074123020514}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.03524390844511782}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.33884297520661155, "acc_stderr,none": 0.043207678075366705}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3148148148148148, "acc_stderr,none": 0.04489931073591312}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.30346820809248554, "acc_stderr,none": 0.02475241196091721}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2540192926045016, "acc_stderr,none": 0.024723861504771696}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2932098765432099, "acc_stderr,none": 0.025329888171900926}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25488917861799215, "acc_stderr,none": 0.011130509812662974}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.03467826685703826}, "mmlu_other": {"acc,none": 0.24782748632121018, "acc_stderr,none": 0.0077407864558619475, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.025757559893106727}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.033687629322594295}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21524663677130046, "acc_stderr,none": 0.027584066602208263}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.27350427350427353, "acc_stderr,none": 0.029202540153431183}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24521072796934865, "acc_stderr,none": 0.015384352284543946}, "mmlu_nutrition": 
{"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912255}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2907801418439716, "acc_stderr,none": 0.027090664368353178}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1948529411764706, "acc_stderr,none": 0.024060599423487414}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25903614457831325, "acc_stderr,none": 0.03410646614071857}, "mmlu_social_sciences": {"acc,none": 0.257393565160871, "acc_stderr,none": 0.007884640791865532, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.043727482902780064}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.02985751567338641}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2538860103626943, "acc_stderr,none": 0.03141024780565319}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23846153846153847, "acc_stderr,none": 0.021606294494647727}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2689075630252101, "acc_stderr,none": 0.028801392193631276}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26422018348623855, "acc_stderr,none": 0.018904164171510193}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768361}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2826797385620915, "acc_stderr,none": 0.01821726955205344}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.041723430387053825}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2163265306122449, "acc_stderr,none": 0.026358916334904045}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23383084577114427, "acc_stderr,none": 0.02992941540834839}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_stem": {"acc,none": 0.25562955915001584, "acc_stderr,none": 0.007765520408552716, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.31851851851851853, "acc_stderr,none": 0.04024778401977111}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137283}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2152777777777778, "acc_stderr,none": 0.03437079344106136}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_conceptual_physics": {"alias": " - 
conceptual_physics", "acc,none": 0.24680851063829787, "acc_stderr,none": 0.0281854413012341}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24338624338624337, "acc_stderr,none": 0.022101128787415436}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2129032258064516, "acc_stderr,none": 0.023287665127268542}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2512315270935961, "acc_stderr,none": 0.030516530732694436}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712177}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2913907284768212, "acc_stderr,none": 0.037101857261199946}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863448}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.04327040932578729}, "sciq": {"alias": "sciq", "acc,none": 0.895, "acc_stderr,none": 0.009698921026024956, "acc_norm,none": 0.84, "acc_norm_stderr,none": 0.011598902298689007}}
{"created_at": "2025-09-05T18:59:18.895389", "global_step": 120000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19328419328419327, "acc_stderr,none": 0.011305207486827692}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4531965743875722, "acc_stderr,none": 0.004967872475383269, "acc_norm,none": 0.6003784106751643, "acc_norm_stderr,none": 0.004888194985997402}, "mmlu": {"acc,none": 0.23144851160803304, "acc_stderr,none": 0.003553824667227633, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24187035069075452, "acc_stderr,none": 0.00624181049436969, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.040406101782088394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.023357365785874037}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2191358024691358, "acc_stderr,none": 0.023016705640262196}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.01099615663514269}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30994152046783624, "acc_stderr,none": 0.035469769593931624}, "mmlu_other": {"acc,none": 0.24332153202446088, "acc_stderr,none": 0.007677180638335299, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.32286995515695066, "acc_stderr,none": 0.03138147637575498}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24393358876117496, "acc_stderr,none": 0.015357212665829468}, "mmlu_nutrition": {"alias": " - 
nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.0239291555173513}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.025257861359432414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.30120481927710846, "acc_stderr,none": 0.03571609230053481}, "mmlu_social_sciences": {"acc,none": 0.22001949951251218, "acc_stderr,none": 0.007464692719068293, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436716}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.027479603010538787}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21761658031088082, "acc_stderr,none": 0.02977866303775296}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20256410256410257, "acc_stderr,none": 0.020377660970371386}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1944954128440367, "acc_stderr,none": 0.01697028909045805}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2748091603053435, "acc_stderr,none": 0.03915345408847836}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24836601307189543, "acc_stderr,none": 0.017479487001364764}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546212}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.21535045987947987, "acc_stderr,none": 0.007309837118972004, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.0315469804508223}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403325}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - 
conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20899470899470898, "acc_stderr,none": 0.02094048156533485}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1870967741935484, "acc_stderr,none": 0.022185710092252252}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15270935960591134, "acc_stderr,none": 0.025308904539380627}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.02488211685765511}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16203703703703703, "acc_stderr,none": 0.025130453652268455}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.0432704093257873}, "sciq": {"alias": "sciq", "acc,none": 0.885, "acc_stderr,none": 0.010093407594904614, "acc_norm,none": 0.829, "acc_norm_stderr,none": 0.01191221645626459}}
{"created_at": "2025-09-06T15:17:32.981870", "global_step": 150000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21621621621621623, "acc_stderr,none": 0.01178588917548666}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.46793467436765585, "acc_stderr,none": 0.004979510001776624, "acc_norm,none": 0.6204939255128461, "acc_norm_stderr,none": 0.004842723234022034}, "mmlu": {"acc,none": 0.25430850306224184, "acc_stderr,none": 0.003669551751342653, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24824654622741765, "acc_stderr,none": 0.006296240799070965, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047182}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.031922715695482974}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.030190282453501964}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.19831223628691982, "acc_stderr,none": 0.02595502084162111}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.040655781409087044}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.03322015795776741}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.02425790170532337}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2346368715083799, "acc_stderr,none": 0.014173044098303675}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2733118971061093, "acc_stderr,none": 0.025311765975426115}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2993827160493827, "acc_stderr,none": 0.025483115601195462}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.01099615663514269}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23391812865497075, "acc_stderr,none": 0.032467217651178264}, "mmlu_other": {"acc,none": 0.25555197940135177, "acc_stderr,none": 0.007796455075872634, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2490566037735849, "acc_stderr,none": 0.02661648298050171}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.030952890217749888}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.29596412556053814, "acc_stderr,none": 0.03063659134869979}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.040580420156460344}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.27350427350427353, "acc_stderr,none": 0.029202540153431177}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26436781609195403, "acc_stderr,none": 0.015769984840690518}, 
"mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2908496732026144, "acc_stderr,none": 0.026004800363952113}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.20921985815602837, "acc_stderr,none": 0.024264769439988475}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16176470588235295, "acc_stderr,none": 0.022368672562886754}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3674698795180723, "acc_stderr,none": 0.03753267402120575}, "mmlu_social_sciences": {"acc,none": 0.2612934676633084, "acc_stderr,none": 0.007908273888034513, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.0414243971948936}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02962022787479047}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.030276909945178253}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2512820512820513, "acc_stderr,none": 0.021992016662370547}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.02665353159671548}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24220183486238533, "acc_stderr,none": 0.01836817630659862}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3053435114503817, "acc_stderr,none": 0.04039314978724562}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.017776947157528044}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878284}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3469387755102041, "acc_stderr,none": 0.030472526026726496}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.31840796019900497, "acc_stderr,none": 0.03294118479054095}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.25531240088804313, "acc_stderr,none": 0.007760037058784921, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.035478541985608236}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.0355418036802569}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080343}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.0379328118530781}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_conceptual_physics": 
{"alias": " - conceptual_physics", "acc,none": 0.30638297872340425, "acc_stderr,none": 0.030135906478517563}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135303}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.022019080012217897}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2806451612903226, "acc_stderr,none": 0.02556060472102289}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.031270907132976984}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2851851851851852, "acc_stderr,none": 0.027528599210340492}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31125827814569534, "acc_stderr,none": 0.037804458505267334}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.02896370257079103}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "sciq": {"alias": "sciq", "acc,none": 0.897, "acc_stderr,none": 0.0096168333396958, "acc_norm,none": 0.847, "acc_norm_stderr,none": 0.011389500459665544}}
{"created_at": "2025-09-06T15:29:03.613463", "global_step": 180000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19328419328419327, "acc_stderr,none": 0.011305207486827711}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.472814180442143, "acc_stderr,none": 0.00498240036893967, "acc_norm,none": 0.6289583748257319, "acc_norm_stderr,none": 0.004820962855749732}, "mmlu": {"acc,none": 0.2662013958125623, "acc_stderr,none": 0.0037248849398115627, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24930924548352817, "acc_stderr,none": 0.006302939411809862, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.38095238095238093, "acc_stderr,none": 0.04343525428949098}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.031321798030832904}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658332}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.031921934489347235}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2745664739884393, "acc_stderr,none": 0.024027745155265002}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24581005586592178, "acc_stderr,none": 0.014400296429225601}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2572347266881029, "acc_stderr,none": 0.024826171289250888}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023132376234543325}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2392438070404172, "acc_stderr,none": 0.010896123652676644}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03377310252209194}, "mmlu_other": {"acc,none": 0.2745413582233666, "acc_stderr,none": 0.008008959969584012, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2679245283018868, "acc_stderr,none": 0.027257260322494845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.29596412556053814, "acc_stderr,none": 0.03063659134869981}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.04354631077260598}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.029480360549541194}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25287356321839083, "acc_stderr,none": 0.015543377313719681}, "mmlu_nutrition": 
{"alias": " - nutrition", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.026568921015457162}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.026011992930902006}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2536764705882353, "acc_stderr,none": 0.026431329870789524}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3253012048192771, "acc_stderr,none": 0.03647168523683227}, "mmlu_social_sciences": {"acc,none": 0.2892427689307767, "acc_stderr,none": 0.00815835128948865, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436716}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.30808080808080807, "acc_stderr,none": 0.03289477330098617}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2694300518134715, "acc_stderr,none": 0.03201867122877794}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2846153846153846, "acc_stderr,none": 0.022878322799706287}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.028657491285071966}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.29724770642201837, "acc_stderr,none": 0.019595707224643544}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.366412213740458, "acc_stderr,none": 0.042258754519696386}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.017630827375148383}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.04172343038705383}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.39183673469387753, "acc_stderr,none": 0.03125127591089165}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.03115715086935555}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.26070409134157946, "acc_stderr,none": 0.007814007032811428, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.17777777777777778, "acc_stderr,none": 0.03302789859901719}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.0378272898086547}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2847222222222222, "acc_stderr,none": 0.03773809990686935}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542129}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.04280105837364396}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - 
conceptual_physics", "acc,none": 0.33617021276595743, "acc_stderr,none": 0.030881618520676942}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.0385528961637895}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948368}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22258064516129034, "acc_stderr,none": 0.023664216671642518}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.24630541871921183, "acc_stderr,none": 0.030315099285617715}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.026466117538959916}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969653}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863448}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.042466243366976256}, "sciq": {"alias": "sciq", "acc,none": 0.923, "acc_stderr,none": 0.008434580140240662, "acc_norm,none": 0.872, "acc_norm_stderr,none": 0.01057013376110866}}
{"created_at": "2025-09-06T15:32:50.218575", "global_step": 210000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19656019656019655, "acc_stderr,none": 0.011377439773964}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.483469428400717, "acc_stderr,none": 0.004987053652540277, "acc_norm,none": 0.640211113324039, "acc_norm_stderr,none": 0.004789575163418656}, "mmlu": {"acc,none": 0.2853582110810426, "acc_stderr,none": 0.0038024660068090898, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2671625929861849, "acc_stderr,none": 0.006439573513833181, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30952380952380953, "acc_stderr,none": 0.04134913018303316}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.03524390844511783}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.03198001660115073}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.03520893951097653}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.033519538795212696}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3179190751445087, "acc_stderr,none": 0.02507071371915319}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.22681564245810057, "acc_stderr,none": 0.014005843570897897}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.29260450160771706, "acc_stderr,none": 0.02583989833487798}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.33024691358024694, "acc_stderr,none": 0.026168298456732846}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2588005215123859, "acc_stderr,none": 0.01118610904656461}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30994152046783624, "acc_stderr,none": 0.03546976959393163}, "mmlu_other": {"acc,none": 0.28355326681686516, "acc_stderr,none": 0.008081963166503362, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2981132075471698, "acc_stderr,none": 0.028152837942493875}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3063583815028902, "acc_stderr,none": 0.035149425512674394}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.22869955156950672, "acc_stderr,none": 0.02818824004692919}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.029058588303748842}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2796934865900383, "acc_stderr,none": 0.016050792148036543}, "mmlu_nutrition": {"alias": " - 
nutrition", "acc,none": 0.3300653594771242, "acc_stderr,none": 0.026925654653615686}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.02668456434046099}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.31985294117647056, "acc_stderr,none": 0.028332959514031218}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2469879518072289, "acc_stderr,none": 0.03357351982064536}, "mmlu_social_sciences": {"acc,none": 0.29769255768605785, "acc_stderr,none": 0.008215931769376971, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.039994238792813386}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.0331847733384533}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3316062176165803, "acc_stderr,none": 0.03397636541089117}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2743589743589744, "acc_stderr,none": 0.02262276576749321}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3319327731092437, "acc_stderr,none": 0.030588697013783663}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.28073394495412846, "acc_stderr,none": 0.019266055045871616}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.37404580152671757, "acc_stderr,none": 0.04243869242230523}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.017401816711427657}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.4, "acc_stderr,none": 0.03136250240935893}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.32338308457711445, "acc_stderr,none": 0.033076159479790326}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_stem": {"acc,none": 0.30225182366000636, "acc_stderr,none": 0.008168966064469119, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.03885004245800255}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3355263157894737, "acc_stderr,none": 0.03842498559395268}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.037161774375660164}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.043898699568087785}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 
0.251063829787234, "acc_stderr,none": 0.028346963777162466}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.35172413793103446, "acc_stderr,none": 0.0397923663749741}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02306818884826111}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2903225806451613, "acc_stderr,none": 0.02582210611941589}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03255086769970103}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.027080372815145658}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.33774834437086093, "acc_stderr,none": 0.0386155754625517}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.033247089118091176}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340455}, "sciq": {"alias": "sciq", "acc,none": 0.925, "acc_stderr,none": 0.008333333333333361, "acc_norm,none": 0.894, "acc_norm_stderr,none": 0.009739551265785141}}
{"created_at": "2025-09-06T17:13:57.103254", "global_step": 240000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20393120393120392, "acc_stderr,none": 0.011535521334313653}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.5036845249950209, "acc_stderr,none": 0.004989645929811453, "acc_norm,none": 0.6709818761202948, "acc_norm_stderr,none": 0.004688963175758167}, "mmlu": {"acc,none": 0.2798034468024498, "acc_stderr,none": 0.003780617345848806, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2763018065887354, "acc_stderr,none": 0.006510711260353776, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604674}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.296969696969697, "acc_stderr,none": 0.035679697722680495}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3037974683544304, "acc_stderr,none": 0.029936696387138594}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302871}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3006134969325153, "acc_stderr,none": 0.03602511318806771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2976878612716763, "acc_stderr,none": 0.024617055388676996}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3279742765273312, "acc_stderr,none": 0.026664410886937606}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3425925925925926, "acc_stderr,none": 0.026406145973625672}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25554106910039115, "acc_stderr,none": 0.01113985783359852}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.32748538011695905, "acc_stderr,none": 0.035993357714560276}, "mmlu_other": {"acc,none": 0.276472481493402, "acc_stderr,none": 0.007999524754880756, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.28679245283018867, "acc_stderr,none": 0.027834912527544074}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.30057803468208094, "acc_stderr,none": 0.0349610148119118}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2556053811659193, "acc_stderr,none": 0.029275891003969927}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3418803418803419, "acc_stderr,none": 0.03107502852650775}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.27330779054916987, "acc_stderr,none": 0.01593668106262856}, "mmlu_nutrition": 
{"alias": " - nutrition", "acc,none": 0.32679738562091504, "acc_stderr,none": 0.02685729466328142}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.026244920349843014}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.03484331592680589}, "mmlu_social_sciences": {"acc,none": 0.2772180695482613, "acc_stderr,none": 0.008032374615917454, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.17543859649122806, "acc_stderr,none": 0.0357795481394837}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2878787878787879, "acc_stderr,none": 0.03225883512300993}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23834196891191708, "acc_stderr,none": 0.030748905363909892}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2641025641025641, "acc_stderr,none": 0.02235219373745328}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.25630252100840334, "acc_stderr,none": 0.028359620870533953}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24036697247706423, "acc_stderr,none": 0.01832060732096407}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.35877862595419846, "acc_stderr,none": 0.04206739313864908}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.01798661530403032}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.04350271442923243}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.40816326530612246, "acc_stderr,none": 0.03146465712827423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2736318407960199, "acc_stderr,none": 0.03152439186555401}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_stem": {"acc,none": 0.29083412622898824, "acc_stderr,none": 0.00809428857562141, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3037037037037037, "acc_stderr,none": 0.03972552884785137}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.037385206761196686}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03800968060554858}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", 
"acc,none": 0.24680851063829787, "acc_stderr,none": 0.028185441301234092}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.36551724137931035, "acc_stderr,none": 0.04013124195424385}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.023456037383982026}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27741935483870966, "acc_stderr,none": 0.025470196835900055}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.032104944337514575}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.02708037281514566}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.03802039760107903}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03054674526495318}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.042878587513404544}, "sciq": {"alias": "sciq", "acc,none": 0.941, "acc_stderr,none": 0.0074548356504067275, "acc_norm,none": 0.905, "acc_norm_stderr,none": 0.009276910103103343}}
{"created_at": "2025-09-07T08:38:04.193866", "global_step": 270000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.23177723177723178, "acc_stderr,none": 0.012080893552302298}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.5126468830910177, "acc_stderr,none": 0.0049881849883452855, "acc_norm,none": 0.6827325234017128, "acc_norm_stderr,none": 0.0046446136011041575}, "mmlu": {"acc,none": 0.30337558752314486, "acc_stderr,none": 0.0038668927052848347, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.29521785334750267, "acc_stderr,none": 0.006629216610177871, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.373015873015873, "acc_stderr,none": 0.04325506042017086}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.03663974994391242}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.03166009679399813}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29957805907172996, "acc_stderr,none": 0.029818024749753095}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4132231404958678, "acc_stderr,none": 0.04495087843548408}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2822085889570552, "acc_stderr,none": 0.03536117886664743}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3208092485549133, "acc_stderr,none": 0.025131000233647907}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808847}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3408360128617363, "acc_stderr,none": 0.02692084126077616}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.026725868809100786}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.273142112125163, "acc_stderr,none": 0.011380150567830413}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.36257309941520466, "acc_stderr,none": 0.036871306155620606}, "mmlu_other": {"acc,none": 0.3064048921789508, "acc_stderr,none": 0.008245960726865217, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.33962264150943394, "acc_stderr,none": 0.02914690474779833}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.36416184971098264, "acc_stderr,none": 0.03669072477416907}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.38, "acc_stderr,none": 0.04878317312145632}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.19730941704035873, "acc_stderr,none": 0.02670985334496796}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.04354631077260595}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3504273504273504, "acc_stderr,none": 0.0312561082442188}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3116219667943806, "acc_stderr,none": 0.016562433867284176}, "mmlu_nutrition": {"alias": " - 
nutrition", "acc,none": 0.3366013071895425, "acc_stderr,none": 0.027057974624494382}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3049645390070922, "acc_stderr,none": 0.027464708442022135}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2536764705882353, "acc_stderr,none": 0.026431329870789524}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.03664314777288087}, "mmlu_social_sciences": {"acc,none": 0.30289242768930774, "acc_stderr,none": 0.008236105580385867, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.03505859682597264}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.03318477333845331}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27979274611398963, "acc_stderr,none": 0.03239637046735703}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.022139081103971527}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2773109243697479, "acc_stderr,none": 0.029079374539480007}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.28440366972477066, "acc_stderr,none": 0.019342036587702588}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.35877862595419846, "acc_stderr,none": 0.04206739313864908}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3006535947712418, "acc_stderr,none": 0.01855063450295296}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2818181818181818, "acc_stderr,none": 0.043091187099464585}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.42857142857142855, "acc_stderr,none": 0.03168091161233882}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.31343283582089554, "acc_stderr,none": 0.03280188205348643}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.45, "acc_stderr,none": 0.04999999999999999}, "mmlu_stem": {"acc,none": 0.31303520456707895, "acc_stderr,none": 0.0082518439220375, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34814814814814815, "acc_stderr,none": 0.041153246103369526}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.375, "acc_stderr,none": 0.039397364351956274}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3125, "acc_stderr,none": 0.038760854559127644}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.39, "acc_stderr,none": 0.04902071300001974}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.042207736591714506}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.41, "acc_stderr,none": 0.049431107042371025}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 
0.2765957446808511, "acc_stderr,none": 0.029241883869628796}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.35172413793103446, "acc_stderr,none": 0.03979236637497411}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.30687830687830686, "acc_stderr,none": 0.023752928712112133}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.31290322580645163, "acc_stderr,none": 0.026377567028645858}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3399014778325123, "acc_stderr,none": 0.033327690684107895}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.026842057873833713}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.33112582781456956, "acc_stderr,none": 0.038425817186598696}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25462962962962965, "acc_stderr,none": 0.029711275860005344}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.043642261558410445}, "sciq": {"alias": "sciq", "acc,none": 0.939, "acc_stderr,none": 0.007572076091557424, "acc_norm,none": 0.905, "acc_norm_stderr,none": 0.009276910103103338}}
{"created_at": "2025-09-07T08:46:50.558206", "global_step": 300000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22932022932022933, "acc_stderr,none": 0.012035891058050903}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.5155347540330611, "acc_stderr,none": 0.004987372476207027, "acc_norm,none": 0.688707428799044, "acc_norm_stderr,none": 0.0046207585796286576}, "mmlu": {"acc,none": 0.31348810710724967, "acc_stderr,none": 0.003898119121741806, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3028692879914984, "acc_stderr,none": 0.006673787545841354, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.04073524322147126}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3393939393939394, "acc_stderr,none": 0.03697442205031596}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29901960784313725, "acc_stderr,none": 0.03213325717373616}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3080168776371308, "acc_stderr,none": 0.030052389335605702}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4214876033057851, "acc_stderr,none": 0.045077322787750944}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.04453197507374984}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3374233128834356, "acc_stderr,none": 0.03714908409935574}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.33236994219653176, "acc_stderr,none": 0.025361168749688218}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25251396648044694, "acc_stderr,none": 0.014530330201468643}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.35691318327974275, "acc_stderr,none": 0.027210420375934023}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.02712511551316686}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27249022164276404, "acc_stderr,none": 0.011371658294311538}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3684210526315789, "acc_stderr,none": 0.036996580176568775}, "mmlu_other": {"acc,none": 0.31606050852912776, "acc_stderr,none": 0.00830421054490409, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3471698113207547, "acc_stderr,none": 0.029300101705549655}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3063583815028902, "acc_stderr,none": 0.03514942551267439}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.04793724854411019}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.25112107623318386, "acc_stderr,none": 0.029105220833224633}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4230769230769231, "acc_stderr,none": 0.032366121762202014}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.32950191570881227, "acc_stderr,none": 0.016808322261740456}, 
"mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35947712418300654, "acc_stderr,none": 0.027475969910660952}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.28368794326241137, "acc_stderr,none": 0.02689170942834396}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22794117647058823, "acc_stderr,none": 0.025483081468029804}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3614457831325301, "acc_stderr,none": 0.0374005938202932}, "mmlu_social_sciences": {"acc,none": 0.3217419564510887, "acc_stderr,none": 0.0083683457714155, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.039994238792813344}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.35858585858585856, "acc_stderr,none": 0.034169036403915214}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.32124352331606215, "acc_stderr,none": 0.033699508685490674}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.24871794871794872, "acc_stderr,none": 0.0219169577092138}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3067226890756303, "acc_stderr,none": 0.02995382389188705}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.30642201834862387, "acc_stderr,none": 0.01976551722045852}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3893129770992366, "acc_stderr,none": 0.04276486542814591}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.018342529845275915}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.35454545454545455, "acc_stderr,none": 0.045820048415054174}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.4448979591836735, "acc_stderr,none": 0.031814251181977865}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.34328358208955223, "acc_stderr,none": 0.03357379665433431}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.48, "acc_stderr,none": 0.05021167315686779}, "mmlu_stem": {"acc,none": 0.31874405328258804, "acc_stderr,none": 0.008279704075152263, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4222222222222222, "acc_stderr,none": 0.04266763404099582}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3684210526315789, "acc_stderr,none": 0.03925523381052932}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.03852084696008534}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542129}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.4, "acc_stderr,none": 0.04923659639173309}, "mmlu_conceptual_physics": 
{"alias": " - conceptual_physics", "acc,none": 0.2936170212765957, "acc_stderr,none": 0.02977164271249123}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3724137931034483, "acc_stderr,none": 0.040287315329475604}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.29894179894179895, "acc_stderr,none": 0.0235776047916558}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3419354838709677, "acc_stderr,none": 0.026985289576552742}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.33004926108374383, "acc_stderr,none": 0.03308530426228258}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.027309140588230186}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.37748344370860926, "acc_stderr,none": 0.039580272311215706}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.030998666304560538}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340456}, "sciq": {"alias": "sciq", "acc,none": 0.947, "acc_stderr,none": 0.007088105617246444, "acc_norm,none": 0.911, "acc_norm_stderr,none": 0.009008893392651544}}
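For quick inspection, a minimal sketch (stdlib only, assuming each line of the uploaded metrics.eval.jsonl is one JSON object with the keys shown above, i.e. "global_step" plus per-task dicts such as "mmlu", "hellaswag", and "sciq") that prints a few headline accuracies per checkpoint:

```python
import json

# Sketch: read the eval log and print headline metrics per checkpoint.
# Assumes the path and key names seen in the records above; adjust as needed.
with open("nemotron_wrap_1T_exp/metrics.eval.jsonl") as f:
    records = [json.loads(line) for line in f if line.strip()]

records.sort(key=lambda r: r["global_step"])

print(f"{'step':>8}  {'mmlu':>6}  {'hellaswag_norm':>14}  {'sciq':>6}")
for r in records:
    print(
        f"{r['global_step']:>8}  "
        f"{r['mmlu']['acc,none']:>6.3f}  "
        f"{r['hellaswag']['acc_norm,none']:>14.3f}  "
        f"{r['sciq']['acc,none']:>6.3f}"
    )
```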